|
1 | 1 | import random |
| 2 | +import re |
2 | 3 |
|
3 | 4 | import pytest |
4 | 5 |
|
@@ -320,45 +321,7 @@ def make_entry(title=None, summary=None, content=None): |
320 | 321 | make_entry(title='title', content=('value', 'text/html')), |
321 | 322 | True, |
322 | 323 | ), |
323 | | - ( |
324 | | - make_entry(title='title', summary='one ' * 40), |
325 | | - make_entry(title='title', summary='one ' * 39 + 'two '), |
326 | | - True, |
327 | | - ), |
328 | | - ( |
329 | | - make_entry(title='title', summary='one ' * 40), |
330 | | - make_entry(title='title', summary='one ' * 38), |
331 | | - True, |
332 | | - ), |
333 | | - ( |
334 | | - make_entry(title='title', summary='one ' * 40), |
335 | | - make_entry(title='title', summary='one ' * 20 + 'two ' * 3 + 17 * 'one '), |
336 | | - False, |
337 | | - ), |
338 | | - ( |
339 | | - make_entry(title='title', summary='one ' * 50), |
340 | | - make_entry( |
341 | | - title='title', summary='one ' * 30 + 'two ' + 17 * 'one ' + 'three ' |
342 | | - ), |
343 | | - True, |
344 | | - ), |
345 | | - ( |
346 | | - make_entry(title='title', summary='one ' * 50), |
347 | | - make_entry(title='title', summary='one ' * 30 + 'two ' * 5 + 25 * 'one '), |
348 | | - False, |
349 | | - ), |
350 | | - ( |
351 | | - make_entry(title='title', summary='one ' * 70), |
352 | | - make_entry( |
353 | | - title='title', summary='one ' * 30 + 'two ' * 5 + 33 * 'one ' + 'three ' |
354 | | - ), |
355 | | - True, |
356 | | - ), |
357 | | - ( |
358 | | - make_entry(title='title', summary='one ' * 70), |
359 | | - make_entry(title='title', summary='one ' * 30 + 'two ' * 10 + 30 * 'one '), |
360 | | - False, |
361 | | - ), |
| 324 | + # TODO: test fuzzy matching (just one test) |
362 | 325 | # TODO: test normalization |
363 | 326 | ] |
364 | 327 |
|
@@ -579,16 +542,107 @@ def test_tokenize(tokenize, input, expected): |
579 | 542 | assert tokenize(input) == expected |
580 | 543 |
|
581 | 544 |
|
def with_edits(text, edits, end_at=None):
    """Return a ``(text, edited)`` pair to use as duplicate-detection test data.

    Args:
        text: The source string.
        edits: Iterable of argument tuples for ``str.replace()``, i.e.
            ``(old, new)`` or ``(old, new, count)``. They are applied in
            order, each one to the result of the previous one.
        end_at: Optional regular expression. It is interpolated into the
            search pattern WITHOUT escaping, so ``"$"`` means "end of text"
            rather than a literal dollar sign. When truthy, ``text`` is
            first cut down to the prefix that ends at the first match.

    Returns:
        A tuple of the (possibly truncated) text and its edited copy.

    Raises:
        ValueError: If ``end_at`` is given but matches nowhere in ``text``.
    """
    if end_at:
        # (?s) lets "." cross newlines; the lazy ".*?" stops at the first match.
        match = re.search(rf"(?s).*?{end_at}", text)
        if match is None:
            # Fail with a readable message instead of "NoneType is not
            # subscriptable" when a marker is mistyped in the test data.
            raise ValueError(f"end_at pattern {end_at!r} not found in text")
        text = match[0]
    edited = text
    for edit in edits:
        edited = edited.replace(*edit)
    return text, edited
| 552 | + |
| 553 | + |
| 554 | +TEXT = """\ |
| 555 | +So, you're doing some I/O bound stuff, in parallel. |
| 556 | +
|
| 557 | +Maybe you're scraping some websites – a lot of websites. |
| 558 | +
|
| 559 | +Maybe you're updating or deleting millions of DynamoDB items. |
| 560 | +
|
| 561 | +You've got your [ThreadPoolExecutor], |
| 562 | +you've increased the number of threads and tuned connection limits... |
| 563 | +but after some point, **it's just not getting any faster**. |
| 564 | +You look at your Python process, |
| 565 | +and you see CPU utilization hovers above 100%. |
| 566 | +
|
| 567 | +You *could* split the work into batches |
| 568 | +and have a [ProcessPoolExecutor] |
| 569 | +run your original code in separate processes. |
| 570 | +But that requires yet more code, and a bunch of changes, which is no fun. |
| 571 | +And maybe your input is not that easy to split into batches. |
| 572 | +
|
| 573 | +If only we had an executor that |
| 574 | +**worked seamlessly across processes and threads**. |
| 575 | +
|
| 576 | +Well, you're in luck, since that's exactly what we're building today! |
| 577 | +
|
| 578 | +And even better, in a couple years you won't even need it anymore. |
| 579 | +
|
| 580 | +--- |
| 581 | +
|
| 582 | +**asyncio-thread-runner** allows you to run async code from sync code. |
| 583 | +
|
| 584 | +This is useful when you're doing some sync stuff, but: |
| 585 | +
|
| 586 | +* you also need to do some async stuff, **without** making **everything async** |
| 587 | +* maybe the sync stuff is an existing application |
| 588 | +* maybe you still want to use your favorite sync library |
| 589 | +* or maybe you need just a little async, without having to pay the full price |
| 590 | +
|
| 591 | +Features: |
| 592 | +
|
| 593 | +* unlike [asyncio.run()], it provides a **long-lived event loop** |
| 594 | +* unlike [asyncio.Runner], it runs in a dedicated thread, and you can use it from **multiple threads** |
| 595 | +* it allows you to use **async context managers** and **iterables** from sync code |
| 596 | +* check out [this article](https://death.andgravity.com/asyncio-bridge) for why these are useful |
| 597 | +
|
| 598 | +""" |
| 599 | +EDITS = [ |
| 600 | + ("you're", "youre"), |
| 601 | + ("I/O bound", "IO-bound"), |
| 602 | + ("parallel", "paralel"), |
| 603 | + ("Maybe", "And", 1), |
| 604 | + ("a lot", "lots"), |
| 605 | + ("PoolExecutor", " Pool Executor"), |
| 606 | + ("work", "input"), |
| 607 | + ("you won't even need it anymore", ""), |
| 608 | +] |
| 609 | +EXTRA_EDITS = EDITS + [ |
| 610 | + ("So", "Soo"), |
| 611 | + ("stuff", "thing"), |
| 612 | + ("millions", "billions"), |
| 613 | + ("across processes and threads", ""), |
| 614 | +] |
| 615 | + |
| 616 | + |
582 | 617 | IS_DUPLICATE_DATA = [ |
583 | 618 | ('one two three four', 'one two three four', True), |
584 | | - ('one two three four', 'one two thre four', False), |
| 619 | + ('one two three four', 'one two thre four', True), |
585 | 620 | ('one two three four', 'one two three five', False), |
| 621 | + ('hello', 'helo', True), |
| 622 | + ('hello', 'helio', False), |
| 623 | + (*with_edits(TEXT, EDITS, "you're"), True), |
| 624 | + (*with_edits(TEXT, EXTRA_EDITS, "you're"), False), |
| 625 | + (*with_edits(TEXT, EDITS, "parallel"), True), |
| 626 | + (*with_edits(TEXT, EXTRA_EDITS, "parallel"), False), |
| 627 | + (*with_edits(TEXT, EDITS, "items"), True), |
| 628 | + (*with_edits(TEXT, EXTRA_EDITS, "items"), False), |
| 629 | + (*with_edits(TEXT, EDITS, "anymore"), True), |
| 630 | + (*with_edits(TEXT, EXTRA_EDITS, "anymore"), False), |
| 631 | + (*with_edits(TEXT, EDITS, "$"), True), |
| 632 | + (*with_edits(TEXT, EXTRA_EDITS, "$"), False), |
586 | 633 | ] |
587 | 634 |
|
588 | 635 |
|
589 | | -@pytest.mark.parametrize('one, two, expected', IS_DUPLICATE_DATA) |
def long_ids(s, max_len=20, head=6, tail=10):
    """Abbreviate a long string parameter for use as a pytest test id.

    Intended as the ``ids=`` callable of ``pytest.mark.parametrize``, which
    calls it with a single argument (the parameter value).

    Args:
        s: The parameter value; only ``str`` values are abbreviated.
        max_len: Strings longer than this are abbreviated.
        head: Number of leading characters to keep.
        tail: Number of trailing characters to keep.

    Returns:
        ``head`` characters + ``'...'`` + ``tail`` characters for a string
        longer than ``max_len``; otherwise ``None``.
    """
    if isinstance(s, str) and len(s) > max_len:
        # Slice from an explicit start so tail=0 yields '' (s[-0:] would
        # return the whole string).
        return s[:head] + '...' + s[max(len(s) - tail, 0):]
    return None
| 640 | + |
| 641 | + |
| 642 | +@pytest.mark.parametrize('one, two, expected', IS_DUPLICATE_DATA, ids=long_ids) |
590 | 643 | def test_is_duplicate(one, two, expected): |
591 | | - assert is_duplicate(one.split(), two.split()) == expected |
| 644 | + actual = is_duplicate(tokenize_content(one), tokenize_content(two)) |
| 645 | + assert actual == expected |
592 | 646 |
|
593 | 647 |
|
594 | 648 | @pytest.mark.parametrize( |
|
0 commit comments