Skip to content

Commit 4102a05

Browse files
committed
entry_dedupe: better is_duplicate(), increased thresholds, tests. #371
1 parent d4ac1e9 commit 4102a05

File tree

2 files changed

+125
-73
lines changed

2 files changed

+125
-73
lines changed

src/reader/plugins/entry_dedupe.py

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,6 @@
156156
from functools import lru_cache
157157
from itertools import chain
158158
from itertools import islice
159-
from typing import NamedTuple
160159

161160
from reader._storage._html_utils import strip_html
162161
from reader._utils import BetterStrPartial as partial
@@ -576,47 +575,46 @@ def strip_accents(s):
576575
# text similarity
577576

578577

579-
class _Threshold(NamedTuple):
580-
length: int
581-
similarity: float
582-
583-
584-
# thresholds originally chosen in
585-
# https://github.com/lemon24/reader/issues/202#issuecomment-904139483
586-
# all figures in comments for 4-grams, substitutions only
587-
_THRESHOLDS = [
588-
# 2 fully-spaced subs in the middle,
589-
# 4 subs with consecutive on odd or even indexes in the middle,
590-
# 7 subs with consecutive indexes in the middle,
591-
# 10 subs at one end
592-
_Threshold(64, 0.7),
593-
# 1 substitution in the middle,
594-
# or ~4 at the ends
595-
_Threshold(48, 0.8),
596-
# 1 substitution at the end
597-
_Threshold(32, 0.9),
578+
# [(length, tokens_are_chars, n, threshold), ...]
579+
_IS_DUPLICATE_THRESHOLDS = [
580+
# for shorter texts, we use character ngrams instead of word ngrams,
581+
# since they're more forgiving of small changes (e.g. typos);
582+
# thresholds based on the "reasonable" edits in test_is_duplicate TEXT
583+
(12, True, 3, 0.6),
584+
(200, True, 3, 0.7),
585+
(400, True, 4, 0.7),
586+
(800, True, 4, 0.8),
587+
# for longer texts, we switch to words, since character ngrams are slow
588+
(1600, False, 3, 0.7),
589+
# thresholds based on the 0.8 value mentioned in [1],
590+
# but increasing towards 0.9 since 0.8 seems too low, e.g.
591+
# removing 10 words from the middle of 100 -> similarity 0.84 (n=4)
592+
# [1]: https://github.com/lemon24/reader/issues/202#issuecomment-904139483
593+
(2400, False, 3, 0.8),
594+
(3600, False, 4, 0.8),
595+
(4800, False, 4, 0.9),
598596
]
599597

600598

601599
def is_duplicate(one, two):
602-
# original logic doesn't handle short text well,
603-
# so it just returns false if the inputs are not identical
604-
605600
if one == two:
606601
return True
607602

608-
min_length = min(len(one), len(two))
603+
avg_length = (sum(map(len, one)) + sum(map(len, two))) / 2
609604

610-
if min_length < min(t.length for t in _THRESHOLDS):
611-
return False
605+
for length, *params in _IS_DUPLICATE_THRESHOLDS: # pragma: no cover
606+
tokens_are_chars, n, threshold = params
607+
if avg_length <= length:
608+
break
612609

613-
similarity = jaccard_similarity(ngrams(one, 4), ngrams(two, 4))
610+
if tokens_are_chars:
611+
one = ' '.join(one)
612+
two = ' '.join(two)
614613

615-
for threshold in _THRESHOLDS:
616-
if min_length >= threshold.length and similarity >= threshold.similarity:
617-
return True
614+
pad = min(len(one), len(two)) < 100
615+
similarity = jaccard_similarity(ngrams(one, n, pad), ngrams(two, n, pad))
618616

619-
return False
617+
return similarity >= threshold
620618

621619

622620
def jaccard_similarity(one, two):

tests/test_plugins_entry_dedupe.py

Lines changed: 96 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import random
2+
import re
23

34
import pytest
45

@@ -320,45 +321,7 @@ def make_entry(title=None, summary=None, content=None):
320321
make_entry(title='title', content=('value', 'text/html')),
321322
True,
322323
),
323-
(
324-
make_entry(title='title', summary='one ' * 40),
325-
make_entry(title='title', summary='one ' * 39 + 'two '),
326-
True,
327-
),
328-
(
329-
make_entry(title='title', summary='one ' * 40),
330-
make_entry(title='title', summary='one ' * 38),
331-
True,
332-
),
333-
(
334-
make_entry(title='title', summary='one ' * 40),
335-
make_entry(title='title', summary='one ' * 20 + 'two ' * 3 + 17 * 'one '),
336-
False,
337-
),
338-
(
339-
make_entry(title='title', summary='one ' * 50),
340-
make_entry(
341-
title='title', summary='one ' * 30 + 'two ' + 17 * 'one ' + 'three '
342-
),
343-
True,
344-
),
345-
(
346-
make_entry(title='title', summary='one ' * 50),
347-
make_entry(title='title', summary='one ' * 30 + 'two ' * 5 + 25 * 'one '),
348-
False,
349-
),
350-
(
351-
make_entry(title='title', summary='one ' * 70),
352-
make_entry(
353-
title='title', summary='one ' * 30 + 'two ' * 5 + 33 * 'one ' + 'three '
354-
),
355-
True,
356-
),
357-
(
358-
make_entry(title='title', summary='one ' * 70),
359-
make_entry(title='title', summary='one ' * 30 + 'two ' * 10 + 30 * 'one '),
360-
False,
361-
),
324+
# TODO: test fuzzy matching (just one test)
362325
# TODO: test normalization
363326
]
364327

@@ -579,16 +542,107 @@ def test_tokenize(tokenize, input, expected):
579542
assert tokenize(input) == expected
580543

581544

545+
def with_edits(text, edits, end_at=None):
546+
if end_at:
547+
text = re.search(rf"(?s).*?{end_at}", text)[0]
548+
edited = text
549+
for edit in edits:
550+
edited = edited.replace(*edit)
551+
return text, edited
552+
553+
554+
TEXT = """\
555+
So, you're doing some I/O bound stuff, in parallel.
556+
557+
Maybe you're scraping some websites – a lot of websites.
558+
559+
Maybe you're updating or deleting millions of DynamoDB items.
560+
561+
You've got your [ThreadPoolExecutor],
562+
you've increased the number of threads and tuned connection limits...
563+
but after some point, **it's just not getting any faster**.
564+
You look at your Python process,
565+
and you see CPU utilization hovers above 100%.
566+
567+
You *could* split the work into batches
568+
and have a [ProcessPoolExecutor]
569+
run your original code in separate processes.
570+
But that requires yet more code, and a bunch of changes, which is no fun.
571+
And maybe your input is not that easy to split into batches.
572+
573+
If only we had an executor that
574+
**worked seamlessly across processes and threads**.
575+
576+
Well, you're in luck, since that's exactly what we're building today!
577+
578+
And even better, in a couple years you won't even need it anymore.
579+
580+
---
581+
582+
**asyncio-thread-runner** allows you to run async code from sync code.
583+
584+
This is useful when you're doing some sync stuff, but:
585+
586+
* you also need to do some async stuff, **without** making **everything async**
587+
* maybe the sync stuff is an existing application
588+
* maybe you still want to use your favorite sync library
589+
* or maybe you need just a little async, without having to pay the full price
590+
591+
Features:
592+
593+
* unlike [asyncio.run()], it provides a **long-lived event loop**
594+
* unlike [asyncio.Runner], it runs in a dedicated thread, and you can use it from **multiple threads**
595+
* it allows you to use **async context managers** and **iterables** from sync code
596+
* check out [this article](https://death.andgravity.com/asyncio-bridge) for why these are useful
597+
598+
"""
599+
EDITS = [
600+
("you're", "youre"),
601+
("I/O bound", "IO-bound"),
602+
("parallel", "paralel"),
603+
("Maybe", "And", 1),
604+
("a lot", "lots"),
605+
("PoolExecutor", " Pool Executor"),
606+
("work", "input"),
607+
("you won't even need it anymore", ""),
608+
]
609+
EXTRA_EDITS = EDITS + [
610+
("So", "Soo"),
611+
("stuff", "thing"),
612+
("millions", "billions"),
613+
("across processes and threads", ""),
614+
]
615+
616+
582617
IS_DUPLICATE_DATA = [
583618
('one two three four', 'one two three four', True),
584-
('one two three four', 'one two thre four', False),
619+
('one two three four', 'one two thre four', True),
585620
('one two three four', 'one two three five', False),
621+
('hello', 'helo', True),
622+
('hello', 'helio', False),
623+
(*with_edits(TEXT, EDITS, "you're"), True),
624+
(*with_edits(TEXT, EXTRA_EDITS, "you're"), False),
625+
(*with_edits(TEXT, EDITS, "parallel"), True),
626+
(*with_edits(TEXT, EXTRA_EDITS, "parallel"), False),
627+
(*with_edits(TEXT, EDITS, "items"), True),
628+
(*with_edits(TEXT, EXTRA_EDITS, "items"), False),
629+
(*with_edits(TEXT, EDITS, "anymore"), True),
630+
(*with_edits(TEXT, EXTRA_EDITS, "anymore"), False),
631+
(*with_edits(TEXT, EDITS, "$"), True),
632+
(*with_edits(TEXT, EXTRA_EDITS, "$"), False),
586633
]
587634

588635

589-
@pytest.mark.parametrize('one, two, expected', IS_DUPLICATE_DATA)
636+
def long_ids(s):
637+
if isinstance(s, str):
638+
if len(s) > 20:
639+
return s[:6] + '...' + s[-10:]
640+
641+
642+
@pytest.mark.parametrize('one, two, expected', IS_DUPLICATE_DATA, ids=long_ids)
590643
def test_is_duplicate(one, two, expected):
591-
assert is_duplicate(one.split(), two.split()) == expected
644+
actual = is_duplicate(tokenize_content(one), tokenize_content(two))
645+
assert actual == expected
592646

593647

594648
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)