Skip to content

Commit 4102a05

Browse files
committed
entry_dedupe: better is_duplicate(), increased thresholds, tests. #371
1 parent d4ac1e9 commit 4102a05

File tree

2 files changed

+125
-73
lines changed

2 files changed

+125
-73
lines changed

src/reader/plugins/entry_dedupe.py

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,6 @@
156156
from functools import lru_cache
157157
from itertools import chain
158158
from itertools import islice
159-
from typing import NamedTuple
160159

161160
from reader._storage._html_utils import strip_html
162161
from reader._utils import BetterStrPartial as partial
@@ -576,47 +575,46 @@ def strip_accents(s):
576575
# text similarity
577576

578577

579-
class _Threshold(NamedTuple):
580-
length: int
581-
similarity: float
582-
583-
584-
# thresholds originally chosen in
585-
# https://github.com/lemon24/reader/issues/202#issuecomment-904139483
586-
# all figures in comments for 4-grams, substitutions only
587-
_THRESHOLDS = [
588-
# 2 fully-spaced subs in the middle,
589-
# 4 subs with consecutive on odd or even indexes in the middle,
590-
# 7 subs with consecutive indexes in the middle,
591-
# 10 subs at one end
592-
_Threshold(64, 0.7),
593-
# 1 substitution in the middle,
594-
# or ~4 at the ends
595-
_Threshold(48, 0.8),
596-
# 1 substitution at the end
597-
_Threshold(32, 0.9),
578+
# [(length, tokens_are_chars, n, threshold), ...]
579+
_IS_DUPLICATE_THRESHOLDS = [
580+
# for shorter texts, we use character ngrams instead of word ngrams,
581+
# since they're more forgiving of small changes (e.g. typos);
582+
# thresholds based on the "reasonable" edits in test_is_duplicate TEXT
583+
(12, True, 3, 0.6),
584+
(200, True, 3, 0.7),
585+
(400, True, 4, 0.7),
586+
(800, True, 4, 0.8),
587+
# for longer texts, we switch to words, since character ngrams are slow
588+
(1600, False, 3, 0.7),
589+
# thresholds based on the 0.8 value mentioned in [1],
590+
# but increasing towards 0.9 since 0.8 seems too low, e.g.
591+
# removing 10 words from the middle of 100 -> similarity 0.84 (n=4)
592+
# [1]: https://github.com/lemon24/reader/issues/202#issuecomment-904139483
593+
(2400, False, 3, 0.8),
594+
(3600, False, 4, 0.8),
595+
(4800, False, 4, 0.9),
598596
]
599597

600598

601599
def is_duplicate(one, two):
602-
# original logic doesn't handle short text well,
603-
# so it just returns false if the inputs are not identical
604-
605600
if one == two:
606601
return True
607602

608-
min_length = min(len(one), len(two))
603+
avg_length = (sum(map(len, one)) + sum(map(len, two))) / 2
609604

610-
if min_length < min(t.length for t in _THRESHOLDS):
611-
return False
605+
for length, *params in _IS_DUPLICATE_THRESHOLDS: # pragma: no cover
606+
tokens_are_chars, n, threshold = params
607+
if avg_length <= length:
608+
break
612609

613-
similarity = jaccard_similarity(ngrams(one, 4), ngrams(two, 4))
610+
if tokens_are_chars:
611+
one = ' '.join(one)
612+
two = ' '.join(two)
614613

615-
for threshold in _THRESHOLDS:
616-
if min_length >= threshold.length and similarity >= threshold.similarity:
617-
return True
614+
pad = min(len(one), len(two)) < 100
615+
similarity = jaccard_similarity(ngrams(one, n, pad), ngrams(two, n, pad))
618616

619-
return False
617+
return similarity >= threshold
620618

621619

622620
def jaccard_similarity(one, two):

tests/test_plugins_entry_dedupe.py

Lines changed: 96 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import random
2+
import re
23

34
import pytest
45

@@ -320,45 +321,7 @@ def make_entry(title=None, summary=None, content=None):
320321
make_entry(title='title', content=('value', 'text/html')),
321322
True,
322323
),
323-
(
324-
make_entry(title='title', summary='one ' * 40),
325-
make_entry(title='title', summary='one ' * 39 + 'two '),
326-
True,
327-
),
328-
(
329-
make_entry(title='title', summary='one ' * 40),
330-
make_entry(title='title', summary='one ' * 38),
331-
True,
332-
),
333-
(
334-
make_entry(title='title', summary='one ' * 40),
335-
make_entry(title='title', summary='one ' * 20 + 'two ' * 3 + 17 * 'one '),
336-
False,
337-
),
338-
(
339-
make_entry(title='title', summary='one ' * 50),
340-
make_entry(
341-
title='title', summary='one ' * 30 + 'two ' + 17 * 'one ' + 'three '
342-
),
343-
True,
344-
),
345-
(
346-
make_entry(title='title', summary='one ' * 50),
347-
make_entry(title='title', summary='one ' * 30 + 'two ' * 5 + 25 * 'one '),
348-
False,
349-
),
350-
(
351-
make_entry(title='title', summary='one ' * 70),
352-
make_entry(
353-
title='title', summary='one ' * 30 + 'two ' * 5 + 33 * 'one ' + 'three '
354-
),
355-
True,
356-
),
357-
(
358-
make_entry(title='title', summary='one ' * 70),
359-
make_entry(title='title', summary='one ' * 30 + 'two ' * 10 + 30 * 'one '),
360-
False,
361-
),
324+
# TODO: test fuzzy matching (just one test)
362325
# TODO: test normalization
363326
]
364327

@@ -579,16 +542,107 @@ def test_tokenize(tokenize, input, expected):
579542
assert tokenize(input) == expected
580543

581544

545+
def with_edits(text, edits, end_at=None):
546+
if end_at:
547+
text = re.search(rf"(?s).*?{end_at}", text)[0]
548+
edited = text
549+
for edit in edits:
550+
edited = edited.replace(*edit)
551+
return text, edited
552+
553+
554+
TEXT = """\
555+
So, you're doing some I/O bound stuff, in parallel.
556+
557+
Maybe you're scraping some websites – a lot of websites.
558+
559+
Maybe you're updating or deleting millions of DynamoDB items.
560+
561+
You've got your [ThreadPoolExecutor],
562+
you've increased the number of threads and tuned connection limits...
563+
but after some point, **it's just not getting any faster**.
564+
You look at your Python process,
565+
and you see CPU utilization hovers above 100%.
566+
567+
You *could* split the work into batches
568+
and have a [ProcessPoolExecutor]
569+
run your original code in separate processes.
570+
But that requires yet more code, and a bunch of changes, which is no fun.
571+
And maybe your input is not that easy to split into batches.
572+
573+
If only we had an executor that
574+
**worked seamlessly across processes and threads**.
575+
576+
Well, you're in luck, since that's exactly what we're building today!
577+
578+
And even better, in a couple years you won't even need it anymore.
579+
580+
---
581+
582+
**asyncio-thread-runner** allows you to run async code from sync code.
583+
584+
This is useful when you're doing some sync stuff, but:
585+
586+
* you also need to do some async stuff, **without** making **everything async**
587+
* maybe the sync stuff is an existing application
588+
* maybe you still want to use your favorite sync library
589+
* or maybe you need just a little async, without having to pay the full price
590+
591+
Features:
592+
593+
* unlike [asyncio.run()], it provides a **long-lived event loop**
594+
* unlike [asyncio.Runner], it runs in a dedicated thread, and you can use it from **multiple threads**
595+
* it allows you to use **async context managers** and **iterables** from sync code
596+
* check out [this article](https://death.andgravity.com/asyncio-bridge) for why these are useful
597+
598+
"""
599+
EDITS = [
600+
("you're", "youre"),
601+
("I/O bound", "IO-bound"),
602+
("parallel", "paralel"),
603+
("Maybe", "And", 1),
604+
("a lot", "lots"),
605+
("PoolExecutor", " Pool Executor"),
606+
("work", "input"),
607+
("you won't even need it anymore", ""),
608+
]
609+
EXTRA_EDITS = EDITS + [
610+
("So", "Soo"),
611+
("stuff", "thing"),
612+
("millions", "billions"),
613+
("across processes and threads", ""),
614+
]
615+
616+
582617
IS_DUPLICATE_DATA = [
583618
('one two three four', 'one two three four', True),
584-
('one two three four', 'one two thre four', False),
619+
('one two three four', 'one two thre four', True),
585620
('one two three four', 'one two three five', False),
621+
('hello', 'helo', True),
622+
('hello', 'helio', False),
623+
(*with_edits(TEXT, EDITS, "you're"), True),
624+
(*with_edits(TEXT, EXTRA_EDITS, "you're"), False),
625+
(*with_edits(TEXT, EDITS, "parallel"), True),
626+
(*with_edits(TEXT, EXTRA_EDITS, "parallel"), False),
627+
(*with_edits(TEXT, EDITS, "items"), True),
628+
(*with_edits(TEXT, EXTRA_EDITS, "items"), False),
629+
(*with_edits(TEXT, EDITS, "anymore"), True),
630+
(*with_edits(TEXT, EXTRA_EDITS, "anymore"), False),
631+
(*with_edits(TEXT, EDITS, "$"), True),
632+
(*with_edits(TEXT, EXTRA_EDITS, "$"), False),
586633
]
587634

588635

589-
@pytest.mark.parametrize('one, two, expected', IS_DUPLICATE_DATA)
636+
def long_ids(s):
637+
if isinstance(s, str):
638+
if len(s) > 20:
639+
return s[:6] + '...' + s[-10:]
640+
641+
642+
@pytest.mark.parametrize('one, two, expected', IS_DUPLICATE_DATA, ids=long_ids)
590643
def test_is_duplicate(one, two, expected):
591-
assert is_duplicate(one.split(), two.split()) == expected
644+
actual = is_duplicate(tokenize_content(one), tokenize_content(two))
645+
assert actual == expected
592646

593647

594648
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)