Skip to content

Commit 60cdfa8

Browse files
committed
entry_dedupe: link, published, and published minute heuristics. #371
1 parent 76fa3d0 commit 60cdfa8

File tree

2 files changed

+79
-37
lines changed

2 files changed

+79
-37
lines changed

src/reader/plugins/entry_dedupe.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@
156156
from functools import lru_cache
157157
from itertools import chain
158158
from itertools import islice
159+
from urllib.parse import urlparse
159160

160161
from reader._storage._html_utils import strip_html
161162
from reader._utils import BetterStrPartial as partial
@@ -324,25 +325,56 @@ def group_entries(all_entries, new_entries, is_duplicate):
324325

325326

326327
def title_grouper(entries, new_entries):
    """Group entries whose titles tokenize to the same value.

    Delegates to group_by(), keyed on tokenize_title() of each
    entry's title.
    """

    def key(entry):
        return tokenize_title(entry.title)

    return group_by(key, entries, new_entries)
329+
330+
331+
def link_grouper(entries, new_entries):
    """Group entries whose links normalize to the same URL.

    Delegates to group_by(), keyed on normalize_url() of each
    entry's link.
    """

    def key(entry):
        return normalize_url(entry.link)

    return group_by(key, entries, new_entries)
333+
334+
335+
def normalize_url(url):
    """Return a canonical form of *url* suitable as a grouping key.

    The normalization is deliberately lossy, so that trivially different
    links to the same page compare equal:

    * ``http`` is rewritten to ``https``
    * the network location is lower-cased
    * trailing slashes are stripped from the path

    Returns None if *url* is missing/empty or cannot be parsed.

    """
    # A missing link must not produce a key. urlparse(None) returns a
    # bytes result, which would blow up with TypeError in rstrip('/') below,
    # and '' would become a real key shared by every link-less entry.
    # Returning None matches what the date-based groupers do for missing data.
    if not url:
        return None

    try:
        parts = urlparse(url)
    except ValueError:
        # e.g. malformed IPv6 literal in the network location
        return None

    scheme = parts.scheme.lower()
    if scheme == 'http':
        scheme = 'https'

    netloc = parts.netloc.lower()
    path = parts.path.rstrip('/')

    return parts._replace(scheme=scheme, netloc=netloc, path=path).geturl()
349+
350+
351+
def published_grouper(entries, new_entries):
    """Group entries published (or, failing that, updated) in the same minute.

    The key is the ISO 8601 representation of the date truncated to minutes;
    entries with neither date get a None key.
    """

    def key(entry):
        timestamp = entry.published or entry.updated
        return timestamp.isoformat(timespec='minutes') if timestamp else None

    return group_by(key, entries, new_entries)
331359

332360

333-
GROUPERS = [title_grouper]
361+
def published_day_grouper(entries, new_entries):
    """Group entries published (or, failing that, updated) on the same day.

    The key is the calendar date of the entry's published/updated value;
    entries with neither date get a None key.
    """

    def key(entry):
        timestamp = entry.published or entry.updated
        if timestamp:
            return timestamp.date()
        return None

    return group_by(key, entries, new_entries)
369+
370+
371+
# Grouping heuristics: by tokenized title, by normalized link,
# by published/updated minute, and by published/updated day.
# NOTE(review): how (and in what order) these are applied is decided by
# code not visible here -- confirm before reordering.
GROUPERS = [title_grouper, link_grouper, published_grouper, published_day_grouper]
334372

335373

336374
# entry (content) similarity
337375

338376

339377
def is_duplicate_entry(one, two):
340-
# TODO: remove title checks once thresholds are increased for #371
341-
if not one.title or not two.title:
342-
return False
343-
if tokenize_title(one.title) != tokenize_title(two.title):
344-
return False
345-
346378
one_fields = _content_fields(one)
347379
two_fields = _content_fields(two)
348380

tests/test_plugins_entry_dedupe.py

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -29,40 +29,50 @@ def with_plugin():
2929
"""Tell reader to use the plugin from the beginning."""
3030

3131

32-
def test_duplicates_are_deleted(reader, with_plugin, parser):
33-
# detailed matching in test_is_duplicate_entry
32+
def test_only_duplicates_are_deleted(reader, parser):
33+
# detailed/fuzzy content matching tested in test_is_duplicate*
3434

3535
reader.add_feed(parser.feed(1))
3636

37-
yesterday = datetime(2010, 1, 1)
38-
parser.entry(1, 1, yesterday, title='title', summary='value')
39-
reader.update_feeds()
37+
parser.entry(
38+
1,
39+
'different',
40+
datetime(2010, 1, 1, 2, 3, 4),
41+
title='title',
42+
link='link',
43+
summary='another',
44+
)
45+
parser.entry(1, 'title', title='title', summary='value')
46+
parser.entry(1, 'title-x', summary='value')
47+
parser.entry(1, 'link', link='link', summary='value')
48+
parser.entry(1, 'link-x', link='link')
49+
parser.entry(1, 'published', published=datetime(2010, 1, 1, 2, 3), summary='value')
50+
parser.entry(1, 'published-x', published=datetime(2010, 1, 1, 2, 3))
51+
parser.entry(1, 'published-day', datetime(2010, 1, 1), summary='value')
52+
parser.entry(1, 'published-day-x', datetime(2010, 1, 1))
4053

41-
today = datetime(2010, 1, 2)
42-
parser.entry(1, 2, today, title='title', summary='value')
4354
reader.update_feeds()
4455

45-
assert {e.id for e in reader.get_entries()} == {'1, 2'}
46-
47-
48-
def test_non_duplicates_are_ignored(reader, with_plugin, parser):
49-
# detailed matching in test_is_duplicate_entry
50-
51-
reader.add_feed(parser.feed(1))
52-
53-
yesterday = datetime(2010, 1, 1)
54-
parser.entry(1, 1, None, title=None)
55-
parser.entry(1, 2, yesterday, title='title')
56-
parser.entry(1, 3, yesterday, summary='value')
57-
parser.entry(1, 4, yesterday, title='title', summary='another')
58-
parser.entry(1, 5, yesterday, title='another', summary='value')
59-
reader.update_feeds()
56+
init_reader(reader)
6057

61-
today = datetime(2010, 1, 2)
62-
parser.entry(1, 6, today, title='title', summary='value')
58+
parser.entry(
59+
1,
60+
'entry',
61+
datetime(2010, 1, 1, 2, 3, 4),
62+
title='title',
63+
link='link',
64+
summary='value',
65+
)
6366
reader.update_feeds()
6467

65-
assert {eval(e.id)[1] for e in reader.get_entries()} == {1, 2, 3, 4, 5, 6}
68+
assert {e.id for e in reader.get_entries()} == {
69+
'different',
70+
'title-x',
71+
'link-x',
72+
'published-x',
73+
'published-day-x',
74+
'entry',
75+
}
6676

6777

6878
def test_duplicates_in_another_feed_are_ignored(reader, with_plugin, parser):
@@ -270,11 +280,11 @@ def make_entry(title=None, summary=None, content=None):
270280
IS_DUPLICATE_ENTRY_DATA = [
271281
(make_entry(), make_entry(), False),
272282
(make_entry(title='title'), make_entry(title='title'), False),
273-
(make_entry(summary='summary'), make_entry(summary='summary'), False),
283+
(make_entry(summary='summary'), make_entry(summary='summary'), True),
274284
(
275285
make_entry(content=('value', 'text/html')),
276286
make_entry(content=('value', 'text/html')),
277-
False,
287+
True,
278288
),
279289
(
280290
make_entry(title='title', summary='summary'),
@@ -284,7 +294,7 @@ def make_entry(title=None, summary=None, content=None):
284294
(
285295
make_entry(title='title', summary='summary'),
286296
make_entry(title='other', summary='summary'),
287-
False,
297+
True,
288298
),
289299
(
290300
make_entry(title='title', summary='summary'),
@@ -299,7 +309,7 @@ def make_entry(title=None, summary=None, content=None):
299309
(
300310
make_entry(title='title', content=('value', 'text/html')),
301311
make_entry(title='other', content=('value', 'text/html')),
302-
False,
312+
True,
303313
),
304314
(
305315
make_entry(title='title', content=('value', 'text/html')),

0 commit comments

Comments
 (0)