Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
News download: Add a field to allow recipe authors to tell calibre to remove duplicate articles that are present in more than one section from the download.
parent 7cd23bbeaa
commit c35a81dafa
@@ -265,6 +265,12 @@ class Feed(object):
         if i > -1:
             self.articles[i:i+1] = []
 
+    def remove_article(self, article):
+        try:
+            self.articles.remove(article)
+        except ValueError:
+            pass
+
 
 class FeedCollection(list):
 
     def __init__(self, feeds):
@@ -321,6 +321,15 @@ class BasicNewsRecipe(Recipe):
     #: The string will be used as the disabled message
     recipe_disabled = None
 
+    #: Ignore duplicates of articles that are present in more than one section.
+    #: A duplicate article is an article that has the same title and/or URL.
+    #: To ignore articles with the same title, set this to:
+    #: ignore_duplicate_articles = {'title'}
+    #: To use URLs instead, set it to:
+    #: ignore_duplicate_articles = {'url'}
+    #: To match on title or URL, set it to:
+    #: ignore_duplicate_articles = {'title', 'url'}
+    ignore_duplicate_articles = None
+
     # See the built-in profiles for examples of these settings.
 
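The new ignore_duplicate_articles field documented above is set directly on a recipe class. A minimal sketch of a recipe using it follows; the class name, title, and feed URLs are placeholders, not part of this commit, and only the BasicNewsRecipe import and the field itself come from calibre.

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleNews(BasicNewsRecipe):
    # Hypothetical recipe, shown only to illustrate the new field.
    title = 'Example News'
    feeds = [
        ('Front Page', 'http://example.com/front.rss'),  # placeholder URLs
        ('World', 'http://example.com/world.rss'),
    ]
    # Skip an article in a later section when an article with the same
    # title or URL was already seen in an earlier section.
    ignore_duplicate_articles = {'title', 'url'}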
@@ -1019,6 +1028,24 @@ class BasicNewsRecipe(Recipe):
         url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
         return self._fetch_article(url, dir, f, a, num_of_feeds)
 
+    def remove_duplicate_articles(self, feeds):
+        seen_keys = defaultdict(set)
+        remove = []
+        for f in feeds:
+            for article in f:
+                for key in self.ignore_duplicate_articles:
+                    val = getattr(article, key)
+                    seen = seen_keys[key]
+                    if val:
+                        if val in seen:
+                            remove.append((f, article))
+                        else:
+                            seen.add(val)
+
+        for feed, article in remove:
+            self.log.debug('Removing duplicate article: %s from section: %s'%(
+                article.title, feed.title))
+            feed.remove_article(article)
+
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
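The pass above keeps the first occurrence of each title/URL across sections and queues later occurrences for removal via Feed.remove_article(). Below is a standalone sketch of the same first-wins logic, using a namedtuple and plain lists instead of calibre's Feed and Article objects; all names in it are illustrative, not calibre API.

from collections import defaultdict, namedtuple

Article = namedtuple('Article', 'title url')  # stand-in for calibre's Article objects

def find_duplicates(feeds, keys=('title', 'url')):
    # feeds is a list of (section_name, [Article, ...]) pairs; returns the
    # (section, article) pairs whose title or URL was already seen earlier.
    seen_keys = defaultdict(set)
    remove = []
    for section, articles in feeds:
        for article in articles:
            for key in keys:
                val = getattr(article, key)
                if not val:
                    continue
                if val in seen_keys[key]:
                    remove.append((section, article))
                    break  # already flagged, no need to check the other key
                seen_keys[key].add(val)
    return remove

feeds = [
    ('Front Page', [Article('Budget passes', 'http://example.com/a1')]),
    ('Politics', [Article('Budget passes', 'http://example.com/a1-reprint')]),
]
print(find_duplicates(feeds))  # the Politics reprint is reported as a duplicate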
@@ -1033,6 +1060,9 @@ class BasicNewsRecipe(Recipe):
         if not feeds:
             raise ValueError('No articles found, aborting')
 
+        if self.ignore_duplicate_articles is not None:
+            self.remove_duplicate_articles(feeds)
+
         #feeds = FeedCollection(feeds)
 
         self.report_progress(0, _('Trying to download cover...'))