Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
News download: Add a field to allow recipe authors to tell calibre to remove duplicate articles that are present in more than one section from the download.
parent 7cd23bbeaa
commit c35a81dafa
@@ -265,6 +265,12 @@ class Feed(object):
         if i > -1:
             self.articles[i:i+1] = []
 
+    def remove_article(self, article):
+        try:
+            self.articles.remove(article)
+        except ValueError:
+            pass
+
 
 class FeedCollection(list):
 
     def __init__(self, feeds):
@@ -321,6 +321,15 @@ class BasicNewsRecipe(Recipe):
     #: The string will be used as the disabled message
     recipe_disabled = None
 
+    #: Ignore duplicates of articles that are present in more than one section.
+    #: A duplicate article is an article that has the same title and/or URL.
+    #: To ignore articles with the same title, set this to:
+    #: ignore_duplicate_articles = {'title'}
+    #: To use URLs instead, set it to:
+    #: ignore_duplicate_articles = {'url'}
+    #: To match on title or URL, set it to:
+    #: ignore_duplicate_articles = {'title', 'url'}
+    ignore_duplicate_articles = None
+
     # See the built-in profiles for examples of these settings.
 
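The new ignore_duplicate_articles field documented above is set directly on a recipe class. A minimal sketch of a recipe using it follows; the class name, title, and feed URLs are placeholders, not part of this commit, and only the BasicNewsRecipe import and the field itself come from calibre.

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleNews(BasicNewsRecipe):
    # Hypothetical recipe, shown only to illustrate the new field.
    title = 'Example News'
    feeds = [
        ('Front Page', 'http://example.com/front.rss'),  # placeholder URLs
        ('World', 'http://example.com/world.rss'),
    ]
    # Skip an article in a later section when an article with the same
    # title or URL was already seen in an earlier section.
    ignore_duplicate_articles = {'title', 'url'}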
@@ -1019,6 +1028,24 @@ class BasicNewsRecipe(Recipe):
         url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
         return self._fetch_article(url, dir, f, a, num_of_feeds)
 
+    def remove_duplicate_articles(self, feeds):
+        seen_keys = defaultdict(set)
+        remove = []
+        for f in feeds:
+            for article in f:
+                for key in self.ignore_duplicate_articles:
+                    val = getattr(article, key)
+                    seen = seen_keys[key]
+                    if val:
+                        if val in seen:
+                            remove.append((f, article))
+                        else:
+                            seen.add(val)
+
+        for feed, article in remove:
+            self.log.debug('Removing duplicate article: %s from section: %s'%(
+                article.title, feed.title))
+            feed.remove_article(article)
+
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
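The pass above keeps the first occurrence of each title/URL across sections and queues later occurrences for removal via Feed.remove_article(). Below is a standalone sketch of the same first-wins logic, using a namedtuple and plain lists instead of calibre's Feed and Article objects; all names in it are illustrative, not calibre API.

from collections import defaultdict, namedtuple

Article = namedtuple('Article', 'title url')  # stand-in for calibre's Article objects

def find_duplicates(feeds, keys=('title', 'url')):
    # feeds is a list of (section_name, [Article, ...]) pairs; returns the
    # (section, article) pairs whose title or URL was already seen earlier.
    seen_keys = defaultdict(set)
    remove = []
    for section, articles in feeds:
        for article in articles:
            for key in keys:
                val = getattr(article, key)
                if not val:
                    continue
                if val in seen_keys[key]:
                    remove.append((section, article))
                    break  # already flagged, no need to check the other key
                seen_keys[key].add(val)
    return remove

feeds = [
    ('Front Page', [Article('Budget passes', 'http://example.com/a1')]),
    ('Politics', [Article('Budget passes', 'http://example.com/a1-reprint')]),
]
print(find_duplicates(feeds))  # the Politics reprint is reported as a duplicate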
@@ -1033,6 +1060,9 @@ class BasicNewsRecipe(Recipe):
         if not feeds:
             raise ValueError('No articles found, aborting')
 
+        if self.ignore_duplicate_articles is not None:
+            self.remove_duplicate_articles(feeds)
+
         #feeds = FeedCollection(feeds)
 
         self.report_progress(0, _('Trying to download cover...'))