News download: Add API to cleanly abort the download of an article during the preprocess stage based on the article's contents.

2025-08-07 09:01:38 -04:00 · 2015-03-25 17:19:52 +05:30 · 2015-03-25 17:19:52 +05:30 · 93916c5ffd
commit 93916c5ffd
parent da8a8781f7
2 changed files with 26 additions and 10 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -22,8 +22,7 @@ from calibre.web import Recipe
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
 from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
-from calibre.web.fetch.simple import option_parser as web2disk_option_parser
-from calibre.web.fetch.simple import RecursiveFetcher
+from calibre.web.fetch.simple import option_parser as web2disk_option_parser, RecursiveFetcher, AbortArticle
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.date import now as nowf
@@ -584,6 +583,12 @@ class BasicNewsRecipe(Recipe):
        '''
        return None

+    def abort_article(self, msg=None):
+        ''' Call this method inside any of the preprocess methods to abort the
+        download for the current article. Useful to skip articles that contain
+        inappropriate content, such as pure video articles. '''
+        raise AbortArticle(msg or _('Article download aborted'))
+
    def preprocess_raw_html(self, raw_html, url):
        '''
        This method is called with the source of each downloaded :term:`HTML` file, before
@@ -1572,13 +1577,19 @@ class BasicNewsRecipe(Recipe):

    def error_in_article_download(self, request, traceback):
        self.jobs_done += 1
-        self.log.error('Failed to download article:', request.article.title,
-        'from', request.article.url)
-        self.log.debug(traceback)
-        self.log.debug('\n')
-        self.report_progress(float(self.jobs_done)/len(self.jobs),
-                _('Article download failed: %s')%force_unicode(request.article.title))
-        self.failed_downloads.append((request.feed, request.article, traceback))
+        if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None:
+            self.log.warn('Aborted download of article:', request.article.title,
+                          'from', request.article.url)
+            self.report_progress(float(self.jobs_done)/len(self.jobs),
+                _('Article download aborted: %s')%force_unicode(request.article.title))
+        else:
+            self.log.error('Failed to download article:', request.article.title,
+            'from', request.article.url)
+            self.log.debug(traceback)
+            self.log.debug('\n')
+            self.report_progress(float(self.jobs_done)/len(self.jobs),
+                    _('Article download failed: %s')%force_unicode(request.article.title))
+            self.failed_downloads.append((request.feed, request.article, traceback))

    def parse_feeds(self):
        '''
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -23,6 +23,9 @@ from calibre.utils.magick import Image
 from calibre.utils.magick.draw import identify_data, thumbnail
 from calibre.utils.imghdr import what

+class AbortArticle(Exception):
+    pass
+
 class FetchError(Exception):
    pass

@@ -567,7 +570,9 @@ class RecursiveFetcher(object):

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
-                except Exception:
+                except Exception as err:
+                    if isinstance(err, AbortArticle):
+                        raise
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally: