From 93916c5ffd99610473ef036b30e7ef1655e319d4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 25 Mar 2015 17:19:52 +0530 Subject: [PATCH] News download: Add API to cleanly abort the download of an article during the preprocess stage based on the article's contents. --- src/calibre/web/feeds/news.py | 29 ++++++++++++++++++++--------- src/calibre/web/fetch/simple.py | 7 ++++++- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 3213be798f..1b5aaeb19c 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -22,8 +22,7 @@ from calibre.web import Recipe from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed -from calibre.web.fetch.simple import option_parser as web2disk_option_parser -from calibre.web.fetch.simple import RecursiveFetcher +from calibre.web.fetch.simple import option_parser as web2disk_option_parser, RecursiveFetcher, AbortArticle from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf @@ -584,6 +583,12 @@ class BasicNewsRecipe(Recipe): ''' return None + def abort_article(self, msg=None): + ''' Call this method inside any of the preprocess methods to abort the + download for the current article. Useful to skip articles that contain + inappropriate content, such as pure video articles. ''' + raise AbortArticle(msg or _('Article download aborted')) + def preprocess_raw_html(self, raw_html, url): ''' This method is called with the source of each downloaded :term:`HTML` file, before @@ -1572,13 +1577,19 @@ class BasicNewsRecipe(Recipe): def error_in_article_download(self, request, traceback): self.jobs_done += 1 - self.log.error('Failed to download article:', request.article.title, - 'from', request.article.url) - self.log.debug(traceback) - self.log.debug('\n') - self.report_progress(float(self.jobs_done)/len(self.jobs), - _('Article download failed: %s')%force_unicode(request.article.title)) - self.failed_downloads.append((request.feed, request.article, traceback)) + if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None: + self.log.warn('Aborted download of article:', request.article.title, + 'from', request.article.url) + self.report_progress(float(self.jobs_done)/len(self.jobs), + _('Article download aborted: %s')%force_unicode(request.article.title)) + else: + self.log.error('Failed to download article:', request.article.title, + 'from', request.article.url) + self.log.debug(traceback) + self.log.debug('\n') + self.report_progress(float(self.jobs_done)/len(self.jobs), + _('Article download failed: %s')%force_unicode(request.article.title)) + self.failed_downloads.append((request.feed, request.article, traceback)) def parse_feeds(self): ''' diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 8c7f32c750..1491b6405a 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -23,6 +23,9 @@ from calibre.utils.magick import Image from calibre.utils.magick.draw import identify_data, thumbnail from calibre.utils.imghdr import what +class AbortArticle(Exception): + pass + class FetchError(Exception): pass @@ -567,7 +570,9 @@ class RecursiveFetcher(object): save_soup(soup, res) self.localize_link(tag, 'href', res) - except Exception: + except Exception as err: + if isinstance(err, AbortArticle): + raise self.failed_links.append((iurl, traceback.format_exc())) self.log.exception('Could not fetch link', iurl) finally: