News download: Add API to cleanly abort the download of an article during the preprocess stage based on the article's contents.

This commit is contained in:
Kovid Goyal 2015-03-25 17:19:52 +05:30
parent da8a8781f7
commit 93916c5ffd
2 changed files with 26 additions and 10 deletions

View File

@ -22,8 +22,7 @@ from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.web.fetch.simple import option_parser as web2disk_option_parser, RecursiveFetcher, AbortArticle
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
@ -584,6 +583,12 @@ class BasicNewsRecipe(Recipe):
'''
return None
def abort_article(self, msg=None):
    '''
    Call this method inside any of the preprocess methods to abort the
    download for the current article. Useful to skip articles that contain
    inappropriate content, such as pure video articles.

    :param msg: Optional message carried by the raised exception. When it
        is None or empty, a translated default message is used instead.
    :raises AbortArticle: Always; this method never returns.
    '''
    raise AbortArticle(msg or _('Article download aborted'))
def preprocess_raw_html(self, raw_html, url):
'''
This method is called with the source of each downloaded :term:`HTML` file, before
@ -1572,13 +1577,19 @@ class BasicNewsRecipe(Recipe):
def error_in_article_download(self, request, traceback):
    '''
    Record that the download job described by ``request`` did not complete.

    The job counter is always advanced. A download that was deliberately
    aborted (its traceback contains a line starting with the
    ``AbortArticle`` exception name) is reported as a warning only. Any
    other error is logged as a failure and appended to
    ``self.failed_downloads``.

    :param request: The work request; its ``feed`` and ``article``
        attributes are read here.
    :param traceback: Formatted traceback text for the error, may be empty
        or None.
    '''
    self.jobs_done += 1
    article = request.article
    # Depending on the Python version, the final traceback line names the
    # exception either bare ("AbortArticle: ...") or module-qualified
    # ("pkg.mod.AbortArticle: ..."), so accept an optional dotted prefix.
    aborted = bool(traceback) and re.search(
        r'^(?:[\w.]+\.)?AbortArticle:', traceback, flags=re.M) is not None
    if aborted:
        self.log.warn('Aborted download of article:', article.title,
                      'from', article.url)
        self.report_progress(float(self.jobs_done)/len(self.jobs),
            _('Article download aborted: %s')%force_unicode(article.title))
    else:
        self.log.error('Failed to download article:', article.title,
                       'from', article.url)
        self.log.debug(traceback)
        self.log.debug('\n')
        self.report_progress(float(self.jobs_done)/len(self.jobs),
            _('Article download failed: %s')%force_unicode(article.title))
        # Only genuine failures are recorded; aborted articles are skipped
        # on purpose and must not show up in the failure list.
        self.failed_downloads.append((request.feed, article, traceback))
def parse_feeds(self):
'''

View File

@ -23,6 +23,9 @@ from calibre.utils.magick import Image
from calibre.utils.magick.draw import identify_data, thumbnail
from calibre.utils.imghdr import what
class AbortArticle(Exception):
    '''
    Raised to abort the download of the article currently being processed.

    It carries no state beyond the standard exception message.
    '''
class FetchError(Exception):
    '''
    Exception type for errors raised by this module's fetching code.

    NOTE(review): the raise sites are not visible in this view; presumably
    signals that a resource could not be fetched -- confirm against callers.
    '''
@ -567,7 +570,9 @@ class RecursiveFetcher(object):
save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception:
except Exception as err:
if isinstance(err, AbortArticle):
raise
self.failed_links.append((iurl, traceback.format_exc()))
self.log.exception('Could not fetch link', iurl)
finally: