mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-07 09:01:38 -04:00
News download: Add API to cleanly abort the download of an article during the preprocess stage based on the article's contents.
This commit is contained in:
parent
da8a8781f7
commit
93916c5ffd
@ -22,8 +22,7 @@ from calibre.web import Recipe
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
|
||||
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
|
||||
from calibre.web.fetch.simple import RecursiveFetcher
|
||||
from calibre.web.fetch.simple import option_parser as web2disk_option_parser, RecursiveFetcher, AbortArticle
|
||||
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.utils.date import now as nowf
|
||||
@ -584,6 +583,12 @@ class BasicNewsRecipe(Recipe):
|
||||
'''
|
||||
return None
|
||||
|
||||
def abort_article(self, msg=None):
    '''
    Abort the download of the article that is currently being processed.

    Meant to be called from inside any of the preprocess methods, for
    example to skip articles that contain inappropriate content, such as
    pure video articles.

    :param msg: Optional message for the raised exception. When it is None
                or empty, ``_('Article download aborted')`` is used instead.
    :raises AbortArticle: always.
    '''
    if not msg:
        msg = _('Article download aborted')
    raise AbortArticle(msg)
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
'''
|
||||
This method is called with the source of each downloaded :term:`HTML` file, before
|
||||
@ -1572,13 +1577,19 @@ class BasicNewsRecipe(Recipe):
|
||||
|
||||
def error_in_article_download(self, request, traceback):
    '''
    Record the outcome of an article download that ended with an exception.

    The job counter is always incremented. If the traceback text names
    ``AbortArticle`` as the raised exception, the article is reported as
    aborted and is not added to ``self.failed_downloads``. Otherwise the
    failure is logged, reported and appended to ``self.failed_downloads``.

    :param request: Work request for the article; ``request.article`` and
                    ``request.feed`` are read.
    :param traceback: Formatted traceback text of the error. May be None
                      or empty, in which case the download is treated as
                      a plain failure.
    '''
    self.jobs_done += 1
    progress = float(self.jobs_done)/len(self.jobs)
    article = request.article

    # The exception class is only available as text here. Accept both the
    # bare class name and a module-qualified one (``pkg.mod.AbortArticle:``),
    # since Python 3 tracebacks print the qualified form.
    aborted = bool(traceback) and re.search(
        r'^(?:[\w.]+\.)?AbortArticle:', traceback, flags=re.M) is not None

    if aborted:
        self.log.warn('Aborted download of article:', article.title,
                      'from', article.url)
        self.report_progress(progress,
            _('Article download aborted: %s')%force_unicode(article.title))
        return

    self.log.error('Failed to download article:', article.title,
                   'from', article.url)
    self.log.debug(traceback)
    self.log.debug('\n')
    self.report_progress(progress,
        _('Article download failed: %s')%force_unicode(article.title))
    self.failed_downloads.append((request.feed, request.article, traceback))
|
||||
|
||||
def parse_feeds(self):
|
||||
'''
|
||||
|
@ -23,6 +23,9 @@ from calibre.utils.magick import Image
|
||||
from calibre.utils.magick.draw import identify_data, thumbnail
|
||||
from calibre.utils.imghdr import what
|
||||
|
||||
class AbortArticle(Exception):
    '''Raised to abort the download of the article being processed.'''
|
||||
|
||||
class FetchError(Exception):
    '''Fetch-related error; raise sites are outside this excerpt (presumably
    signals a failed fetch -- confirm against callers).'''
|
||||
|
||||
@ -567,7 +570,9 @@ class RecursiveFetcher(object):
|
||||
|
||||
save_soup(soup, res)
|
||||
self.localize_link(tag, 'href', res)
|
||||
except Exception:
|
||||
except Exception as err:
|
||||
if isinstance(err, AbortArticle):
|
||||
raise
|
||||
self.failed_links.append((iurl, traceback.format_exc()))
|
||||
self.log.exception('Could not fetch link', iurl)
|
||||
finally:
|
||||
|
Loading…
x
Reference in New Issue
Block a user