From 7cf81e7bff74d91e3e114f4b4ca4bb559c0f6542 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 12 Jun 2010 17:08:26 -0600 Subject: [PATCH] News download: Fix prepreprocess_html method --- src/calibre/manual/news_recipe.rst | 2 +- src/calibre/web/feeds/news.py | 11 ++++++----- src/calibre/web/fetch/simple.py | 6 ++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/calibre/manual/news_recipe.rst b/src/calibre/manual/news_recipe.rst index 14cc41d436..7e5045ea47 100644 --- a/src/calibre/manual/news_recipe.rst +++ b/src/calibre/manual/news_recipe.rst @@ -111,7 +111,7 @@ Pre/post processing of downloaded HTML .. automember:: BasicNewsRecipe.remove_javascript -.. automethod:: BasicNewsRecipe.prepreprocess_html +.. automethod:: BasicNewsRecipe.skip_ad_pages .. automethod:: BasicNewsRecipe.preprocess_html diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index f54d5bde9d..9e05babecc 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -413,18 +413,19 @@ class BasicNewsRecipe(Recipe): return url return article.get('link', None) - def prepreprocess_html(self, soup): + def skip_ad_pages(self, soup): ''' This method is called with the source of each downloaded :term:`HTML` file, before any of the cleanup attributes like remove_tags, keep_only_tags are applied. Note that preprocess_regexps will have already been applied. - It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`. - It should return `soup` after processing it. + It is meant to allow the recipe to skip ad pages. If the soup represents + an ad page, return the HTML of the real page. Otherwise return + None. `soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ instance containing the downloaded :term:`HTML`. 
''' - return soup + return None def preprocess_html(self, soup): @@ -628,7 +629,7 @@ class BasicNewsRecipe(Recipe): self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0] for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', - 'prepreprocess_html', 'preprocess_html', 'remove_tags_after', + 'skip_ad_pages', 'preprocess_html', 'remove_tags_after', 'remove_tags_before', 'is_link_wanted'): setattr(self.web2disk_options, extra, getattr(self, extra)) self.web2disk_options.postprocess_html = self._postprocess_html diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index bde91ec0d2..b6186f785d 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -136,7 +136,7 @@ class RecursiveFetcher(object): self.remove_tags_before = getattr(options, 'remove_tags_before', None) self.keep_only_tags = getattr(options, 'keep_only_tags', []) self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) - self.prepreprocess_html_ext = getattr(options, 'prepreprocess_html', lambda soup: soup) + self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None) self.postprocess_html_ext= getattr(options, 'postprocess_html', None) self._is_link_wanted = getattr(options, 'is_link_wanted', default_is_link_wanted) @@ -154,7 +154,9 @@ class RecursiveFetcher(object): nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')) soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) - soup = self.prepreprocess_html_ext(soup) + replace = self.prepreprocess_html_ext(soup) + if replace is not None: + soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) if self.keep_only_tags: body = Tag(soup, 'body')