From 7cf81e7bff74d91e3e114f4b4ca4bb559c0f6542 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 12 Jun 2010 17:08:26 -0600
Subject: [PATCH] News download: Replace prepreprocess_html with skip_ad_pages

---
 src/calibre/manual/news_recipe.rst |  2 +-
 src/calibre/web/feeds/news.py      | 11 ++++++-----
 src/calibre/web/fetch/simple.py    |  6 ++++--
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/calibre/manual/news_recipe.rst b/src/calibre/manual/news_recipe.rst
index 14cc41d436..7e5045ea47 100644
--- a/src/calibre/manual/news_recipe.rst
+++ b/src/calibre/manual/news_recipe.rst
@@ -111,7 +111,7 @@ Pre/post processing of downloaded HTML
 
 .. automember:: BasicNewsRecipe.remove_javascript
 
-.. automethod:: BasicNewsRecipe.prepreprocess_html
+.. automethod:: BasicNewsRecipe.skip_ad_pages
 
 .. automethod:: BasicNewsRecipe.preprocess_html
 
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index f54d5bde9d..9e05babecc 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -413,18 +413,19 @@ class BasicNewsRecipe(Recipe):
                     return url
         return article.get('link',  None)
 
-    def prepreprocess_html(self, soup):
+    def skip_ad_pages(self, soup):
         '''
         This method is called with the source of each downloaded :term:`HTML` file, before
         any of the cleanup attributes like remove_tags, keep_only_tags are
         applied. Note that preprocess_regexps will have already been applied.
-        It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
-        It should return `soup` after processing it.
+        It is meant to allow the recipe to skip ad pages. If the soup represents
+        an ad page, return the HTML of the real page. Otherwise return
+        None.
 
         `soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
         instance containing the downloaded :term:`HTML`.
         '''
-        return soup
+        return None
 
 
     def preprocess_html(self, soup):
@@ -628,7 +629,7 @@ class BasicNewsRecipe(Recipe):
 
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'prepreprocess_html', 'preprocess_html', 'remove_tags_after',
+                      'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
                       'remove_tags_before', 'is_link_wanted'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = self._postprocess_html
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index bde91ec0d2..b6186f785d 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -136,7 +136,7 @@ class RecursiveFetcher(object):
         self.remove_tags_before  = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.prepreprocess_html_ext = getattr(options, 'prepreprocess_html', lambda soup: soup)
+        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self._is_link_wanted     = getattr(options, 'is_link_wanted',
                 default_is_link_wanted)
@@ -154,7 +154,9 @@ class RecursiveFetcher(object):
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
 
-        soup = self.prepreprocess_html_ext(soup)
+        replace = self.prepreprocess_html_ext(soup)
+        if replace is not None:  # ad page detected: parse the real page's HTML returned by the hook, not the original src
+            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
 
         if self.keep_only_tags:
             body = Tag(soup, 'body')