From 72ac7359286c540f90003ad1d3ac3966b41c697b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 24 Aug 2011 20:23:26 -0600
Subject: [PATCH] News download: Use the algorithms from Readability to
 automatically clean up downloaded HTML. You can turn this on in your own
 recipes by adding auto_cleanup=True to the recipe. It is turned on by default
 for basic recipes created via the GUI.

---
 src/calibre/gui2/dialogs/user_profiles.py |  1 +
 src/calibre/web/feeds/news.py             | 47 +++++++++++++++++++----
 src/calibre/web/fetch/simple.py           | 10 +++--
 3 files changed, 47 insertions(+), 11 deletions(-)
diff --git a/src/calibre/gui2/dialogs/user_profiles.py b/src/calibre/gui2/dialogs/user_profiles.py
index 92d20a6f03..b81b5271bc 100644
--- a/src/calibre/gui2/dialogs/user_profiles.py
+++ b/src/calibre/gui2/dialogs/user_profiles.py
@@ -219,6 +219,7 @@ class %(classname)s(%(base_class)s):
     title          = %(title)s
     oldest_article = %(oldest_article)d
     max_articles_per_feed = %(max_articles)d
+    auto_cleanup = True
 
     feeds          = %(feeds)s
 '''%dict(classname=classname, title=repr(title),
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index d7d9b0643a..06bde76c6a 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -138,6 +138,12 @@ class BasicNewsRecipe(Recipe):
     #: Reverse the order of articles in each feed
     reverse_article_order = False
 
+    #: Automatically extract all the text from downloaded article pages. Uses
+    #: the algorithms from the readability project. Setting this to True means
+    #: that you do not have to worry about cleaning up the downloaded HTML
+    #: manually (though manual cleanup will always be superior).
+    auto_cleanup = False
+
     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
     #: It will be inserted into `<style>` tags, just before the closing
     #: `</head>` tag thereby overriding all :term:`CSS` except that which is
@@ -452,6 +458,35 @@ class BasicNewsRecipe(Recipe):
         '''
         return None
 
+    def preprocess_raw_html(self, raw_html, url):
+        '''
+        This method is called with the source of each downloaded :term:`HTML` file, before
+        it is parsed into an object tree. raw_html is a unicode string
+        representing the raw HTML downloaded from the web. url is the URL from
+        which the HTML was downloaded.
+
+        Note that this method acts *before* preprocess_regexps.
+
+        This method must return the processed raw_html as a unicode object.
+        '''
+        return raw_html
+
+    def preprocess_raw_html_(self, raw_html, url):
+        raw_html = self.preprocess_raw_html(raw_html, url)
+        if self.auto_cleanup:
+            try:
+                data = self.extract_readable_article(raw_html, url)
+            except:
+                self.log.exception('Auto cleanup of URL: %r failed'%url)
+            else:
+                article_html = data[0]
+                extracted_title = data[1]
+                article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
+                article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
+                raw_html = (
+                    u'<html><head><title>%s</title></head><body>%s</body></html>'%
+                    (extracted_title, article_html))
+        return raw_html
 
     def preprocess_html(self, soup):
         '''
@@ -515,13 +550,13 @@ class BasicNewsRecipe(Recipe):
             entity_to_unicode(match, encoding=enc)))
         return BeautifulSoup(_raw, markupMassage=massage)
 
-    def extract_readable_article(self, html, base_url):
+    def extract_readable_article(self, html, url):
         '''
         Extracts main article content from 'html', cleans up and returns as a (article_html, extracted_title) tuple.
         Based on the original readability algorithm by Arc90.
         '''
         from calibre.ebooks.readability import readability
-        doc = readability.Document(html, self.log, url=base_url)
+        doc = readability.Document(html, self.log, url=url)
         article_html = doc.summary()
         extracted_title = doc.title()
         return (article_html, extracted_title)
@@ -671,6 +706,7 @@ class BasicNewsRecipe(Recipe):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = self._postprocess_html
         self.web2disk_options.encoding = self.encoding
+        self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_
 
         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -1417,12 +1453,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
 
 class AutomaticNewsRecipe(BasicNewsRecipe):
 
-    keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
-
-    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
-        if self.use_embedded_content:
-            self.web2disk_options.keep_only_tags = []
-        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
+    auto_cleanup = True
 
 class CalibrePeriodical(BasicNewsRecipe):
 
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 35069a428b..5730d84aa6 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -130,6 +130,8 @@ class RecursiveFetcher(object):
         self.remove_tags_before  = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
+        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
+                lambda raw, url: raw)
         self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self._is_link_wanted     = getattr(options, 'is_link_wanted',
@@ -139,14 +141,16 @@ class RecursiveFetcher(object):
         self.failed_links = []
         self.job_info = job_info
 
-    def get_soup(self, src):
+    def get_soup(self, src, url=None):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
         # Remove comments as they can leave detritus when extracting tags leaves
         # multiple nested comments
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
-        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
+        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
+        usrc = self.preprocess_raw_html(usrc, url)
+        soup = BeautifulSoup(usrc, markupMassage=nmassage)
 
         replace = self.prepreprocess_html_ext(soup)
         if replace is not None:
@@ -425,7 +429,7 @@ class RecursiveFetcher(object):
                     else:
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]
 
-                    soup = self.get_soup(dsrc)
+                    soup = self.get_soup(dsrc, url=iurl)
 
                     base = soup.find('base', href=True)
                     if base is not None: