News download: Use the algorithms from the Readability project to automatically clean up downloaded HTML. You can turn this on in your own recipes by adding auto_cleanup=True to the recipe. It is turned on by default for basic recipes created via the GUI.
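For illustration only (this example is not part of the commit; the title and feed URL are placeholders), a hand-written recipe that opts in would look roughly like this:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleCleanupRecipe(BasicNewsRecipe):
    title                 = 'Example News'
    oldest_article        = 7    # days
    max_articles_per_feed = 100
    # Let the readability-based cleanup extract the article text instead of
    # specifying keep_only_tags/remove_tags by hand
    auto_cleanup          = True

    feeds = [('Front page', 'http://example.com/rss.xml')]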

This commit is contained in:
Kovid Goyal 2011-08-24 20:23:26 -06:00
parent 985d382f1a
commit 72ac735928
3 changed files with 47 additions and 11 deletions

View File

@@ -219,6 +219,7 @@ class %(classname)s(%(base_class)s):
title = %(title)s
oldest_article = %(oldest_article)d
max_articles_per_feed = %(max_articles)d
auto_cleanup = True
feeds = %(feeds)s
'''%dict(classname=classname, title=repr(title),

View File

@@ -138,6 +138,12 @@ class BasicNewsRecipe(Recipe):
#: Reverse the order of articles in each feed
reverse_article_order = False
#: Automatically extract all the text from downloaded article pages. Uses
#: the algorithms from the readability project. Setting this to True means
#: that you do not have to worry about cleaning up the downloaded HTML
#: manually (though manual cleanup will always be superior).
auto_cleanup = False
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files.
#: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
@@ -452,6 +458,35 @@ class BasicNewsRecipe(Recipe):
'''
return None
def preprocess_raw_html(self, raw_html, url):
'''
This method is called with the source of each downloaded :term:`HTML` file, before
it is parsed into an object tree. raw_html is a unicode string
representing the raw HTML downloaded from the web. url is the URL from
which the HTML was downloaded.
Note that this method acts *before* preprocess_regexps.
This method must return the processed raw_html as a unicode object.
'''
return raw_html
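# Illustration, not part of this commit: a recipe might override the hook
# defined above to fix up markup before it is parsed. A minimal sketch (the
# recipe name and the regex are only examples):
import re
from calibre.web.feeds.news import BasicNewsRecipe

class ScriptStrippingRecipe(BasicNewsRecipe):
    title = 'Example with raw HTML preprocessing'

    def preprocess_raw_html(self, raw_html, url):
        # Drop inline <script> blocks before the HTML is turned into a soup
        return re.sub(r'(?is)<script.*?</script>', u'', raw_html)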
def preprocess_raw_html_(self, raw_html, url):
raw_html = self.preprocess_raw_html(raw_html, url)
if self.auto_cleanup:
try:
data = self.extract_readable_article(raw_html, url)
except:
self.log.exception('Auto cleanup of URL: %r failed'%url)
else:
article_html = data[0]
extracted_title = data[1]
article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
raw_html = (
u'<html><head><title>%s</title></head><body>%s</body></html>'%
(extracted_title, article_html))
return raw_html
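# For illustration (not part of the diff): when auto_cleanup is True, a page
# whose readability summary is '<div><p>Body text</p></div>' with extracted
# title 'Example story' is rewritten by the code above to:
#   <html><head><title>Example story</title></head>
#   <body><h1>Example story</h1><div><p>Body text</p></div></body></html>
# so downstream processing sees a small, self-contained document.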
def preprocess_html(self, soup):
'''
@@ -515,13 +550,13 @@ class BasicNewsRecipe(Recipe):
entity_to_unicode(match, encoding=enc)))
return BeautifulSoup(_raw, markupMassage=massage)
def extract_readable_article(self, html, base_url):
def extract_readable_article(self, html, url):
'''
Extracts the main article content from 'html', cleans it up and returns it as an (article_html, extracted_title) tuple.
Based on the original readability algorithm by Arc90.
'''
from calibre.ebooks.readability import readability
doc = readability.Document(html, self.log, url=base_url)
doc = readability.Document(html, self.log, url=url)
article_html = doc.summary()
extracted_title = doc.title()
return (article_html, extracted_title)
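# A hedged sketch (not part of this commit): a recipe could call the helper
# above itself to run readability extraction on only some pages, instead of
# enabling auto_cleanup for every article. The URL test is a placeholder.
from calibre.web.feeds.news import BasicNewsRecipe

class SelectiveCleanupRecipe(BasicNewsRecipe):
    title = 'Selective cleanup example'

    def preprocess_raw_html(self, raw_html, url):
        if '/story/' not in url:
            return raw_html
        article_html, extracted_title = self.extract_readable_article(raw_html, url)
        return u'<html><head><title>%s</title></head><body><h1>%s</h1>%s</body></html>' % (
                extracted_title, extracted_title, article_html)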
@@ -671,6 +706,7 @@ class BasicNewsRecipe(Recipe):
setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = self._postprocess_html
self.web2disk_options.encoding = self.encoding
self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_
if self.delay > 0:
self.simultaneous_downloads = 1
@@ -1417,12 +1453,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
class AutomaticNewsRecipe(BasicNewsRecipe):
keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
if self.use_embedded_content:
self.web2disk_options.keep_only_tags = []
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
auto_cleanup = True
class CalibrePeriodical(BasicNewsRecipe):

View File

@@ -130,6 +130,8 @@ class RecursiveFetcher(object):
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
lambda raw, url: raw)
self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
self._is_link_wanted = getattr(options, 'is_link_wanted',
@@ -139,14 +141,16 @@ class RecursiveFetcher(object):
self.failed_links = []
self.job_info = job_info
def get_soup(self, src):
def get_soup(self, src, url=None):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
# Remove comments, as they can leave behind detritus when tag extraction
# results in multiple nested comments
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
usrc = self.preprocess_raw_html(usrc, url)
soup = BeautifulSoup(usrc, markupMassage=nmassage)
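# Note (added for clarity, not in the diff): because preprocess_raw_html runs
# on the decoded unicode source before BeautifulSoup parses it, it executes
# before the recipe's preprocess_regexps (applied via the markup massage
# above) and before skip_ad_pages/preprocess_html, matching the docstring of
# BasicNewsRecipe.preprocess_raw_html.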
replace = self.prepreprocess_html_ext(soup)
if replace is not None:
@@ -425,7 +429,7 @@ class RecursiveFetcher(object):
else:
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
soup = self.get_soup(dsrc)
soup = self.get_soup(dsrc, url=iurl)
base = soup.find('base', href=True)
if base is not None: