Make use of readability a little more robust

2025-08-30 23:00:21 -04:00 · 2011-08-25 06:57:58 -06:00 · 2011-08-25 06:57:58 -06:00 · 4f21f06f76
commit 4f21f06f76
parent 67b7615a8e
3 changed files with 41 additions and 19 deletions
--- a/recipes/hackernews.recipe
+++ b/recipes/hackernews.recipe
@@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
        html = f.read()
        f.close()

-        data = self.extract_readable_article(html, url)
-        article_html = data[0]
-        extracted_title = data[1]
-        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
-        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
+        return self.extract_readable_article(html, url)

    def get_hn_content(self, url):
        self.log('get_hn_content(' + url + ')')
--- a/src/calibre/ebooks/readability/readability.py
+++ b/src/calibre/ebooks/readability/readability.py
@@ -2,14 +2,15 @@ import re, sys
 from collections import defaultdict

 from lxml.etree import tostring
-from lxml.html import fragment_fromstring, document_fromstring
+from lxml.html import (fragment_fromstring, document_fromstring,
+        tostring as htostring)

 from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
 from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes

 def tounicode(tree_or_node, **kwargs):
    kwargs['encoding'] = unicode
-    return tostring(tree_or_node, **kwargs)
+    return htostring(tree_or_node, **kwargs)


 REGEXES = {
@@ -144,6 +145,7 @@ class Document:

        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
        output = document_fromstring('<div/>')
+        parent = output.xpath('//div')[0]
        best_elem = best_candidate['elem']
        for sibling in best_elem.getparent().getchildren():
            #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -165,10 +167,10 @@ class Document:
                    append = True

            if append:
-                output.append(sibling)
+                parent.append(sibling)
        #if output is not None:
        #   output.append(best_elem)
-        return output
+        return output.find('body')

    def select_best_candidate(self, candidates):
        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
        raw_html = self.preprocess_raw_html(raw_html, url)
        if self.auto_cleanup:
            try:
-                data = self.extract_readable_article(raw_html, url)
+                raw_html = self.extract_readable_article(raw_html, url)
            except:
                self.log.exception('Auto cleanup of URL: %r failed'%url)
-            else:
-                article_html = data[0]
-                extracted_title = data[1]
-                article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
-                article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
-                raw_html = (
-                    u'<html><head><title>%s</title></head><body>%s</body></html>'%
-                    (extracted_title, article_html))
+
        return raw_html

    def preprocess_html(self, soup):
@@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
        Based on the original readability algorithm by Arc90.
        '''
        from calibre.ebooks.readability import readability
+        from lxml.html import (fragment_fromstring, tostring,
+                document_fromstring)
+
        doc = readability.Document(html, self.log, url=url)
        article_html = doc.summary()
        extracted_title = doc.title()
-        return (article_html, extracted_title)
+
+        frag = fragment_fromstring(article_html)
+        if frag.tag == 'html':
+            root = frag
+        elif frag.tag == 'body':
+            root = document_fromstring(
+                u'<html><head><title>%s</title></head></html>' %
+                extracted_title)
+            root.append(frag)
+        else:
+            root = document_fromstring(
+                u'<html><head><title>%s</title></head><body/></html>' %
+                extracted_title)
+            root.xpath('//body')[0].append(frag)
+
+        body = root.xpath('//body')[0]
+        has_title = False
+        for x in body.iterdescendants():
+            if x.text == extracted_title:
+                has_title = True
+        inline_titles = body.xpath('//h1|//h2')
+        if not has_title and not inline_titles:
+            heading = body.makeelement('h2')
+            heading.text = extracted_title
+            body.insert(0, heading)
+
+        raw_html = tostring(root, encoding=unicode)
+
+        return raw_html

    def sort_index_by(self, index, weights):
        '''