Make the use of readability a little more robust

This commit is contained in:
Kovid Goyal 2011-08-25 06:57:58 -06:00
parent 67b7615a8e
commit 4f21f06f76
3 changed files with 41 additions and 19 deletions

View File

@@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
html = f.read() html = f.read()
f.close() f.close()
data = self.extract_readable_article(html, url) return self.extract_readable_article(html, url)
article_html = data[0]
extracted_title = data[1]
article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
def get_hn_content(self, url): def get_hn_content(self, url):
self.log('get_hn_content(' + url + ')') self.log('get_hn_content(' + url + ')')

View File

@@ -2,14 +2,15 @@ import re, sys
from collections import defaultdict from collections import defaultdict
from lxml.etree import tostring from lxml.etree import tostring
from lxml.html import fragment_fromstring, document_fromstring from lxml.html import (fragment_fromstring, document_fromstring,
tostring as htostring)
from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
def tounicode(tree_or_node, **kwargs): def tounicode(tree_or_node, **kwargs):
kwargs['encoding'] = unicode kwargs['encoding'] = unicode
return tostring(tree_or_node, **kwargs) return htostring(tree_or_node, **kwargs)
REGEXES = { REGEXES = {
@@ -144,6 +145,7 @@ class Document:
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2]) sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>') output = document_fromstring('<div/>')
parent = output.xpath('//div')[0]
best_elem = best_candidate['elem'] best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren(): for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -165,10 +167,10 @@ class Document:
append = True append = True
if append: if append:
output.append(sibling) parent.append(sibling)
#if output is not None: #if output is not None:
# output.append(best_elem) # output.append(best_elem)
return output return output.find('body')
def select_best_candidate(self, candidates): def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)

View File

@@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
raw_html = self.preprocess_raw_html(raw_html, url) raw_html = self.preprocess_raw_html(raw_html, url)
if self.auto_cleanup: if self.auto_cleanup:
try: try:
data = self.extract_readable_article(raw_html, url) raw_html = self.extract_readable_article(raw_html, url)
except: except:
self.log.exception('Auto cleanup of URL: %r failed'%url) self.log.exception('Auto cleanup of URL: %r failed'%url)
else:
article_html = data[0]
extracted_title = data[1]
article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
raw_html = (
u'<html><head><title>%s</title></head><body>%s</body></html>'%
(extracted_title, article_html))
return raw_html return raw_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
@@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
Based on the original readability algorithm by Arc90. Based on the original readability algorithm by Arc90.
''' '''
from calibre.ebooks.readability import readability from calibre.ebooks.readability import readability
from lxml.html import (fragment_fromstring, tostring,
document_fromstring)
doc = readability.Document(html, self.log, url=url) doc = readability.Document(html, self.log, url=url)
article_html = doc.summary() article_html = doc.summary()
extracted_title = doc.title() extracted_title = doc.title()
return (article_html, extracted_title)
frag = fragment_fromstring(article_html)
if frag.tag == 'html':
root = frag
elif frag.tag == 'body':
root = document_fromstring(
u'<html><head><title>%s</title></head></html>' %
extracted_title)
root.append(frag)
else:
root = document_fromstring(
u'<html><head><title>%s</title></head><body/></html>' %
extracted_title)
root.xpath('//body')[0].append(frag)
body = root.xpath('//body')[0]
has_title = False
for x in body.iterdescendants():
if x.text == extracted_title:
has_title = True
inline_titles = body.xpath('//h1|//h2')
if not has_title and not inline_titles:
heading = body.makeelement('h2')
heading.text = extracted_title
body.insert(0, heading)
raw_html = tostring(root, encoding=unicode)
return raw_html
def sort_index_by(self, index, weights): def sort_index_by(self, index, weights):
''' '''