From 4f21f06f76d490230bb949fb7d87f24af2c52e73 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 25 Aug 2011 06:57:58 -0600
Subject: [PATCH] Make use of readability a little more robust

---
 recipes/hackernews.recipe                     |  6 +--
 src/calibre/ebooks/readability/readability.py | 10 +++--
 src/calibre/web/feeds/news.py                 | 44 ++++++++++++++-----
 3 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe
index 846c302a6e..fa4b58864d 100644
--- a/recipes/hackernews.recipe
+++ b/recipes/hackernews.recipe
@@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
         html = f.read()
         f.close()
 
-        data = self.extract_readable_article(html, url)
-        article_html = data[0]
-        extracted_title = data[1]
-        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
-        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
+        return self.extract_readable_article(html, url)
 
     def get_hn_content(self, url):
         self.log('get_hn_content(' + url + ')')
diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py
index 956f6f87e4..7713584d14 100644
--- a/src/calibre/ebooks/readability/readability.py
+++ b/src/calibre/ebooks/readability/readability.py
@@ -2,14 +2,15 @@ import re, sys
 from collections import defaultdict
 
 from lxml.etree import tostring
-from lxml.html import fragment_fromstring, document_fromstring
+from lxml.html import (fragment_fromstring, document_fromstring,
+        tostring as htostring)
 
 from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
 from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
 
 def tounicode(tree_or_node, **kwargs):
     kwargs['encoding'] = unicode
-    return tostring(tree_or_node, **kwargs)
+    return htostring(tree_or_node, **kwargs)
 
 
 REGEXES = {
@@ -144,6 +145,7 @@ class Document:
 
         sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
         output = document_fromstring('<div/>')
+        parent = output.xpath('//div')[0]
         best_elem = best_candidate['elem']
         for sibling in best_elem.getparent().getchildren():
             #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -165,10 +167,10 @@ class Document:
                     append = True
 
             if append:
-                output.append(sibling)
+                parent.append(sibling)
         #if output is not None:
         #   output.append(best_elem)
-        return output
+        return output.find('body')
 
     def select_best_candidate(self, candidates):
         sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 06bde76c6a..1945425392 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
         raw_html = self.preprocess_raw_html(raw_html, url)
         if self.auto_cleanup:
             try:
-                data = self.extract_readable_article(raw_html, url)
+                raw_html = self.extract_readable_article(raw_html, url)
             except:
                 self.log.exception('Auto cleanup of URL: %r failed'%url)
-            else:
-                article_html = data[0]
-                extracted_title = data[1]
-                article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
-                article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
-                raw_html = (
-                    u'<html><head><title>%s</title></head><body>%s</body></html>'%
-                    (extracted_title, article_html))
+
         return raw_html
 
     def preprocess_html(self, soup):
@@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
         Based on the original readability algorithm by Arc90.
         '''
         from calibre.ebooks.readability import readability
+        from lxml.html import (fragment_fromstring, tostring,
+                document_fromstring)
+
         doc = readability.Document(html, self.log, url=url)
         article_html = doc.summary()
         extracted_title = doc.title()
-        return (article_html, extracted_title)
+
+        frag = fragment_fromstring(article_html)
+        if frag.tag == 'html':
+            root = frag
+        elif frag.tag == 'body':
+            root = document_fromstring(
+                u'<html><head><title>%s</title></head></html>' %
+                extracted_title)
+            root.append(frag)
+        else:
+            root = document_fromstring(
+                u'<html><head><title>%s</title></head><body/></html>' %
+                extracted_title)
+            root.xpath('//body')[0].append(frag)
+
+        body = root.xpath('//body')[0]
+        has_title = False
+        for x in body.iterdescendants():
+            if x.text == extracted_title:
+                has_title = True
+        inline_titles = body.xpath('//h1|//h2')
+        if not has_title and not inline_titles:
+            heading = body.makeelement('h2')
+            heading.text = extracted_title
+            body.insert(0, heading)
+
+        raw_html = tostring(root, encoding=unicode)
+
+        return raw_html
 
     def sort_index_by(self, index, weights):
         '''