diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe
index 846c302a6e..fa4b58864d 100644
--- a/recipes/hackernews.recipe
+++ b/recipes/hackernews.recipe
@@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
html = f.read()
f.close()
- data = self.extract_readable_article(html, url)
- article_html = data[0]
- extracted_title = data[1]
- article_html = u'<h3>' + extracted_title + u' (' + self.prettyify_url(url) + u')</h3>' + article_html
- return u'<html><title>' + extracted_title + u'</title><body>' + article_html + u'</body></html>'
+ return self.extract_readable_article(html, url)
def get_hn_content(self, url):
self.log('get_hn_content(' + url + ')')
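Note: the recipe can drop its hand-rolled wrapping because extract_readable_article() (reworked in the news.py hunk below) now returns a complete HTML document instead of an (article_html, extracted_title) tuple. A minimal sketch of the before/after contract, reusing the names from the removed lines:

    # Pre-patch contract (sketch): the caller unpacks a tuple and builds the page itself.
    article_html, extracted_title = self.extract_readable_article(html, url)
    page = u'<html><title>' + extracted_title + u'</title><body>' + article_html + u'</body></html>'

    # Post-patch contract (sketch): the helper already returns a full document, used as-is.
    page = self.extract_readable_article(html, url)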
diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py
index 956f6f87e4..7713584d14 100644
--- a/src/calibre/ebooks/readability/readability.py
+++ b/src/calibre/ebooks/readability/readability.py
@@ -2,14 +2,15 @@ import re, sys
from collections import defaultdict
from lxml.etree import tostring
-from lxml.html import fragment_fromstring, document_fromstring
+from lxml.html import (fragment_fromstring, document_fromstring,
+ tostring as htostring)
from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
def tounicode(tree_or_node, **kwargs):
kwargs['encoding'] = unicode
- return tostring(tree_or_node, **kwargs)
+ return htostring(tree_or_node, **kwargs)
REGEXES = {
@@ -144,6 +145,7 @@ class Document:
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
+ parent = output.xpath('//div')[0]
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -165,10 +167,10 @@ class Document:
append = True
if append:
- output.append(sibling)
+ parent.append(sibling)
#if output is not None:
# output.append(best_elem)
- return output
+ return output.find('body')
def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
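Note: the get_article() change leans on how lxml.html normalises markup: document_fromstring('<div/>') yields a full <html><body><div/></body></html> tree, so sibling nodes are collected inside the inner div while callers now receive a real <body> element. A small standalone illustration, not part of the patch:

    from lxml.html import document_fromstring, tostring

    output = document_fromstring('<div/>')   # lxml supplies the <html><body> wrapper
    parent = output.xpath('//div')[0]        # the div that collects candidate siblings
    parent.text = 'candidate content'
    body = output.find('body')               # what get_article() now returns
    print(tostring(body))                    # roughly: <body><div>candidate content</div></body>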
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 06bde76c6a..1945425392 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
raw_html = self.preprocess_raw_html(raw_html, url)
if self.auto_cleanup:
try:
- data = self.extract_readable_article(raw_html, url)
+ raw_html = self.extract_readable_article(raw_html, url)
except:
self.log.exception('Auto cleanup of URL: %r failed'%url)
- else:
- article_html = data[0]
- extracted_title = data[1]
- article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
- article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
- raw_html = (
- u'<html><head><title>%s</title></head><body>%s</body></html>'%
- (extracted_title, article_html))
+
return raw_html
def preprocess_html(self, soup):
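Note: with the tuple handling gone, the auto-cleanup branch above reduces to a single assignment; the resulting flow (same names as the hunk, bare except replaced for readability) is roughly:

    if self.auto_cleanup:
        try:
            # extract_readable_article() now hands back a complete HTML document
            raw_html = self.extract_readable_article(raw_html, url)
        except Exception:
            self.log.exception('Auto cleanup of URL: %r failed' % url)
    return raw_html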
@@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
Based on the original readability algorithm by Arc90.
'''
from calibre.ebooks.readability import readability
+ from lxml.html import (fragment_fromstring, tostring,
+ document_fromstring)
+
doc = readability.Document(html, self.log, url=url)
article_html = doc.summary()
extracted_title = doc.title()
- return (article_html, extracted_title)
+
+ frag = fragment_fromstring(article_html)
+ if frag.tag == 'html':
+ root = frag
+ elif frag.tag == 'body':
+ root = document_fromstring(
+ u'<html><head><title>%s</title></head></html>' %
+ extracted_title)
+ root.append(frag)
+ else:
+ root = document_fromstring(
+ u'<html><head><title>%s</title></head><body/></html>' %