From 4f21f06f76d490230bb949fb7d87f24af2c52e73 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 25 Aug 2011 06:57:58 -0600 Subject: [PATCH] Make use of readability a little more robust --- recipes/hackernews.recipe | 6 +-- src/calibre/ebooks/readability/readability.py | 10 +++-- src/calibre/web/feeds/news.py | 44 ++++++++++++++----- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe index 846c302a6e..fa4b58864d 100644 --- a/recipes/hackernews.recipe +++ b/recipes/hackernews.recipe @@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe): html = f.read() f.close() - data = self.extract_readable_article(html, url) - article_html = data[0] - extracted_title = data[1] - article_html = u'' + extracted_title + u' (' + self.prettyify_url(url) + u')
' + article_html - return u'' + extracted_title + u'' + article_html + u'' + return self.extract_readable_article(html, url) def get_hn_content(self, url): self.log('get_hn_content(' + url + ')') diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index 956f6f87e4..7713584d14 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -2,14 +2,15 @@ import re, sys from collections import defaultdict from lxml.etree import tostring -from lxml.html import fragment_fromstring, document_fromstring +from lxml.html import (fragment_fromstring, document_fromstring, + tostring as htostring) from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes def tounicode(tree_or_node, **kwargs): kwargs['encoding'] = unicode - return tostring(tree_or_node, **kwargs) + return htostring(tree_or_node, **kwargs) REGEXES = { @@ -144,6 +145,7 @@ class Document: sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2]) output = document_fromstring('
') + parent = output.xpath('//div')[0] best_elem = best_candidate['elem'] for sibling in best_elem.getparent().getchildren(): #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text @@ -165,10 +167,10 @@ class Document: append = True if append: - output.append(sibling) + parent.append(sibling) #if output is not None: # output.append(best_elem) - return output + return output.find('body') def select_best_candidate(self, candidates): sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 06bde76c6a..1945425392 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe): raw_html = self.preprocess_raw_html(raw_html, url) if self.auto_cleanup: try: - data = self.extract_readable_article(raw_html, url) + raw_html = self.extract_readable_article(raw_html, url) except: self.log.exception('Auto cleanup of URL: %r failed'%url) - else: - article_html = data[0] - extracted_title = data[1] - article_html = re.sub(ur'', u'', article_html) - article_html = u'

%s

%s'%(extracted_title, article_html) - raw_html = ( - u'%s%s'% - (extracted_title, article_html)) + return raw_html def preprocess_html(self, soup): @@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe): Based on the original readability algorithm by Arc90. ''' from calibre.ebooks.readability import readability + from lxml.html import (fragment_fromstring, tostring, + document_fromstring) + doc = readability.Document(html, self.log, url=url) article_html = doc.summary() extracted_title = doc.title() - return (article_html, extracted_title) + + frag = fragment_fromstring(article_html) + if frag.tag == 'html': + root = frag + elif frag.tag == 'body': + root = document_fromstring( + u'%s' % + extracted_title) + root.append(frag) + else: + root = document_fromstring( + u'%s' % + extracted_title) + root.xpath('//body')[0].append(frag) + + body = root.xpath('//body')[0] + has_title = False + for x in body.iterdescendants(): + if x.text == extracted_title: + has_title = True + inline_titles = body.xpath('//h1|//h2') + if not has_title and not inline_titles: + heading = body.makeelement('h2') + heading.text = extracted_title + body.insert(0, heading) + + raw_html = tostring(root, encoding=unicode) + + return raw_html def sort_index_by(self, index, weights): '''