Make the use of readability a little more robust

This commit is contained in:
Kovid Goyal 2011-08-25 06:57:58 -06:00
parent 67b7615a8e
commit 4f21f06f76
3 changed files with 41 additions and 19 deletions

View File

@@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
html = f.read() html = f.read()
f.close() f.close()
data = self.extract_readable_article(html, url) return self.extract_readable_article(html, url)
article_html = data[0]
extracted_title = data[1]
article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
def get_hn_content(self, url): def get_hn_content(self, url):
self.log('get_hn_content(' + url + ')') self.log('get_hn_content(' + url + ')')

View File

@@ -2,14 +2,15 @@ import re, sys
from collections import defaultdict from collections import defaultdict
from lxml.etree import tostring from lxml.etree import tostring
from lxml.html import fragment_fromstring, document_fromstring from lxml.html import (fragment_fromstring, document_fromstring,
tostring as htostring)
from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
def tounicode(tree_or_node, **kwargs): def tounicode(tree_or_node, **kwargs):
kwargs['encoding'] = unicode kwargs['encoding'] = unicode
return tostring(tree_or_node, **kwargs) return htostring(tree_or_node, **kwargs)
REGEXES = { REGEXES = {
@@ -144,6 +145,7 @@ class Document:
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2]) sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>') output = document_fromstring('<div/>')
parent = output.xpath('//div')[0]
best_elem = best_candidate['elem'] best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren(): for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -165,10 +167,10 @@ class Document:
append = True append = True
if append: if append:
output.append(sibling) parent.append(sibling)
#if output is not None: #if output is not None:
# output.append(best_elem) # output.append(best_elem)
return output return output.find('body')
def select_best_candidate(self, candidates): def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)

View File

@@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
raw_html = self.preprocess_raw_html(raw_html, url) raw_html = self.preprocess_raw_html(raw_html, url)
if self.auto_cleanup: if self.auto_cleanup:
try: try:
data = self.extract_readable_article(raw_html, url) raw_html = self.extract_readable_article(raw_html, url)
except: except:
self.log.exception('Auto cleanup of URL: %r failed'%url) self.log.exception('Auto cleanup of URL: %r failed'%url)
else:
article_html = data[0]
extracted_title = data[1]
article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
raw_html = (
u'<html><head><title>%s</title></head><body>%s</body></html>'%
(extracted_title, article_html))
return raw_html return raw_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
@@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
Based on the original readability algorithm by Arc90. Based on the original readability algorithm by Arc90.
''' '''
from calibre.ebooks.readability import readability from calibre.ebooks.readability import readability
from lxml.html import (fragment_fromstring, tostring,
document_fromstring)
doc = readability.Document(html, self.log, url=url) doc = readability.Document(html, self.log, url=url)
article_html = doc.summary() article_html = doc.summary()
extracted_title = doc.title() extracted_title = doc.title()
return (article_html, extracted_title)
frag = fragment_fromstring(article_html)
if frag.tag == 'html':
root = frag
elif frag.tag == 'body':
root = document_fromstring(
u'<html><head><title>%s</title></head></html>' %
extracted_title)
root.append(frag)
else:
root = document_fromstring(
u'<html><head><title>%s</title></head><body/></html>' %
extracted_title)
root.xpath('//body')[0].append(frag)
body = root.xpath('//body')[0]
has_title = False
for x in body.iterdescendants():
if x.text == extracted_title:
has_title = True
inline_titles = body.xpath('//h1|//h2')
if not has_title and not inline_titles:
heading = body.makeelement('h2')
heading.text = extracted_title
body.insert(0, heading)
raw_html = tostring(root, encoding=unicode)
return raw_html
def sort_index_by(self, index, weights): def sort_index_by(self, index, weights):
''' '''