mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make use of readability a little more robust
This commit is contained in:
parent
67b7615a8e
commit
4f21f06f76
@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
|
||||
html = f.read()
|
||||
f.close()
|
||||
|
||||
data = self.extract_readable_article(html, url)
|
||||
article_html = data[0]
|
||||
extracted_title = data[1]
|
||||
article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
|
||||
return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
|
||||
return self.extract_readable_article(html, url)
|
||||
|
||||
def get_hn_content(self, url):
|
||||
self.log('get_hn_content(' + url + ')')
|
||||
|
@ -2,14 +2,15 @@ import re, sys
|
||||
from collections import defaultdict
|
||||
|
||||
from lxml.etree import tostring
|
||||
from lxml.html import fragment_fromstring, document_fromstring
|
||||
from lxml.html import (fragment_fromstring, document_fromstring,
|
||||
tostring as htostring)
|
||||
|
||||
from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
|
||||
from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
|
||||
|
||||
def tounicode(tree_or_node, **kwargs):
|
||||
kwargs['encoding'] = unicode
|
||||
return tostring(tree_or_node, **kwargs)
|
||||
return htostring(tree_or_node, **kwargs)
|
||||
|
||||
|
||||
REGEXES = {
|
||||
@ -144,6 +145,7 @@ class Document:
|
||||
|
||||
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
|
||||
output = document_fromstring('<div/>')
|
||||
parent = output.xpath('//div')[0]
|
||||
best_elem = best_candidate['elem']
|
||||
for sibling in best_elem.getparent().getchildren():
|
||||
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
|
||||
@ -165,10 +167,10 @@ class Document:
|
||||
append = True
|
||||
|
||||
if append:
|
||||
output.append(sibling)
|
||||
parent.append(sibling)
|
||||
#if output is not None:
|
||||
# output.append(best_elem)
|
||||
return output
|
||||
return output.find('body')
|
||||
|
||||
def select_best_candidate(self, candidates):
|
||||
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
|
||||
|
@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
|
||||
raw_html = self.preprocess_raw_html(raw_html, url)
|
||||
if self.auto_cleanup:
|
||||
try:
|
||||
data = self.extract_readable_article(raw_html, url)
|
||||
raw_html = self.extract_readable_article(raw_html, url)
|
||||
except:
|
||||
self.log.exception('Auto cleanup of URL: %r failed'%url)
|
||||
else:
|
||||
article_html = data[0]
|
||||
extracted_title = data[1]
|
||||
article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
|
||||
article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
|
||||
raw_html = (
|
||||
u'<html><head><title>%s</title></head><body>%s</body></html>'%
|
||||
(extracted_title, article_html))
|
||||
|
||||
return raw_html
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
|
||||
Based on the original readability algorithm by Arc90.
|
||||
'''
|
||||
from calibre.ebooks.readability import readability
|
||||
from lxml.html import (fragment_fromstring, tostring,
|
||||
document_fromstring)
|
||||
|
||||
doc = readability.Document(html, self.log, url=url)
|
||||
article_html = doc.summary()
|
||||
extracted_title = doc.title()
|
||||
return (article_html, extracted_title)
|
||||
|
||||
frag = fragment_fromstring(article_html)
|
||||
if frag.tag == 'html':
|
||||
root = frag
|
||||
elif frag.tag == 'body':
|
||||
root = document_fromstring(
|
||||
u'<html><head><title>%s</title></head></html>' %
|
||||
extracted_title)
|
||||
root.append(frag)
|
||||
else:
|
||||
root = document_fromstring(
|
||||
u'<html><head><title>%s</title></head><body/></html>' %
|
||||
extracted_title)
|
||||
root.xpath('//body')[0].append(frag)
|
||||
|
||||
body = root.xpath('//body')[0]
|
||||
has_title = False
|
||||
for x in body.iterdescendants():
|
||||
if x.text == extracted_title:
|
||||
has_title = True
|
||||
inline_titles = body.xpath('//h1|//h2')
|
||||
if not has_title and not inline_titles:
|
||||
heading = body.makeelement('h2')
|
||||
heading.text = extracted_title
|
||||
body.insert(0, heading)
|
||||
|
||||
raw_html = tostring(root, encoding=unicode)
|
||||
|
||||
return raw_html
|
||||
|
||||
def sort_index_by(self, index, weights):
|
||||
'''
|
||||
|
Loading…
x
Reference in New Issue
Block a user