Mirror of https://github.com/kovidgoyal/calibre.git
Make the use of readability a little more robust
commit 4f21f06f76 (parent 67b7615a8e)

@@ -38,11 +38,7 @@ class HackerNews(BasicNewsRecipe):
         html = f.read()
         f.close()
 
-        data = self.extract_readable_article(html, url)
-        article_html = data[0]
-        extracted_title = data[1]
-        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
-        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
+        return self.extract_readable_article(html, url)
 
     def get_hn_content(self, url):
         self.log('get_hn_content(' + url + ')')
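
The recipe-side simplification works because, after this commit, BasicNewsRecipe.extract_readable_article returns one complete HTML document string instead of an (article_html, extracted_title) pair (see the BasicNewsRecipe hunks below), so the recipe no longer stitches the <cite> header and <html>/<title> wrapper together by hand. A minimal sketch of the before/after call shape inside a recipe method; the method name and the fetch helper are illustrative assumptions, not part of the commit:

def cleaned_article(self, url):
    # Hypothetical helper on a BasicNewsRecipe subclass; any way of obtaining
    # the raw page markup would do here.
    raw = self.index_to_soup(url, raw=True)
    # Old contract: a tuple the caller had to re-wrap itself:
    #   article_html, extracted_title = self.extract_readable_article(raw, url)
    #   return u'<html><head><title>' + extracted_title + u'</title>...'
    # New contract: a finished document, returned as-is:
    return self.extract_readable_article(raw, url)
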
@@ -2,14 +2,15 @@ import re, sys
 from collections import defaultdict
 
 from lxml.etree import tostring
-from lxml.html import fragment_fromstring, document_fromstring
+from lxml.html import (fragment_fromstring, document_fromstring,
+        tostring as htostring)
 from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
 from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
 
 def tounicode(tree_or_node, **kwargs):
     kwargs['encoding'] = unicode
-    return tostring(tree_or_node, **kwargs)
+    return htostring(tree_or_node, **kwargs)
 
 
 REGEXES = {
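
The alias avoids clobbering the lxml.etree.tostring imported just above, and switching tounicode() to lxml.html's tostring means trees are serialized with HTML rules (method="html") rather than XML rules. A small illustration of the difference; the markup is an arbitrary example:

from lxml.etree import tostring as xml_tostring
from lxml.html import fragment_fromstring, tostring as htostring

el = fragment_fromstring('<div><br><span></span>text</div>')

# XML serialization self-closes empty elements, which HTML parsers can
# misread (a bare <span/> can swallow the text that follows it).
print(xml_tostring(el, encoding='unicode'))  # <div><br/><span/>text</div>

# lxml.html.tostring defaults to method="html", keeping <br> and <span></span>.
print(htostring(el, encoding='unicode'))     # <div><br><span></span>text</div>
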
@@ -144,6 +145,7 @@ class Document:
 
         sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
         output = document_fromstring('<div/>')
+        parent = output.xpath('//div')[0]
         best_elem = best_candidate['elem']
         for sibling in best_elem.getparent().getchildren():
             #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
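
The extra handle is needed because lxml's document_fromstring never returns a bare fragment: parsing '<div/>' yields a full document whose root is the <html> element, so appending siblings to output directly would land them outside the <body>. A quick demonstration of that wrapping behaviour (exact serialization may vary slightly):

from lxml.html import document_fromstring, tostring

output = document_fromstring('<div/>')
print(output.tag)                            # 'html' -- a whole document is built
print(tostring(output, encoding='unicode'))  # roughly <html><body><div></div></body></html>
parent = output.xpath('//div')[0]            # the <div> the siblings should land in
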
@@ -165,10 +167,10 @@ class Document:
                     append = True
 
             if append:
-                output.append(sibling)
+                parent.append(sibling)
         #if output is not None:
         #    output.append(best_elem)
-        return output
+        return output.find('body')
 
     def select_best_candidate(self, candidates):
         sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
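
Net effect of the two Document hunks: scored siblings accumulate inside the inner <div>, and the method now returns the <body> element of that scratch document, so serializing the result no longer drags along a stray <html> wrapper. A sketch of the assembled structure, with placeholder paragraphs standing in for the scored siblings:

from lxml.html import document_fromstring, fragment_fromstring, tostring as htostring

output = document_fromstring('<div/>')
parent = output.xpath('//div')[0]
for snippet in ('<p>first candidate</p>', '<p>second candidate</p>'):
    parent.append(fragment_fromstring(snippet))

body = output.find('body')   # direct-child lookup on the <html> root
print(htostring(body, encoding='unicode'))
# <body><div><p>first candidate</p><p>second candidate</p></div></body>
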
@@ -475,17 +475,10 @@ class BasicNewsRecipe(Recipe):
         raw_html = self.preprocess_raw_html(raw_html, url)
         if self.auto_cleanup:
             try:
-                data = self.extract_readable_article(raw_html, url)
+                raw_html = self.extract_readable_article(raw_html, url)
             except:
                 self.log.exception('Auto cleanup of URL: %r failed'%url)
-            else:
-                article_html = data[0]
-                extracted_title = data[1]
-                article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
-                article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
-                raw_html = (
-                    u'<html><head><title>%s</title></head><body>%s</body></html>'%
-                    (extracted_title, article_html))
         return raw_html
 
     def preprocess_html(self, soup):
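
On the BasicNewsRecipe side the whole else: block becomes unnecessary: tag stripping, title injection and re-wrapping now happen inside extract_readable_article (next hunk), and this call site only decides whether the cleanup result replaces raw_html. The robustness win is that any readability failure is logged while the original markup survives untouched. A condensed, hypothetical wrapper showing the new control flow; names mirror the hunk, the enclosing recipe object is assumed:

def fetch_and_clean(self, raw_html, url):
    # Sketch only, not code from the commit.
    raw_html = self.preprocess_raw_html(raw_html, url)
    if self.auto_cleanup:
        try:
            # On success the page is replaced by a cleaned, complete document.
            raw_html = self.extract_readable_article(raw_html, url)
        except Exception:  # the commit itself keeps a bare 'except:'
            # On failure the original markup is kept and the error is logged.
            self.log.exception('Auto cleanup of URL: %r failed' % url)
    return raw_html
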
@@ -556,10 +549,41 @@ class BasicNewsRecipe(Recipe):
         Based on the original readability algorithm by Arc90.
         '''
         from calibre.ebooks.readability import readability
+        from lxml.html import (fragment_fromstring, tostring,
+                document_fromstring)
+
         doc = readability.Document(html, self.log, url=url)
         article_html = doc.summary()
         extracted_title = doc.title()
-        return (article_html, extracted_title)
+
+        frag = fragment_fromstring(article_html)
+        if frag.tag == 'html':
+            root = frag
+        elif frag.tag == 'body':
+            root = document_fromstring(
+                u'<html><head><title>%s</title></head></html>' %
+                extracted_title)
+            root.append(frag)
+        else:
+            root = document_fromstring(
+                u'<html><head><title>%s</title></head><body/></html>' %
+                extracted_title)
+            root.xpath('//body')[0].append(frag)
+
+        body = root.xpath('//body')[0]
+        has_title = False
+        for x in body.iterdescendants():
+            if x.text == extracted_title:
+                has_title = True
+        inline_titles = body.xpath('//h1|//h2')
+        if not has_title and not inline_titles:
+            heading = body.makeelement('h2')
+            heading.text = extracted_title
+            body.insert(0, heading)
+
+        raw_html = tostring(root, encoding=unicode)
+
+        return raw_html
 
     def sort_index_by(self, index, weights):
         '''
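
The new extract_readable_article therefore always hands back a complete document: whatever shape readability's summary() produces (a full <html>, a bare <body>, or a loose fragment) is re-rooted under a document carrying the extracted title, and a heading is injected only when the title text is not already visible in the body. A standalone sketch of that title-injection heuristic, using an invented article in place of readability output:

from lxml.html import document_fromstring, tostring

# Invented stand-ins, not taken from the commit.
extracted_title = u'An example headline'
root = document_fromstring(
    u'<html><head><title>%s</title></head>'
    u'<body><div><p>Some article text without any heading.</p></div></body></html>'
    % extracted_title)
body = root.xpath('//body')[0]

# Same heuristic as the hunk: add an <h2> only if the title text does not
# already appear and the article has no <h1>/<h2> of its own.
has_title = any(x.text == extracted_title for x in body.iterdescendants())
if not has_title and not body.xpath('//h1|//h2'):
    heading = body.makeelement('h2')
    heading.text = extracted_title
    body.insert(0, heading)

# The diff passes encoding=unicode (Python 2 era); the 'unicode' string does
# the same job on current lxml.
print(tostring(root, encoding='unicode'))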