Add readability library to calibre and a Hacker News recipe by Tom Scholl. Fixes #833261 (Add readability lib for use in recipes)

Kovid Goyal 2011-08-24 17:48:14 -06:00
commit a7f9931262
9 changed files with 812 additions and 0 deletions

COPYRIGHT
View File

@@ -28,6 +28,12 @@ License: other
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved.
Files: src/calibre/ebooks/readability/*
Copyright: Unknown
License: Apache 2.0
The full text of the Apache 2.0 license is available at:
http://www.apache.org/licenses/LICENSE-2.0
Files: /src/cherrypy/*
Copyright: Copyright (c) 2004-2007, CherryPy Team (team@cherrypy.org)
Copyright: Copyright (C) 2005, Tiago Cogumbreiro <cogumbreiro@users.sf.net>

recipes/hackernews.recipe Normal file
View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
class HackerNews(BasicNewsRecipe):
title = 'Hacker News'
__author__ = 'Tom Scholl'
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
publisher = 'Y Combinator'
category = 'news, programming, it, technology'
masthead_url = 'http://i55.tinypic.com/2u6io76.png'
cover_url = 'http://i55.tinypic.com/2u6io76.png'
delay = 1
max_articles_per_feed = 30
use_embedded_content = False
no_stylesheets = True
encoding = 'utf-8'
language = 'en'
requires_version = (0,8,16)
feeds = [
(u'Hacker News', 'http://news.ycombinator.com/rss')
]
temp_files = []
articles_are_obfuscated = True
    def get_readable_content(self, url):
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()

        article_html, extracted_title = self.extract_readable_article(html, url)
        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + \
                self.prettify_url(url) + u')</span><br/>' + article_html
        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + \
                article_html + u'</body></html>'
def get_hn_content(self, url):
self.log('get_hn_content(' + url + ')')
# this could be improved
br = self.get_browser()
f = br.open(url)
html = f.read()
f.close()
return html
def get_obfuscated_article(self, url):
if url.startswith('http://news.ycombinator.com'):
content = self.get_hn_content(url)
else:
# TODO: use content-type header instead of url
is_image = False
for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp',]:
if url.endswith(ext):
is_image = True
break
if is_image:
self.log('using image_content (' + url + ')')
content = u'<html><body><img src="' + url + u'"></body></html>'
else:
content = self.get_readable_content(url)
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(content)
self.temp_files[-1].close()
return self.temp_files[-1].name
def is_link_wanted(self, url, tag):
if url.endswith('.pdf'):
return False
return True
    def prettify_url(self, url):
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        article.text_summary = self.prettify_url(article.url)
        article.summary = article.text_summary
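The recipe above relies on calibre's obfuscation hook: when articles_are_obfuscated is True, calibre calls get_obfuscated_article(url) for each article and parses the local file path it returns, instead of downloading the original URL itself. A minimal sketch of that contract (hypothetical recipe, mirroring the code above):

    from calibre.web.feeds.news import BasicNewsRecipe
    from calibre.ptempfile import PersistentTemporaryFile

    class MinimalObfuscated(BasicNewsRecipe):
        title = 'Minimal obfuscated example'  # hypothetical
        feeds = [(u'Example', 'http://example.com/rss')]
        articles_are_obfuscated = True
        temp_files = []

        def get_obfuscated_article(self, url):
            # fetch the page ourselves, stash it in a temp file and hand
            # calibre the file path to parse instead of the URL
            html = self.get_browser().open(url).read()
            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write(html)
            self.temp_files[-1].close()
            return self.temp_files[-1].name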

src/calibre/ebooks/readability/README Normal file
View File

@@ -0,0 +1,37 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
This is a Python port of a Ruby port of arc90's readability project, taken
from https://github.com/buriy/python-readability
The original readability project:
http://lab.arc90.com/experiments/readability/
In a few words: given an HTML document, it pulls out the main body text
and cleans it up. It can also clean up the title, based on the latest
readability.js code.
Based on:
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
- Ruby port by starrhorne and iterationlabs
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
- "BR to P" fix from readability.js which improves quality for smaller texts.
- Github users contributions.
Installation::
easy_install readability-lxml
or
pip install readability-lxml
Usage::
from readability.readability import Document
import urllib
html = urllib.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
Command-line usage::
python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
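Note that the copy bundled with calibre (below) changes the Document constructor to take a log object as a second positional argument, so the upstream usage above does not apply verbatim. A sketch against the bundled copy (assuming calibre's default_log is importable here):

    from calibre.ebooks.readability.readability import Document
    from calibre.utils.logging import default_log

    html = open('page.html', 'rb').read()
    doc = Document(html, default_log, url='http://example.com/page.html')
    print doc.summary()      # cleaned article body
    print doc.short_title()  # de-cluttered title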

src/calibre/ebooks/readability/__init__.py Normal file
View File

@@ -0,0 +1 @@

src/calibre/ebooks/readability/cleaners.py Normal file
View File

@@ -0,0 +1,32 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
"([^>]+) " # prefix
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
"([^>]*)" # postfix
">" # end
, re.I)
def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html

def normalize_spaces(s):
    """Replace any sequence of whitespace
    characters with a single space."""
    if not s:
        return ''
    return ' '.join(s.split())
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True, embedded=False,
frames=False, forms=False, annoying_tags=False, remove_tags=None,
remove_unknown_tags=False, safe_attrs_only=False)
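A quick sketch of how clean_attributes behaves (illustrative input; each pass of the while loop strips one matching attribute, so the loop runs until none remain):

    sample = '<td width="200" style="color: red" align="left">x</td>'
    print clean_attributes(sample)
    # pass 1 removes style=..., pass 2 removes width=...,
    # leaving: <td align="left">x</td>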

src/calibre/ebooks/readability/debug.py Normal file
View File

@@ -0,0 +1,25 @@
def save_to_file(text, filename):
f = open(filename, 'wt')
f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
f.write(text.encode('utf-8'))
f.close()
uids = {}
def describe(node, depth=2):
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get('id', ''): name += '#'+node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ','.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if name in ['tr', 'td', 'div', 'p']:
        if node not in uids:
uid = uids[node] = len(uids)+1
else:
uid = uids.get(node)
name += "%02d" % (uid)
if depth and node.getparent() is not None:
return name+' - '+describe(node.getparent(), depth-1)
return name
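# Illustrative describe() outputs (hypothetical nodes; depth defaults to 2,
# so up to two levels of parent context are appended):
#   <div id="main" class="post body">  ->  '#main.post.body - body - html'
#   a bare <p> inside that div         ->  'p01 - #main.post.body - body'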

src/calibre/ebooks/readability/htmls.py Normal file
View File

@@ -0,0 +1,103 @@
import re
from lxml.html import tostring
import lxml.html
from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode
def build_doc(page):
page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
doc = lxml.html.document_fromstring(page_unicode)
return doc
def js_re(src, pattern, flags, repl):
    # apply a JavaScript-style replacement ($1 -> \1) to src
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
def normalize_entities(cur_title):
entities = {
u'\u2014':'-',
u'\u2013':'-',
u'&mdash;': '-',
u'&ndash;': '-',
u'\u00A0': ' ',
u'\u00AB': '"',
u'\u00BB': '"',
u'&quot;': '"',
}
for c, r in entities.iteritems():
if c in cur_title:
cur_title = cur_title.replace(c, r)
return cur_title
def norm_title(title):
return normalize_entities(normalize_spaces(title))
def get_title(doc):
    title = doc.find('.//title')
    if title is None or not title.text:
        return '[no-title]'
    return norm_title(title.text)
def add_match(collection, text, orig):
text = norm_title(text)
if len(text.split()) >= 2 and len(text) >= 15:
if text.replace('"', '') in orig.replace('"', ''):
collection.add(text)
def shorten_title(doc):
    title = doc.find('.//title')
    if title is None or not title.text:
        return ''
    title = orig = norm_title(title.text)
candidates = set()
for item in ['.//h1', './/h2', './/h3']:
for e in list(doc.iterfind(item)):
if e.text:
add_match(candidates, e.text, orig)
if e.text_content():
add_match(candidates, e.text_content(), orig)
for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
for e in doc.cssselect(item):
if e.text:
add_match(candidates, e.text, orig)
if e.text_content():
add_match(candidates, e.text_content(), orig)
if candidates:
title = sorted(candidates, key=len)[-1]
else:
for delimiter in [' | ', ' - ', ' :: ', ' / ']:
if delimiter in title:
parts = orig.split(delimiter)
if len(parts[0].split()) >= 4:
title = parts[0]
break
elif len(parts[-1].split()) >= 4:
title = parts[-1]
break
else:
if ': ' in title:
parts = orig.split(': ')
if len(parts[-1].split()) >= 4:
title = parts[-1]
else:
title = orig.split(': ', 1)[1]
if not 15 < len(title) < 150:
return orig
return title
def get_body(doc):
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    return clean_attributes(raw_html)
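A worked example of the title heuristics above (standalone sketch; assumes lxml with cssselect support):

    import lxml.html
    page = ('<html><head><title>An Interesting Story About Things - Example Site'
            '</title></head><body><h1>An Interesting Story About Things</h1>'
            '<p>body text</p></body></html>')
    doc = lxml.html.document_fromstring(page)
    print get_title(doc)      # -> An Interesting Story About Things - Example Site
    print shorten_title(doc)  # -> An Interesting Story About Things (the <h1> match)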

src/calibre/ebooks/readability/readability.py Normal file
View File

@@ -0,0 +1,508 @@
import re, sys
from collections import defaultdict
from lxml.etree import tostring, tounicode
from lxml.html import fragment_fromstring, document_fromstring
from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes
REGEXES = {
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile('^\s+|\s+$/'),
#'normalizeRe': re.compile('\s{2,}/'),
#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
#skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
def describe(node, depth=1):
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get('id', ''): name += '#'+node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ','.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if depth and node.getparent() is not None:
return name+' - '+describe(node.getparent(), depth-1)
return name
def to_int(x):
if not x: return None
x = x.strip()
if x.endswith('px'):
return int(x[:-2])
if x.endswith('em'):
return int(x[:-2]) * 12
return int(x)
def clean(text):
text = re.sub('\s*\n\s*', '\n', text)
text = re.sub('[ \t]{2,}', ' ', text)
return text.strip()
def text_length(i):
return len(clean(i.text_content() or ""))
class Unparseable(ValueError):
pass
class Document:
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(self, input, log, **options):
self.input = input
self.options = defaultdict(lambda: None)
for k, v in options.items():
self.options[k] = v
self.html = None
self.log = log
def _html(self, force=False):
if force or self.html is None:
self.html = self._parse(self.input)
return self.html
def _parse(self, input):
doc = build_doc(input)
doc = html_cleaner.clean_html(doc)
base_href = self.options['url']
if base_href:
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()
return doc
def content(self):
return get_body(self._html(True))
def title(self):
return get_title(self._html(True))
def short_title(self):
return shorten_title(self._html(True))
def summary(self):
try:
ruthless = True
while True:
self._html(True)
for i in self.tags(self.html, 'script', 'style'):
i.drop_tree()
for i in self.tags(self.html, 'body'):
i.set('id', 'readabilityBody')
if ruthless:
self.remove_unlikely_candidates()
self.transform_misused_divs_into_paragraphs()
candidates = self.score_paragraphs()
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
article = self.get_article(candidates, best_candidate)
else:
if ruthless:
self.log.debug("ruthless removal did not work. ")
ruthless = False
self.debug("ended up stripping too much - going for a safer _parse")
# try again
continue
else:
self.log.debug("Ruthless and lenient parsing did not work. Returning raw html")
article = self.html.find('body')
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
if ruthless and not of_acceptable_length:
ruthless = False
continue # try again
else:
return cleaned_article
except StandardError, e:
self.log.exception('error getting summary: ' )
raise Unparseable(str(e)), None, sys.exc_info()[2]
def get_article(self, candidates, best_candidate):
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
append = False
if sibling is best_elem:
append = True
sibling_key = sibling #HashableElement(sibling)
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
append = True
if sibling.tag == "p":
link_density = self.get_link_density(sibling)
node_content = sibling.text or ""
node_length = len(node_content)
if node_length > 80 and link_density < 0.25:
append = True
elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
append = True
if append:
output.append(sibling)
#if output is not None:
# output.append(best_elem)
return output
def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
for candidate in sorted_candidates[:5]:
elem = candidate['elem']
self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
if len(sorted_candidates) == 0:
return None
best_candidate = sorted_candidates[0]
return best_candidate
def get_link_density(self, elem):
link_length = 0
for i in elem.findall(".//a"):
link_length += text_length(i)
#if len(elem.findall(".//div") or elem.findall(".//p")):
# link_length = link_length
total_length = text_length(elem)
return float(link_length) / max(total_length, 1)
    def score_paragraphs(self):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
candidates = {}
#self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
ordered = []
for elem in self.tags(self.html, "p", "pre", "td"):
parent_node = elem.getparent()
if parent_node is None:
continue
grand_parent_node = parent_node.getparent()
inner_text = clean(elem.text_content() or "")
inner_text_len = len(inner_text)
# If this paragraph is less than 25 characters, don't even count it.
if inner_text_len < MIN_LEN:
continue
if parent_node not in candidates:
candidates[parent_node] = self.score_node(parent_node)
ordered.append(parent_node)
if grand_parent_node is not None and grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(grand_parent_node)
ordered.append(grand_parent_node)
content_score = 1
content_score += len(inner_text.split(','))
content_score += min((inner_text_len / 100), 3)
#if elem not in candidates:
# candidates[elem] = self.score_node(elem)
#WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]['content_score'] += content_score
if grand_parent_node is not None:
candidates[grand_parent_node]['content_score'] += content_score / 2.0
# Scale the final candidates score based on link density. Good content should have a
# relatively small link density (5% or less) and be mostly unaffected by this operation.
for elem in ordered:
candidate = candidates[elem]
ld = self.get_link_density(elem)
score = candidate['content_score']
self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
candidate['content_score'] *= (1 - ld)
return candidates
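    # Worked example of the scoring above (illustrative): a <p> whose text is
    # 260 characters long and contains two commas adds 1 (base) + 3 (splitting
    # on commas yields three chunks) + 2 (min(260/100, 3) under integer
    # division) = 6 points to its parent's content_score, and 6/2.0 = 3.0 to
    # its grandparent's, before the link-density scaling at the end.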
def class_weight(self, e):
weight = 0
if e.get('class', None):
if REGEXES['negativeRe'].search(e.get('class')):
weight -= 25
if REGEXES['positiveRe'].search(e.get('class')):
weight += 25
if e.get('id', None):
if REGEXES['negativeRe'].search(e.get('id')):
weight -= 25
if REGEXES['positiveRe'].search(e.get('id')):
weight += 25
return weight
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name == "div":
content_score += 5
elif name in ["pre", "td", "blockquote"]:
content_score += 3
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
content_score -= 3
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
content_score -= 5
return {
'content_score': content_score,
'elem': elem
}
def debug(self, *a):
#if self.options['debug']:
self.log.debug(*a)
def remove_unlikely_candidates(self):
for elem in self.html.iter():
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
self.debug("Removing unlikely candidate - %s" % describe(elem))
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into <p>s
if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
#self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
for elem in self.tags(self.html, 'div'):
if elem.text and elem.text.strip():
p = fragment_fromstring('<p/>')
p.text = elem.text
elem.text = None
elem.insert(0, p)
#print "Appended "+tounicode(p)+" to "+describe(elem)
for pos, child in reversed(list(enumerate(elem))):
if child.tail and child.tail.strip():
p = fragment_fromstring('<p/>')
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
#print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == 'br':
#print 'Dropped <br> at '+describe(elem)
child.drop_tree()
def tags(self, node, *tag_names):
for tag_name in tag_names:
for e in node.findall('.//%s' % tag_name):
yield e
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
for e in reversed(node.findall('.//%s' % tag_name)):
yield e
def sanitize(self, node, candidates):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed:
continue
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]['content_score']
#print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
self.debug("Cleaned %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' %kind))
counts["li"] -= 100
content_length = text_length(el) # Count the text length excluding any surrounding whitespace
link_density = self.get_link_density(el)
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]['content_score']
else:
content_score = 0
#if parent_node is not None:
#pweight = self.class_weight(parent_node) + content_score
#pname = describe(parent_node)
#else:
#pweight = 0
#pname = "no parent"
to_remove = False
reason = ""
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > counts["p"]:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
reason = "too short content length %s without a single image" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "<embed>s with too short content length, or too many <embed>s"
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
#
# height = img.get('height')
# text_length = img.get('text_length')
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# self.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# self.debug("Allowing %s" %el.text_content())
# for desnode in self.tags(el, "table", "ul", "div"):
# allowed[desnode] = True
                # find x non-empty preceding and succeeding siblings
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    #self.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    #self.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
#self.debug(str(siblings))
if siblings and sum(siblings) > 1000 :
to_remove = False
self.debug("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:
self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
(content_score, describe(el), weight, reason))
#print tounicode(el)
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
for el in ([node] + [n for n in node.iter()]):
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
return clean_attributes(tounicode(node))
class HashableElement():
def __init__(self, node):
self.node = node
self._path = None
def _get_path(self):
if self._path is None:
reverse_path = []
node = self.node
while node is not None:
node_id = (node.tag, tuple(node.attrib.items()), node.text)
reverse_path.append(node_id)
node = node.getparent()
self._path = tuple(reverse_path)
return self._path
path = property(_get_path)
def __hash__(self):
return hash(self.path)
def __eq__(self, other):
return self.path == other.path
def __getattr__(self, tag):
return getattr(self.node, tag)
def main():
import logging
from optparse import OptionParser
parser = OptionParser(usage="%prog: [options] [file]")
parser.add_option('-v', '--verbose', action='store_true')
parser.add_option('-u', '--url', help="use URL instead of a local file")
(options, args) = parser.parse_args()
if not (len(args) == 1 or options.url):
parser.print_help()
sys.exit(1)
logging.basicConfig(level=logging.INFO)
file = None
if options.url:
import urllib
file = urllib.urlopen(options.url)
else:
file = open(args[0], 'rt')
enc = sys.__stdout__.encoding or 'utf-8'
    try:
        # this port of Document requires a log object as the second argument;
        # a stdlib logger provides the debug()/exception() methods it calls
        print Document(file.read(), logging.getLogger('readability'),
                debug=options.verbose).summary().encode(enc, 'replace')
finally:
file.close()
if __name__ == '__main__':
main()
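Assuming calibre's package layout is importable, the entry point above can be exercised much like the upstream README's example (assumed invocation):

    python -m calibre.ebooks.readability.readability -u http://example.com/some-article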

src/calibre/web/feeds/news.py
View File

@@ -515,6 +515,16 @@ class BasicNewsRecipe(Recipe):
                entity_to_unicode(match, encoding=enc)))
        return BeautifulSoup(_raw, markupMassage=massage)
    def extract_readable_article(self, html, base_url):
        '''
        Extract the main article content from 'html', clean it up and return
        it as an (article_html, extracted_title) tuple. Based on the original
        readability algorithm by Arc90.
        '''
        from calibre.ebooks.readability import readability
        doc = readability.Document(html, self.log, url=base_url)
        article_html = doc.summary()
        extracted_title = doc.title()
        return (article_html, extracted_title)
    def sort_index_by(self, index, weights):
        '''
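With this helper in place, recipes can delegate content extraction to readability, as the Hacker News recipe above does; a condensed sketch of typical use:

    def get_readable_content(self, url):  # method on a BasicNewsRecipe subclass
        raw = self.get_browser().open(url).read()
        article_html, title = self.extract_readable_article(raw, url)
        return u'<html><head><title>%s</title></head><body>%s</body></html>' % (
                title, article_html)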