News download: Use the algorithms from the Readability project to automatically clean up downloaded HTML. You can turn this on in your own recipes by adding auto_cleanup=True to the recipe. It is turned on by default for basic recipes created via the GUI.
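For illustration only (this example is not part of the commit; the title and feed URL are placeholders), a hand-written recipe that opts in would look roughly like this:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleCleanupRecipe(BasicNewsRecipe):
    title                 = 'Example News'
    oldest_article        = 7    # days
    max_articles_per_feed = 100
    # Let the readability-based cleanup extract the article text instead of
    # specifying keep_only_tags/remove_tags by hand
    auto_cleanup          = True

    feeds = [('Front page', 'http://example.com/rss.xml')]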

This commit is contained in:
Kovid Goyal 2011-08-24 20:23:26 -06:00
parent 985d382f1a
commit 72ac735928
3 changed files with 47 additions and 11 deletions

View File

@@ -219,6 +219,7 @@ class %(classname)s(%(base_class)s):
title = %(title)s
oldest_article = %(oldest_article)d
max_articles_per_feed = %(max_articles)d
auto_cleanup = True
feeds = %(feeds)s
'''%dict(classname=classname, title=repr(title),

View File

@@ -138,6 +138,12 @@ class BasicNewsRecipe(Recipe):
#: Reverse the order of articles in each feed
reverse_article_order = False
#: Automatically extract all the text from downloaded article pages. Uses
#: the algorithms from the readability project. Setting this to True means
#: that you do not have to worry about cleaning up the downloaded HTML
#: manually (though manual cleanup will always be superior).
auto_cleanup = False
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files.
#: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
@@ -452,6 +458,35 @@ class BasicNewsRecipe(Recipe):
'''
return None
def preprocess_raw_html(self, raw_html, url):
'''
This method is called with the source of each downloaded :term:`HTML` file, before
it is parsed into an object tree. raw_html is a unicode string
representing the raw HTML downloaded from the web. url is the URL from
which the HTML was downloaded.
Note that this method acts *before* preprocess_regexps.
This method must return the processed raw_html as a unicode object.
'''
return raw_html
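# Illustration, not part of this commit: a recipe might override the hook
# defined above to fix up markup before it is parsed. A minimal sketch (the
# recipe name and the regex are only examples):
import re
from calibre.web.feeds.news import BasicNewsRecipe

class ScriptStrippingRecipe(BasicNewsRecipe):
    title = 'Example with raw HTML preprocessing'

    def preprocess_raw_html(self, raw_html, url):
        # Drop inline <script> blocks before the HTML is turned into a soup
        return re.sub(r'(?is)<script.*?</script>', u'', raw_html)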
def preprocess_raw_html_(self, raw_html, url):
raw_html = self.preprocess_raw_html(raw_html, url)
if self.auto_cleanup:
try:
data = self.extract_readable_article(raw_html, url)
except:
self.log.exception('Auto cleanup of URL: %r failed'%url)
else:
article_html = data[0]
extracted_title = data[1]
article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
raw_html = (
u'<html><head><title>%s</title></head><body>%s</body></html>'%
(extracted_title, article_html))
return raw_html
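# For illustration (not part of the diff): when auto_cleanup is True, a page
# whose readability summary is '<div><p>Body text</p></div>' with extracted
# title 'Example story' is rewritten by the code above to:
#   <html><head><title>Example story</title></head>
#   <body><h1>Example story</h1><div><p>Body text</p></div></body></html>
# so downstream processing sees a small, self-contained document.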
def preprocess_html(self, soup):
'''
@@ -515,13 +550,13 @@ class BasicNewsRecipe(Recipe):
entity_to_unicode(match, encoding=enc)))
return BeautifulSoup(_raw, markupMassage=massage)
def extract_readable_article(self, html, base_url):
def extract_readable_article(self, html, url):
'''
Extracts the main article content from 'html', cleans it up and returns it as an (article_html, extracted_title) tuple.
Based on the original readability algorithm by Arc90.
'''
from calibre.ebooks.readability import readability
doc = readability.Document(html, self.log, url=base_url)
doc = readability.Document(html, self.log, url=url)
article_html = doc.summary()
extracted_title = doc.title()
return (article_html, extracted_title)
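# A hedged sketch (not part of this commit): a recipe could call the helper
# above itself to run readability extraction on only some pages, instead of
# enabling auto_cleanup for every article. The URL test is a placeholder.
from calibre.web.feeds.news import BasicNewsRecipe

class SelectiveCleanupRecipe(BasicNewsRecipe):
    title = 'Selective cleanup example'

    def preprocess_raw_html(self, raw_html, url):
        if '/story/' not in url:
            return raw_html
        article_html, extracted_title = self.extract_readable_article(raw_html, url)
        return u'<html><head><title>%s</title></head><body><h1>%s</h1>%s</body></html>' % (
                extracted_title, extracted_title, article_html)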
@@ -671,6 +706,7 @@ class BasicNewsRecipe(Recipe):
setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = self._postprocess_html
self.web2disk_options.encoding = self.encoding
self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_
if self.delay > 0:
self.simultaneous_downloads = 1
@@ -1417,12 +1453,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
class AutomaticNewsRecipe(BasicNewsRecipe):
keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
if self.use_embedded_content:
self.web2disk_options.keep_only_tags = []
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
auto_cleanup = True
class CalibrePeriodical(BasicNewsRecipe):

View File

@@ -130,6 +130,8 @@ class RecursiveFetcher(object):
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
lambda raw, url: raw)
self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
self._is_link_wanted = getattr(options, 'is_link_wanted',
@@ -139,14 +141,16 @@ class RecursiveFetcher(object):
self.failed_links = []
self.job_info = job_info
def get_soup(self, src):
def get_soup(self, src, url=None):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
# Remove comments, as they can leave behind detritus when tag extraction
# results in multiple nested comments
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
usrc = self.preprocess_raw_html(usrc, url)
soup = BeautifulSoup(usrc, markupMassage=nmassage)
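# Note (added for clarity, not in the diff): because preprocess_raw_html runs
# on the decoded unicode source before BeautifulSoup parses it, it executes
# before the recipe's preprocess_regexps (applied via the markup massage
# above) and before skip_ad_pages/preprocess_html, matching the docstring of
# BasicNewsRecipe.preprocess_raw_html.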
replace = self.prepreprocess_html_ext(soup)
if replace is not None:
@@ -425,7 +429,7 @@ class RecursiveFetcher(object):
else:
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
soup = self.get_soup(dsrc)
soup = self.get_soup(dsrc, url=iurl)
base = soup.find('base', href=True)
if base is not None: