News download: Use the algorithms from Readability to automatically clean up downloaded HTML. You can turn this on in your own recipes by adding auto_cleanup = True to the recipe. It is turned on by default for basic recipes created via the GUI.
parent 985d382f1a
commit 72ac735928
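
To illustrate the new option, a minimal recipe relying on the automatic cleanup might look like the sketch below; the title and feed URL are placeholders, not part of this commit:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleAutoCleanupRecipe(BasicNewsRecipe):
        title = 'Example News'
        oldest_article = 7
        max_articles_per_feed = 100
        # Use the readability-based extraction instead of hand-written
        # keep_only_tags/remove_tags rules.
        auto_cleanup = True

        feeds = [('Example feed', 'http://example.com/feed.xml')]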
@@ -219,6 +219,7 @@ class %(classname)s(%(base_class)s):
     title = %(title)s
     oldest_article = %(oldest_article)d
     max_articles_per_feed = %(max_articles)d
+    auto_cleanup = True

     feeds = %(feeds)s
 '''%dict(classname=classname, title=repr(title),
@@ -138,6 +138,12 @@ class BasicNewsRecipe(Recipe):
     #: Reverse the order of articles in each feed
     reverse_article_order = False

+    #: Automatically extract all the text from downloaded article pages. Uses
+    #: the algorithms from the readability project. Setting this to True, means
+    #: that you do not have to worry about cleaning up the downloaded HTML
+    #: manually (though manual cleanup will always be superior).
+    auto_cleanup = False
+
     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
     #: It will be inserted into `<style>` tags, just before the closing
     #: `</head>` tag thereby overriding all :term:`CSS` except that which is
@@ -452,6 +458,35 @@ class BasicNewsRecipe(Recipe):
         '''
         return None

+    def preprocess_raw_html(self, raw_html, url):
+        '''
+        This method is called with the source of each downloaded :term:`HTML` file, before
+        it is parsed into an object tree. raw_html is a unicode string
+        representing the raw HTML downloaded from the web. url is the URL from
+        which the HTML was downloaded.
+
+        Note that this method acts *before* preprocess_regexps.
+
+        This method must return the processed raw_html as a unicode object.
+        '''
+        return raw_html
+
+    def preprocess_raw_html_(self, raw_html, url):
+        raw_html = self.preprocess_raw_html(raw_html, url)
+        if self.auto_cleanup:
+            try:
+                data = self.extract_readable_article(raw_html, url)
+            except:
+                self.log.exception('Auto cleanup of URL: %r failed'%url)
+            else:
+                article_html = data[0]
+                extracted_title = data[1]
+                article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
+                article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
+                raw_html = (
+                    u'<html><head><title>%s</title></head><body>%s</body></html>'%
+                    (extracted_title, article_html))
+        return raw_html

     def preprocess_html(self, soup):
         '''
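
A hedged sketch of how a recipe might override the new preprocess_raw_html() hook; the site, feed URL and the marker string being stripped are all illustrative, not part of this commit:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExamplePreprocessRecipe(BasicNewsRecipe):
        # Hypothetical recipe used only to show the hook.
        title = 'Example Site'
        feeds = [('News', 'http://example.com/rss')]

        def preprocess_raw_html(self, raw_html, url):
            # Called with the raw unicode source of every downloaded page,
            # before parsing and before preprocess_regexps are applied.
            return raw_html.replace(u'<!-- advertisement -->', u'')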
@@ -515,13 +550,13 @@ class BasicNewsRecipe(Recipe):
             entity_to_unicode(match, encoding=enc)))
         return BeautifulSoup(_raw, markupMassage=massage)

-    def extract_readable_article(self, html, base_url):
+    def extract_readable_article(self, html, url):
         '''
         Extracts main article content from 'html', cleans up and returns as a (article_html, extracted_title) tuple.
         Based on the original readability algorithm by Arc90.
         '''
         from calibre.ebooks.readability import readability
-        doc = readability.Document(html, self.log, url=base_url)
+        doc = readability.Document(html, self.log, url=url)
         article_html = doc.summary()
         extracted_title = doc.title()
         return (article_html, extracted_title)
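
extract_readable_article() can also be called directly from a recipe, for example to run the readability extraction only on some pages instead of enabling auto_cleanup globally. A sketch under that assumption; the site and the URL pattern are hypothetical:

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleReadabilityRecipe(BasicNewsRecipe):
        title = 'Example Site'
        feeds = [('News', 'http://example.com/rss')]

        def preprocess_raw_html(self, raw_html, url):
            if '/opinion/' not in url:
                return raw_html
            article_html, extracted_title = self.extract_readable_article(raw_html, url)
            # Mirror what preprocess_raw_html_ does for auto_cleanup: drop any
            # stray <html>/<body> tags before re-wrapping the extracted content.
            article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
            return u'<html><head><title>%s</title></head><body><h1>%s</h1>%s</body></html>' % (
                extracted_title, extracted_title, article_html)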
@@ -671,6 +706,7 @@ class BasicNewsRecipe(Recipe):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = self._postprocess_html
         self.web2disk_options.encoding = self.encoding
+        self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_

         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -1417,12 +1453,7 @@ class CustomIndexRecipe(BasicNewsRecipe):

 class AutomaticNewsRecipe(BasicNewsRecipe):

-    keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
-
-    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
-        if self.use_embedded_content:
-            self.web2disk_options.keep_only_tags = []
-        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
+    auto_cleanup = True

 class CalibrePeriodical(BasicNewsRecipe):

@@ -130,6 +130,8 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
+        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
+                lambda raw, url: raw)
         self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self._is_link_wanted = getattr(options, 'is_link_wanted',
@@ -139,14 +141,16 @@ class RecursiveFetcher(object):
         self.failed_links = []
         self.job_info = job_info

-    def get_soup(self, src):
+    def get_soup(self, src, url=None):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
         # Remove comments as they can leave detritus when extracting tags leaves
         # multiple nested comments
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
-        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
+        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
+        usrc = self.preprocess_raw_html(usrc, url)
+        soup = BeautifulSoup(usrc, markupMassage=nmassage)

         replace = self.prepreprocess_html_ext(soup)
         if replace is not None:
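
For reference, the fetcher reads the new hook via getattr() with a pass-through default, so any callable taking (raw_unicode, url) and returning unicode can sit on the options object. A small illustrative sketch; the Options class and the newline normalisation are not part of this commit:

    # Illustrative only: RecursiveFetcher falls back to lambda raw, url: raw
    # when the options object has no preprocess_raw_html attribute.
    class Options(object):
        pass

    options = Options()
    options.preprocess_raw_html = lambda raw, url: raw.replace(u'\r\n', u'\n')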
@@ -425,7 +429,7 @@ class RecursiveFetcher(object):
                     else:
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]

-                    soup = self.get_soup(dsrc)
+                    soup = self.get_soup(dsrc, url=iurl)

                     base = soup.find('base', href=True)
                     if base is not None: