News download: Use the algorithms from Readability to automatically clean up downloaded HTML. You can turn this on in your own recipes by adding auto_cleanup=True to the recipe. It is turned on by default for basic recipes created via the GUI.
commit 72ac735928
parent 985d382f1a
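
For recipe authors, turning the new cleanup on is a one-line change. A minimal sketch of such a recipe (the class name, title and feed URL are illustrative placeholders, not part of this commit):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleNews(BasicNewsRecipe):
        title = 'Example News'  # placeholder
        oldest_article = 7
        max_articles_per_feed = 100
        # Use the readability-based extraction instead of hand-written
        # keep_only_tags/remove_tags rules.
        auto_cleanup = True

        feeds = [('Front page', 'http://example.com/feed.xml')]  # placeholder feed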
@@ -219,6 +219,7 @@ class %(classname)s(%(base_class)s):
     title = %(title)s
     oldest_article = %(oldest_article)d
     max_articles_per_feed = %(max_articles)d
+    auto_cleanup = True

     feeds = %(feeds)s
 '''%dict(classname=classname, title=repr(title),
@@ -138,6 +138,12 @@ class BasicNewsRecipe(Recipe):
     #: Reverse the order of articles in each feed
     reverse_article_order = False

+    #: Automatically extract all the text from downloaded article pages. Uses
+    #: the algorithms from the readability project. Setting this to True, means
+    #: that you do not have to worry about cleaning up the downloaded HTML
+    #: manually (though manual cleanup will always be superior).
+    auto_cleanup = False
+
     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
     #: It will be inserted into `<style>` tags, just before the closing
     #: `</head>` tag thereby overriding all :term:`CSS` except that which is
@@ -452,6 +458,35 @@ class BasicNewsRecipe(Recipe):
         '''
         return None

+    def preprocess_raw_html(self, raw_html, url):
+        '''
+        This method is called with the source of each downloaded :term:`HTML` file, before
+        it is parsed into an object tree. raw_html is a unicode string
+        representing the raw HTML downloaded from the web. url is the URL from
+        which the HTML was downloaded.
+
+        Note that this method acts *before* preprocess_regexps.
+
+        This method must return the processed raw_html as a unicode object.
+        '''
+        return raw_html
+
+    def preprocess_raw_html_(self, raw_html, url):
+        raw_html = self.preprocess_raw_html(raw_html, url)
+        if self.auto_cleanup:
+            try:
+                data = self.extract_readable_article(raw_html, url)
+            except:
+                self.log.exception('Auto cleanup of URL: %r failed'%url)
+            else:
+                article_html = data[0]
+                extracted_title = data[1]
+                article_html = re.sub(ur'</?(html|body)/?>', u'', article_html)
+                article_html = u'<h1>%s</h1>%s'%(extracted_title, article_html)
+                raw_html = (
+                    u'<html><head><title>%s</title></head><body>%s</body></html>'%
+                    (extracted_title, article_html))
+        return raw_html
+
     def preprocess_html(self, soup):
         '''
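
The hook added above can also be overridden directly in a recipe for manual cleanup of the raw source. A sketch under the semantics documented in the docstring (the recipe name and the regular expression are invented for illustration):

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class RawCleanupExample(BasicNewsRecipe):
        title = 'Raw cleanup example'  # placeholder
        auto_cleanup = False  # cleaning up manually instead

        def preprocess_raw_html(self, raw_html, url):
            # Runs on the unicode source of every downloaded page, before
            # preprocess_regexps and before parsing; must return unicode.
            # Illustrative pattern: strip a hypothetical ad marker block.
            return re.sub(r'(?s)<!-- ad-start -->.*?<!-- ad-end -->', '', raw_html)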
@@ -515,13 +550,13 @@ class BasicNewsRecipe(Recipe):
                     entity_to_unicode(match, encoding=enc)))
         return BeautifulSoup(_raw, markupMassage=massage)

-    def extract_readable_article(self, html, base_url):
+    def extract_readable_article(self, html, url):
         '''
         Extracts main article content from 'html', cleans up and returns as a (article_html, extracted_title) tuple.
         Based on the original readability algorithm by Arc90.
         '''
         from calibre.ebooks.readability import readability
-        doc = readability.Document(html, self.log, url=base_url)
+        doc = readability.Document(html, self.log, url=url)
         article_html = doc.summary()
         extracted_title = doc.title()
         return (article_html, extracted_title)
@@ -671,6 +706,7 @@ class BasicNewsRecipe(Recipe):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = self._postprocess_html
         self.web2disk_options.encoding = self.encoding
+        self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_

         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -1417,12 +1453,7 @@ class CustomIndexRecipe(BasicNewsRecipe):

 class AutomaticNewsRecipe(BasicNewsRecipe):

-    keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
-
-    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
-        if self.use_embedded_content:
-            self.web2disk_options.keep_only_tags = []
-        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
+    auto_cleanup = True

 class CalibrePeriodical(BasicNewsRecipe):

@@ -130,6 +130,8 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
+        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
+                lambda raw, url: raw)
         self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self._is_link_wanted = getattr(options, 'is_link_wanted',
@@ -139,14 +141,16 @@ class RecursiveFetcher(object):
         self.failed_links = []
         self.job_info = job_info

-    def get_soup(self, src):
+    def get_soup(self, src, url=None):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
         # Remove comments as they can leave detritus when extracting tags leaves
         # multiple nested comments
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
-        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
+        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
+        usrc = self.preprocess_raw_html(usrc, url)
+        soup = BeautifulSoup(usrc, markupMassage=nmassage)

         replace = self.prepreprocess_html_ext(soup)
         if replace is not None:
@@ -425,7 +429,7 @@ class RecursiveFetcher(object):
             else:
                 dsrc = xml_to_unicode(dsrc, self.verbose)[0]

-            soup = self.get_soup(dsrc)
+            soup = self.get_soup(dsrc, url=iurl)

             base = soup.find('base', href=True)
             if base is not None:
||||||