mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a prepreprocess_html callback to the news download code
This commit is contained in:
parent
cc9457da00
commit
2f9e4ee00d
@ -111,6 +111,8 @@ Pre/post processing of downloaded HTML
|
|||||||
|
|
||||||
.. automember:: BasicNewsRecipe.remove_javascript
|
.. automember:: BasicNewsRecipe.remove_javascript
|
||||||
|
|
||||||
|
.. automethod:: BasicNewsRecipe.prepreprocess_html
|
||||||
|
|
||||||
.. automethod:: BasicNewsRecipe.preprocess_html
|
.. automethod:: BasicNewsRecipe.preprocess_html
|
||||||
|
|
||||||
.. automethod:: BasicNewsRecipe.postprocess_html
|
.. automethod:: BasicNewsRecipe.postprocess_html
|
||||||
|
@ -403,10 +403,25 @@ class BasicNewsRecipe(Recipe):
|
|||||||
return url
|
return url
|
||||||
return article.get('link', None)
|
return article.get('link', None)
|
||||||
|
|
||||||
|
def prepreprocess_html(self, soup):
|
||||||
|
'''
|
||||||
|
This method is called with the source of each downloaded :term:`HTML` file, before
|
||||||
|
any of the cleanup attributes like remove_tags, keep_only_tags are
|
||||||
|
applied. Note that preprocess_regexps will have already been applied.
|
||||||
|
It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
|
||||||
|
It should return `soup` after processing it.
|
||||||
|
|
||||||
|
`soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
||||||
|
instance containing the downloaded :term:`HTML`.
|
||||||
|
'''
|
||||||
|
return soup
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
'''
|
'''
|
||||||
This method is called with the source of each downloaded :term:`HTML` file, before
|
This method is called with the source of each downloaded :term:`HTML` file, before
|
||||||
it is parsed for links and images.
|
it is parsed for links and images. It is called after the cleanup as
|
||||||
|
specified by remove_tags etc.
|
||||||
It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
|
It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
|
||||||
It should return `soup` after processing it.
|
It should return `soup` after processing it.
|
||||||
|
|
||||||
@ -603,7 +618,7 @@ class BasicNewsRecipe(Recipe):
|
|||||||
|
|
||||||
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
||||||
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
||||||
'preprocess_html', 'remove_tags_after',
|
'prepreprocess_html', 'preprocess_html', 'remove_tags_after',
|
||||||
'remove_tags_before', 'is_link_wanted'):
|
'remove_tags_before', 'is_link_wanted'):
|
||||||
setattr(self.web2disk_options, extra, getattr(self, extra))
|
setattr(self.web2disk_options, extra, getattr(self, extra))
|
||||||
self.web2disk_options.postprocess_html = self._postprocess_html
|
self.web2disk_options.postprocess_html = self._postprocess_html
|
||||||
|
@ -136,6 +136,7 @@ class RecursiveFetcher(object):
|
|||||||
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
|
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
|
||||||
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
||||||
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
||||||
|
self.prepreprocess_html_ext = getattr(options, 'prepreprocess_html', lambda soup: soup)
|
||||||
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
|
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
|
||||||
self._is_link_wanted = getattr(options, 'is_link_wanted',
|
self._is_link_wanted = getattr(options, 'is_link_wanted',
|
||||||
default_is_link_wanted)
|
default_is_link_wanted)
|
||||||
@ -153,6 +154,8 @@ class RecursiveFetcher(object):
|
|||||||
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
||||||
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
|
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
|
||||||
|
|
||||||
|
soup = self.prepreprocess_html_ext(soup)
|
||||||
|
|
||||||
if self.keep_only_tags:
|
if self.keep_only_tags:
|
||||||
body = Tag(soup, 'body')
|
body = Tag(soup, 'body')
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user