News download: Fix prepreprocess_html method

This commit is contained in:
Kovid Goyal 2010-06-12 17:08:26 -06:00
parent 0d66fe64cf
commit 7cf81e7bff
3 changed files with 11 additions and 8 deletions

View File

@@ -111,7 +111,7 @@ Pre/post processing of downloaded HTML
.. automember:: BasicNewsRecipe.remove_javascript
.. automethod:: BasicNewsRecipe.prepreprocess_html
.. automethod:: BasicNewsRecipe.skip_ad_pages
.. automethod:: BasicNewsRecipe.preprocess_html

View File

@@ -413,18 +413,19 @@ class BasicNewsRecipe(Recipe):
return url
return article.get('link', None)
def prepreprocess_html(self, soup):
def skip_ad_pages(self, soup):
'''
This method is called with the source of each downloaded :term:`HTML` file, before
any of the cleanup attributes like remove_tags, keep_only_tags are
applied. Note that preprocess_regexps will have already been applied.
It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
It should return `soup` after processing it.
It is meant to allow the recipe to skip ad pages. If the soup represents
an ad page, return the HTML of the real page. Otherwise return
None.
`soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
instance containing the downloaded :term:`HTML`.
'''
return soup
return None
def preprocess_html(self, soup):
@@ -628,7 +629,7 @@ class BasicNewsRecipe(Recipe):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'prepreprocess_html', 'preprocess_html', 'remove_tags_after',
'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
'remove_tags_before', 'is_link_wanted'):
setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = self._postprocess_html

View File

@@ -136,7 +136,7 @@ class RecursiveFetcher(object):
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.prepreprocess_html_ext = getattr(options, 'prepreprocess_html', lambda soup: soup)
self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
self._is_link_wanted = getattr(options, 'is_link_wanted',
default_is_link_wanted)
@@ -154,7 +154,9 @@ class RecursiveFetcher(object):
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
soup = self.prepreprocess_html_ext(soup)
replace = self.prepreprocess_html_ext(soup)
if replace is not None:
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
if self.keep_only_tags:
body = Tag(soup, 'body')