News download: Fix prepreprocess_html method

This commit is contained in:
Kovid Goyal 2010-06-12 17:08:26 -06:00
parent 0d66fe64cf
commit 7cf81e7bff
3 changed files with 11 additions and 8 deletions

View File

@ -111,7 +111,7 @@ Pre/post processing of downloaded HTML
.. automember:: BasicNewsRecipe.remove_javascript .. automember:: BasicNewsRecipe.remove_javascript
.. automethod:: BasicNewsRecipe.prepreprocess_html .. automethod:: BasicNewsRecipe.skip_ad_pages
.. automethod:: BasicNewsRecipe.preprocess_html .. automethod:: BasicNewsRecipe.preprocess_html

View File

@ -413,18 +413,19 @@ class BasicNewsRecipe(Recipe):
return url return url
return article.get('link', None) return article.get('link', None)
def prepreprocess_html(self, soup): def skip_ad_pages(self, soup):
''' '''
This method is called with the source of each downloaded :term:`HTML` file, before This method is called with the source of each downloaded :term:`HTML` file, before
any of the cleanup attributes like remove_tags, keep_only_tags are any of the cleanup attributes like remove_tags, keep_only_tags are
applied. Note that preprocess_regexps will have already been applied. applied. Note that preprocess_regexps will have already been applied.
It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`. It is meant to allow the recipe to skip ad pages. If the soup represents
It should return `soup` after processing it. an ad page, return the HTML of the real page. Otherwise return
None.
`soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ `soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
instance containing the downloaded :term:`HTML`. instance containing the downloaded :term:`HTML`.
''' '''
return soup return None
def preprocess_html(self, soup): def preprocess_html(self, soup):
@ -628,7 +629,7 @@ class BasicNewsRecipe(Recipe):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0] self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'prepreprocess_html', 'preprocess_html', 'remove_tags_after', 'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
'remove_tags_before', 'is_link_wanted'): 'remove_tags_before', 'is_link_wanted'):
setattr(self.web2disk_options, extra, getattr(self, extra)) setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = self._postprocess_html self.web2disk_options.postprocess_html = self._postprocess_html

View File

@ -136,7 +136,7 @@ class RecursiveFetcher(object):
self.remove_tags_before = getattr(options, 'remove_tags_before', None) self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', []) self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.prepreprocess_html_ext = getattr(options, 'prepreprocess_html', lambda soup: soup) self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
self.postprocess_html_ext= getattr(options, 'postprocess_html', None) self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
self._is_link_wanted = getattr(options, 'is_link_wanted', self._is_link_wanted = getattr(options, 'is_link_wanted',
default_is_link_wanted) default_is_link_wanted)
@ -154,7 +154,9 @@ class RecursiveFetcher(object):
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')) nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
soup = self.prepreprocess_html_ext(soup) replace = self.prepreprocess_html_ext(soup)
if replace is not None:
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
if self.keep_only_tags: if self.keep_only_tags:
body = Tag(soup, 'body') body = Tag(soup, 'body')