Add a prepreprocess_html callback to the news download code

2025-07-09 03:04:10 -04:00 · 2010-06-03 23:03:06 -06:00 · 2010-06-03 23:03:06 -06:00 · 2f9e4ee00d
commit 2f9e4ee00d
parent cc9457da00
3 changed files with 22 additions and 2 deletions
--- a/src/calibre/manual/news_recipe.rst
+++ b/src/calibre/manual/news_recipe.rst
@@ -111,6 +111,8 @@ Pre/post processing of downloaded HTML

 .. automember:: BasicNewsRecipe.remove_javascript

+.. automethod:: BasicNewsRecipe.prepreprocess_html
+
 .. automethod:: BasicNewsRecipe.preprocess_html

 .. automethod:: BasicNewsRecipe.postprocess_html
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -403,10 +403,25 @@ class BasicNewsRecipe(Recipe):
                    return url
        return article.get('link',  None)

+    def prepreprocess_html(self, soup):
+        '''
+        This method is called with the source of each downloaded :term:`HTML` file, before
+        any of the cleanup attributes like remove_tags, keep_only_tags are
+        applied. Note that preprocess_regexps will have already been applied.
+        It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
+        It should return `soup` after processing it.
+
+        `soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
+        instance containing the downloaded :term:`HTML`.
+        '''
+        return soup
+
+
    def preprocess_html(self, soup):
        '''
        This method is called with the source of each downloaded :term:`HTML` file, before
-        it is parsed for links and images.
+        it is parsed for links and images. It is called after the cleanup as
+        specified by remove_tags etc.
        It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
        It should return `soup` after processing it.

@@ -603,7 +618,7 @@ class BasicNewsRecipe(Recipe):

        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after',
+                      'prepreprocess_html', 'preprocess_html', 'remove_tags_after',
                      'remove_tags_before', 'is_link_wanted'):
            setattr(self.web2disk_options, extra, getattr(self, extra))
        self.web2disk_options.postprocess_html = self._postprocess_html
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -136,6 +136,7 @@ class RecursiveFetcher(object):
        self.remove_tags_before  = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
+        self.prepreprocess_html_ext = getattr(options, 'prepreprocess_html', lambda soup: soup)
        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
        self._is_link_wanted     = getattr(options, 'is_link_wanted',
                default_is_link_wanted)
@@ -153,6 +154,8 @@ class RecursiveFetcher(object):
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

+        soup = self.prepreprocess_html_ext(soup)
+
        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try: