News download: Add an auto_cleanup_keep variable that allows recipe writers to tell the auto cleanup to never remove a specified element

2025-11-26 16:25:02 -05:00 · 2011-09-13 19:07:53 -06:00 · 2011-09-13 19:07:53 -06:00 · 92fdad1ef3
commit 92fdad1ef3
parent 371db4901f
3 changed files with 42 additions and 39 deletions
--- a/recipes/people_us_mashup.recipe
+++ b/recipes/people_us_mashup.recipe
@ -18,6 +18,7 @@ class PeopleMag(BasicNewsRecipe):

    no_stylesheets = True
    auto_cleanup = True
+    auto_cleanup_keep = '//div[@id="article-image"]'


    feeds = [
--- a/src/calibre/ebooks/readability/readability.py
+++ b/src/calibre/ebooks/readability/readability.py
@ -1,3 +1,8 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
 import re, sys
 from collections import defaultdict

@ -72,10 +77,15 @@ class Document:
            self.options[k] = v
        self.html = None
        self.log = log
+        self.keep_elements = set()

    def _html(self, force=False):
        if force or self.html is None:
            self.html = self._parse(self.input)
+            path = self.options['keep_elements']
+            if path is not None:
+                self.keep_elements = set(self.html.xpath(path))
+
        return self.html

    def _parse(self, input):
@ -152,8 +162,9 @@ class Document:
            append = False
            if sibling is best_elem:
                append = True
-            sibling_key = sibling #HashableElement(sibling)
-            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
+                append = True
+            if sibling in self.keep_elements:
                append = True

            if sibling.tag == "p":
@ -283,6 +294,8 @@ class Document:

    def remove_unlikely_candidates(self):
        for elem in self.html.iter():
+            if elem in self.keep_elements:
+                continue
            s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
            #self.debug(s)
            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
@ -337,7 +350,7 @@ class Document:
        allowed = {}
        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self.reverse_tags(node, "table", "ul", "div"):
-            if el in allowed:
+            if el in allowed or el in self.keep_elements:
                continue
            weight = self.class_weight(el)
            if el in candidates:
@ -450,46 +463,17 @@ class Document:
                    #self.debug("pname %s pweight %.3f" %(pname, pweight))
                    el.drop_tree()

-        for el in ([node] + [n for n in node.iter()]):
-            if not (self.options['attributes']):
-                #el.attrib = {} #FIXME:Checkout the effects of disabling this
-                pass
-
        return clean_attributes(tounicode(node))

-
-class HashableElement():
-    def __init__(self, node):
-        self.node = node
-        self._path = None
-
-    def _get_path(self):
-        if self._path is None:
-            reverse_path = []
-            node = self.node
-            while node is not None:
-                node_id = (node.tag, tuple(node.attrib.items()), node.text)
-                reverse_path.append(node_id)
-                node = node.getparent()
-            self._path = tuple(reverse_path)
-        return self._path
-    path = property(_get_path)
-
-    def __hash__(self):
-        return hash(self.path)
-
-    def __eq__(self, other):
-        return self.path == other.path
-
-    def __getattr__(self, tag):
-        return getattr(self.node, tag)
-
 def option_parser():
    from calibre.utils.config import OptionParser
    parser = OptionParser(usage='%prog: [options] file')
    parser.add_option('-v', '--verbose', default=False, action='store_true',
-                      dest='verbose',
-                      help=_('Show detailed output information. Useful for debugging'))
+            dest='verbose',
+            help='Show detailed output information. Useful for debugging')
+    parser.add_option('-k', '--keep-elements', default=None, action='store',
+            dest='keep_elements',
+            help='XPath specifying elements that should not be removed')

    return parser

@ -506,7 +490,12 @@ def main():
        raw = f.read()

    enc = sys.__stdout__.encoding or 'utf-8'
-    print Document(raw, default_log, debug=options.verbose).summary().encode(enc, 'replace')
+    if options.verbose:
+        default_log.filter_level = default_log.DEBUG
+    print (Document(raw, default_log,
+            debug=options.verbose,
+            keep_elements=options.keep_elements).summary().encode(enc,
+                'replace'))

 if __name__ == '__main__':
    main()
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
    #: manually (though manual cleanup will always be superior).
    auto_cleanup = False

+    #: Specify elements that the auto cleanup algorithm should never remove.
+    #: The syntax is an XPath expression. For example::
+    #:
+    #: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
+    #:                                                  id="article-image"
+    #: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
+    #:                                               with class="important"
+    #: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
+    #:                     will keep all divs with id="article-image" and spans
+    #:                     with class="important"
+    auto_cleanup_keep = None
+
    #: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
    #: It will be inserted into `<style>` tags, just before the closing
    #: `</head>` tag thereby overriding all :term:`CSS` except that which is
@ -552,7 +564,8 @@ class BasicNewsRecipe(Recipe):
        from lxml.html import (fragment_fromstring, tostring,
                document_fromstring)

-        doc = readability.Document(html, self.log, url=url)
+        doc = readability.Document(html, self.log, url=url,
+                keep_elements=self.auto_cleanup_keep)
        article_html = doc.summary()
        extracted_title = doc.title()