News download: Add an auto_cleanup_keep variable that allows recipe writers to tell the auto cleanup to never remove a specified element

This commit is contained in:
Kovid Goyal 2011-09-13 19:07:53 -06:00
parent 371db4901f
commit 92fdad1ef3
3 changed files with 42 additions and 39 deletions

View File

@ -18,6 +18,7 @@ class PeopleMag(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
auto_cleanup = True auto_cleanup = True
auto_cleanup_keep = '//div[@id="article-image"]'
feeds = [ feeds = [

View File

@ -1,3 +1,8 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
import re, sys import re, sys
from collections import defaultdict from collections import defaultdict
@ -72,10 +77,15 @@ class Document:
self.options[k] = v self.options[k] = v
self.html = None self.html = None
self.log = log self.log = log
self.keep_elements = set()
def _html(self, force=False): def _html(self, force=False):
if force or self.html is None: if force or self.html is None:
self.html = self._parse(self.input) self.html = self._parse(self.input)
path = self.options['keep_elements']
if path is not None:
self.keep_elements = set(self.html.xpath(path))
return self.html return self.html
def _parse(self, input): def _parse(self, input):
@ -152,8 +162,9 @@ class Document:
append = False append = False
if sibling is best_elem: if sibling is best_elem:
append = True append = True
sibling_key = sibling #HashableElement(sibling) if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold: append = True
if sibling in self.keep_elements:
append = True append = True
if sibling.tag == "p": if sibling.tag == "p":
@ -283,6 +294,8 @@ class Document:
def remove_unlikely_candidates(self): def remove_unlikely_candidates(self):
for elem in self.html.iter(): for elem in self.html.iter():
if elem in self.keep_elements:
continue
s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
#self.debug(s) #self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body': if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
@ -337,7 +350,7 @@ class Document:
allowed = {} allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s # Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"): for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed: if el in allowed or el in self.keep_elements:
continue continue
weight = self.class_weight(el) weight = self.class_weight(el)
if el in candidates: if el in candidates:
@ -450,46 +463,17 @@ class Document:
#self.debug("pname %s pweight %.3f" %(pname, pweight)) #self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree() el.drop_tree()
for el in ([node] + [n for n in node.iter()]):
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
return clean_attributes(tounicode(node)) return clean_attributes(tounicode(node))
class HashableElement():
def __init__(self, node):
self.node = node
self._path = None
def _get_path(self):
if self._path is None:
reverse_path = []
node = self.node
while node is not None:
node_id = (node.tag, tuple(node.attrib.items()), node.text)
reverse_path.append(node_id)
node = node.getparent()
self._path = tuple(reverse_path)
return self._path
path = property(_get_path)
def __hash__(self):
return hash(self.path)
def __eq__(self, other):
return self.path == other.path
def __getattr__(self, tag):
return getattr(self.node, tag)
def option_parser(): def option_parser():
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser
parser = OptionParser(usage='%prog: [options] file') parser = OptionParser(usage='%prog: [options] file')
parser.add_option('-v', '--verbose', default=False, action='store_true', parser.add_option('-v', '--verbose', default=False, action='store_true',
dest='verbose', dest='verbose',
help=_('Show detailed output information. Useful for debugging')) help='Show detailed output information. Useful for debugging')
parser.add_option('-k', '--keep-elements', default=None, action='store',
dest='keep_elements',
help='XPath specifying elements that should not be removed')
return parser return parser
@ -506,7 +490,12 @@ def main():
raw = f.read() raw = f.read()
enc = sys.__stdout__.encoding or 'utf-8' enc = sys.__stdout__.encoding or 'utf-8'
print Document(raw, default_log, debug=options.verbose).summary().encode(enc, 'replace') if options.verbose:
default_log.filter_level = default_log.DEBUG
print (Document(raw, default_log,
debug=options.verbose,
keep_elements=options.keep_elements).summary().encode(enc,
'replace'))
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
#: manually (though manual cleanup will always be superior). #: manually (though manual cleanup will always be superior).
auto_cleanup = False auto_cleanup = False
#: Specify elements that the auto cleanup algorithm should never remove.
#: The syntax is an XPath expression. For example::
#:
#: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
#: id="article-image"
#: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
#: with class="important"
#: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
#: will keep all divs with id="article-image" and spans
#: with class="important"
auto_cleanup_keep = None
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files #: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
#: It will be inserted into `<style>` tags, just before the closing #: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overriding all :term:`CSS` except that which is #: `</head>` tag thereby overriding all :term:`CSS` except that which is
@ -552,7 +564,8 @@ class BasicNewsRecipe(Recipe):
from lxml.html import (fragment_fromstring, tostring, from lxml.html import (fragment_fromstring, tostring,
document_fromstring) document_fromstring)
doc = readability.Document(html, self.log, url=url) doc = readability.Document(html, self.log, url=url,
keep_elements=self.auto_cleanup_keep)
article_html = doc.summary() article_html = doc.summary()
extracted_title = doc.title() extracted_title = doc.title()