mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
News download: Add an auto_cleanup_keep variable that allows recipe writers to tell the auto cleanup to never remove a specified element
This commit is contained in:
parent
371db4901f
commit
92fdad1ef3
@ -18,6 +18,7 @@ class PeopleMag(BasicNewsRecipe):
|
|||||||
|
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
auto_cleanup = True
|
auto_cleanup = True
|
||||||
|
auto_cleanup_keep = '//div[@id="article-image"]'
|
||||||
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
|
@ -1,3 +1,8 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
import re, sys
|
import re, sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
@ -72,10 +77,15 @@ class Document:
|
|||||||
self.options[k] = v
|
self.options[k] = v
|
||||||
self.html = None
|
self.html = None
|
||||||
self.log = log
|
self.log = log
|
||||||
|
self.keep_elements = set()
|
||||||
|
|
||||||
def _html(self, force=False):
|
def _html(self, force=False):
|
||||||
if force or self.html is None:
|
if force or self.html is None:
|
||||||
self.html = self._parse(self.input)
|
self.html = self._parse(self.input)
|
||||||
|
path = self.options['keep_elements']
|
||||||
|
if path is not None:
|
||||||
|
self.keep_elements = set(self.html.xpath(path))
|
||||||
|
|
||||||
return self.html
|
return self.html
|
||||||
|
|
||||||
def _parse(self, input):
|
def _parse(self, input):
|
||||||
@ -152,8 +162,9 @@ class Document:
|
|||||||
append = False
|
append = False
|
||||||
if sibling is best_elem:
|
if sibling is best_elem:
|
||||||
append = True
|
append = True
|
||||||
sibling_key = sibling #HashableElement(sibling)
|
if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
|
||||||
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
|
append = True
|
||||||
|
if sibling in self.keep_elements:
|
||||||
append = True
|
append = True
|
||||||
|
|
||||||
if sibling.tag == "p":
|
if sibling.tag == "p":
|
||||||
@ -283,6 +294,8 @@ class Document:
|
|||||||
|
|
||||||
def remove_unlikely_candidates(self):
|
def remove_unlikely_candidates(self):
|
||||||
for elem in self.html.iter():
|
for elem in self.html.iter():
|
||||||
|
if elem in self.keep_elements:
|
||||||
|
continue
|
||||||
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
|
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
|
||||||
#self.debug(s)
|
#self.debug(s)
|
||||||
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
|
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
|
||||||
@ -337,7 +350,7 @@ class Document:
|
|||||||
allowed = {}
|
allowed = {}
|
||||||
# Conditionally clean <table>s, <ul>s, and <div>s
|
# Conditionally clean <table>s, <ul>s, and <div>s
|
||||||
for el in self.reverse_tags(node, "table", "ul", "div"):
|
for el in self.reverse_tags(node, "table", "ul", "div"):
|
||||||
if el in allowed:
|
if el in allowed or el in self.keep_elements:
|
||||||
continue
|
continue
|
||||||
weight = self.class_weight(el)
|
weight = self.class_weight(el)
|
||||||
if el in candidates:
|
if el in candidates:
|
||||||
@ -450,46 +463,17 @@ class Document:
|
|||||||
#self.debug("pname %s pweight %.3f" %(pname, pweight))
|
#self.debug("pname %s pweight %.3f" %(pname, pweight))
|
||||||
el.drop_tree()
|
el.drop_tree()
|
||||||
|
|
||||||
for el in ([node] + [n for n in node.iter()]):
|
|
||||||
if not (self.options['attributes']):
|
|
||||||
#el.attrib = {} #FIXME:Checkout the effects of disabling this
|
|
||||||
pass
|
|
||||||
|
|
||||||
return clean_attributes(tounicode(node))
|
return clean_attributes(tounicode(node))
|
||||||
|
|
||||||
|
|
||||||
class HashableElement():
|
|
||||||
def __init__(self, node):
|
|
||||||
self.node = node
|
|
||||||
self._path = None
|
|
||||||
|
|
||||||
def _get_path(self):
|
|
||||||
if self._path is None:
|
|
||||||
reverse_path = []
|
|
||||||
node = self.node
|
|
||||||
while node is not None:
|
|
||||||
node_id = (node.tag, tuple(node.attrib.items()), node.text)
|
|
||||||
reverse_path.append(node_id)
|
|
||||||
node = node.getparent()
|
|
||||||
self._path = tuple(reverse_path)
|
|
||||||
return self._path
|
|
||||||
path = property(_get_path)
|
|
||||||
|
|
||||||
def __hash__(self):
|
|
||||||
return hash(self.path)
|
|
||||||
|
|
||||||
def __eq__(self, other):
|
|
||||||
return self.path == other.path
|
|
||||||
|
|
||||||
def __getattr__(self, tag):
|
|
||||||
return getattr(self.node, tag)
|
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
from calibre.utils.config import OptionParser
|
from calibre.utils.config import OptionParser
|
||||||
parser = OptionParser(usage='%prog: [options] file')
|
parser = OptionParser(usage='%prog: [options] file')
|
||||||
parser.add_option('-v', '--verbose', default=False, action='store_true',
|
parser.add_option('-v', '--verbose', default=False, action='store_true',
|
||||||
dest='verbose',
|
dest='verbose',
|
||||||
help=_('Show detailed output information. Useful for debugging'))
|
help='Show detailed output information. Useful for debugging')
|
||||||
|
parser.add_option('-k', '--keep-elements', default=None, action='store',
|
||||||
|
dest='keep_elements',
|
||||||
|
help='XPath specifying elements that should not be removed')
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
@ -506,7 +490,12 @@ def main():
|
|||||||
raw = f.read()
|
raw = f.read()
|
||||||
|
|
||||||
enc = sys.__stdout__.encoding or 'utf-8'
|
enc = sys.__stdout__.encoding or 'utf-8'
|
||||||
print Document(raw, default_log, debug=options.verbose).summary().encode(enc, 'replace')
|
if options.verbose:
|
||||||
|
default_log.filter_level = default_log.DEBUG
|
||||||
|
print (Document(raw, default_log,
|
||||||
|
debug=options.verbose,
|
||||||
|
keep_elements=options.keep_elements).summary().encode(enc,
|
||||||
|
'replace'))
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
|
|||||||
#: manually (though manual cleanup will always be superior).
|
#: manually (though manual cleanup will always be superior).
|
||||||
auto_cleanup = False
|
auto_cleanup = False
|
||||||
|
|
||||||
|
#: Specify elements that the auto cleanup algorithm should never remove.
|
||||||
|
#: The syntax is an XPath expression. For example::
|
||||||
|
#:
|
||||||
|
#: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
|
||||||
|
#: id="article-image"
|
||||||
|
#: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
|
||||||
|
#: with class="important"
|
||||||
|
#: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
|
||||||
|
#: will keep all divs with id="article-image" and spans
|
||||||
|
#: with class="important"
|
||||||
|
auto_cleanup_keep = None
|
||||||
|
|
||||||
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
|
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
|
||||||
#: It will be inserted into `<style>` tags, just before the closing
|
#: It will be inserted into `<style>` tags, just before the closing
|
||||||
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
|
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
|
||||||
@ -552,7 +564,8 @@ class BasicNewsRecipe(Recipe):
|
|||||||
from lxml.html import (fragment_fromstring, tostring,
|
from lxml.html import (fragment_fromstring, tostring,
|
||||||
document_fromstring)
|
document_fromstring)
|
||||||
|
|
||||||
doc = readability.Document(html, self.log, url=url)
|
doc = readability.Document(html, self.log, url=url,
|
||||||
|
keep_elements=self.auto_cleanup_keep)
|
||||||
article_html = doc.summary()
|
article_html = doc.summary()
|
||||||
extracted_title = doc.title()
|
extracted_title = doc.title()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user