diff --git a/recipes/people_us_mashup.recipe b/recipes/people_us_mashup.recipe index 28c76d820c..5d820bacc0 100644 --- a/recipes/people_us_mashup.recipe +++ b/recipes/people_us_mashup.recipe @@ -18,6 +18,7 @@ class PeopleMag(BasicNewsRecipe): no_stylesheets = True auto_cleanup = True + auto_cleanup_keep = '//div[@id="article-image"]' feeds = [ diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index 8d4a23b338..028a4d6ede 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + import re, sys from collections import defaultdict @@ -72,10 +77,15 @@ class Document: self.options[k] = v self.html = None self.log = log + self.keep_elements = set() def _html(self, force=False): if force or self.html is None: self.html = self._parse(self.input) + path = self.options['keep_elements'] + if path is not None: + self.keep_elements = set(self.html.xpath(path)) + return self.html def _parse(self, input): @@ -152,8 +162,9 @@ class Document: append = False if sibling is best_elem: append = True - sibling_key = sibling #HashableElement(sibling) - if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold: + if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold: + append = True + if sibling in self.keep_elements: append = True if sibling.tag == "p": @@ -283,6 +294,8 @@ class Document: def remove_unlikely_candidates(self): for elem in self.html.iter(): + if elem in self.keep_elements: + continue s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) #self.debug(s) if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body': @@ -337,7 +350,7 @@ class Document: allowed = {} # Conditionally clean