diff --git a/src/calibre/ebooks/conversion/plugins/html_input.py b/src/calibre/ebooks/conversion/plugins/html_input.py index 5d415a5e69..f56c21c48d 100644 --- a/src/calibre/ebooks/conversion/plugins/html_input.py +++ b/src/calibre/ebooks/conversion/plugins/html_input.py @@ -299,7 +299,7 @@ class HTMLInput(InputFormatPlugin): return (None, None) try: raw = open(link, 'rb').read().decode('utf-8', 'replace') - raw = self.oeb.css_preprocessor(raw, add_namespace=True) + raw = self.oeb.css_preprocessor(raw, add_namespace=False) except: self.log.exception('Failed to read CSS file: %r'%link) return (None, None) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index afb7e16a0c..158aa4b857 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -924,13 +924,12 @@ class Manifest(object): log.raiseExceptions = False self.oeb.log.debug('Parsing', self.href, '...') data = self.oeb.decode(data) - data = self.oeb.css_preprocessor(data, add_namespace=True) + data = self.oeb.css_preprocessor(data, add_namespace=False) parser = CSSParser(loglevel=logging.WARNING, fetcher=self.override_css_fetch or self._fetch_css, log=_css_logger) data = parser.parseString(data, href=self.href, validate=False) data = resolveImports(data) - data.namespaces['h'] = XHTML_NS for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)): data.cssRules.remove(rule) return data diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 2c31f7f524..fc9c1e63ec 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -15,28 +15,23 @@ from cssutils.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule, cssproperties, CSSRule) from cssutils import (profile as cssprofiles, parseString, parseStyle, log as cssutils_log, CSSParser, profiles, replaceUrls) -from lxml import etree -from cssselect import HTMLTranslator - -from calibre import force_unicode +from calibre import force_unicode, as_unicode from calibre.ebooks import unit_convert -from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, XPNSMAP, xpath, urlnormalize +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers +from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES cssutils_log.setLevel(logging.WARN) _html_css_stylesheet = None -css_to_xpath = HTMLTranslator().css_to_xpath def html_css_stylesheet(): global _html_css_stylesheet if _html_css_stylesheet is None: html_css = open(P('templates/html.css'), 'rb').read() _html_css_stylesheet = parseString(html_css, validate=False) - _html_css_stylesheet.namespaces['h'] = XHTML_NS return _html_css_stylesheet -XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS INHERITED = set(['azimuth', 'border-collapse', 'border-spacing', 'caption-side', 'color', 'cursor', 'direction', 'elevation', @@ -53,100 +48,6 @@ INHERITED = set(['azimuth', 'border-collapse', 'border-spacing', FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large']) -def xpath_lower_case(arg): - 'An ASCII lowercase function for XPath' - return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " - "'abcdefghijklmnopqrstuvwxyz')")%arg -is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match - -class CaseInsensitiveAttributesTranslator(HTMLTranslator): - 'Treat class and id CSS selectors case-insensitively' - - def xpath_class(self, class_selector): - """Translate a class selector.""" - x = self.xpath(class_selector.selector) - if is_non_whitespace(class_selector.class_name): - x.add_condition( - "%s and contains(concat(' ', normalize-space(%s), ' '), %s)" - % ('@class', xpath_lower_case('@class'), self.xpath_literal( - ' '+class_selector.class_name.lower()+' '))) - else: - x.add_condition('0') - return x - - def xpath_hash(self, id_selector): - """Translate an ID selector.""" - x = self.xpath(id_selector.selector) - return self.xpath_attrib_equals(x, xpath_lower_case('@id'), - (id_selector.id.lower())) - -ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath - -NULL_NAMESPACE_REGEX = re.compile(ur'''(name\(\) = ['"])h:''') -def fix_namespace(raw): - ''' - cssselect uses name() = 'h:p' to select tags for some CSS selectors (e.g. - h|p+h|p). - However, since for us the XHTML namespace is the default namespace (with no - prefix), name() is the same as local-name(). So this is a hack to - workaround the problem. - ''' - return NULL_NAMESPACE_REGEX.sub(ur'\1', raw) - -class CSSSelector(object): - - def __init__(self, css, log=None, namespaces=XPNSMAP): - self.namespaces = namespaces - self.sel = self.build_selector(css, log) - self.css = css - self.used_ci_sel = False - - def build_selector(self, css, log, func=css_to_xpath): - try: - return etree.XPath(fix_namespace(func(css)), namespaces=self.namespaces) - except: - if log is not None: - log.exception('Failed to parse CSS selector: %r'%css) - return None - - def __call__(self, node, log): - if self.sel is None: - return [] - try: - ans = self.sel(node) - except: - log.exception(u'Failed to run CSS selector: %s'%self.css) - return [] - - if not ans: - # Try a case insensitive version - if not hasattr(self, 'ci_sel'): - self.ci_sel = self.build_selector(self.css, log, ci_css_to_xpath) - if self.ci_sel is not None: - try: - ans = self.ci_sel(node) - except: - log.exception(u'Failed to run case-insensitive CSS selector: %s'%self.css) - return [] - if ans: - if not self.used_ci_sel: - log.warn('Interpreting class and id values ' - 'case-insensitively in selector: %s'%self.css) - self.used_ci_sel = True - return ans - -_selector_cache = {} - -MIN_SPACE_RE = re.compile(r' *([>~+]) *') - -def get_css_selector(raw_selector, log): - css = MIN_SPACE_RE.sub(r'\1', raw_selector) - ans = _selector_cache.get(css, None) - if ans is None: - ans = CSSSelector(css, log) - _selector_cache[css] = ans - return ans - class Stylizer(object): STYLESHEETS = WeakKeyDictionary() @@ -195,13 +96,12 @@ class Stylizer(object): if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: - text = oeb.css_preprocessor(text, add_namespace=True) + text = oeb.css_preprocessor(text) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) - stylesheet.namespaces['h'] = XHTML_NS for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) @@ -244,10 +144,9 @@ class Stylizer(object): for w, x in csses.items(): if x: try: - text = XHTML_CSS_NAMESPACE + x + text = x stylesheet = parser.parseString(text, href=cssname, validate=False) - stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) @@ -275,13 +174,17 @@ class Stylizer(object): rules.sort() self.rules = rules self._styles = {} - pseudo_pat = re.compile(ur':{1,2}(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I) + pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I) + select = Select(tree, ignore_inappropriate_pseudo_classes=True) + for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) - if fl is not None: - text = text.replace(fl.group(), '') - selector = get_css_selector(text, self.oeb.log) - matches = selector(tree, self.logger) + try: + matches = select(text) + except SelectorError as err: + self.log.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err))) + continue + if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, @@ -486,9 +389,7 @@ class Style(object): result = None if name in self._style: result = self._style[name] - if (result == 'inherit' - or (result is None and name in INHERITED - and self._has_parent())): + if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())): stylizer = self._stylizer result = stylizer.style(self._element.getparent())._get(name) if result is None: diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 512215e6d9..8576d187ba 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -14,12 +14,13 @@ from collections import OrderedDict from lxml.etree import XPath as _XPath from lxml import etree -from cssselect import HTMLTranslator +from calibre import as_unicode from calibre.ebooks.epub import rules from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize) from calibre.ebooks.oeb.polish.split import do_split +from css_selectors import Select, SelectorError XPath = functools.partial(_XPath, namespaces=NAMESPACES) @@ -75,9 +76,7 @@ class Split(object): def find_page_breaks(self, item): if self.page_break_selectors is None: - from calibre.ebooks.oeb.stylizer import fix_namespace - css_to_xpath = HTMLTranslator().css_to_xpath - self.page_break_selectors = set([]) + self.page_break_selectors = set() stylesheets = [x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES] for rule in rules(stylesheets): @@ -87,31 +86,37 @@ class Split(object): 'page-break-after'), 'cssText', '').strip().lower() try: if before and before not in {'avoid', 'auto', 'inherit'}: - self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))), - True)) + self.page_break_selectors.add((rule.selectorText, True)) if self.remove_css_pagebreaks: rule.style.removeProperty('page-break-before') except: pass try: if after and after not in {'avoid', 'auto', 'inherit'}: - self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))), - False)) + self.page_break_selectors.add((rule.selectorText, False)) if self.remove_css_pagebreaks: rule.style.removeProperty('page-break-after') except: pass - page_breaks = set([]) - for selector, before in self.page_break_selectors: - body = item.data.xpath('//h:body', namespaces=NAMESPACES) - if not body: - continue - for elem in selector(body[0]): - if elem not in body: - elem.set('pb_before', '1' if before else '0') - page_breaks.add(elem) + page_breaks = set() + select = Select(item.data) + if not self.page_break_selectors: + return [], [] + body = item.data.xpath('//h:body', namespaces=NAMESPACES) + if not body: + return [], [] + descendants = frozenset(body[0].iterdescendants('*')) - for i, elem in enumerate(item.data.iter()): + for selector, before in self.page_break_selectors: + try: + for elem in select(selector): + if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}: + elem.set('pb_before', '1' if before else '0') + page_breaks.add(elem) + except SelectorError as err: + self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err))) + + for i, elem in enumerate(item.data.iter('*')): try: elem.set('pb_order', str(i)) except TypeError: # Cant set attributes on comment nodes etc. @@ -358,8 +363,7 @@ class FlowSplitter(object): len(self.split_trees), size/1024.)) else: self.log.debug( - '\t\t\tSplit tree still too large: %d KB' % - (size/1024.)) + '\t\t\tSplit tree still too large: %d KB' % (size/1024.)) self.split_to_size(t) def find_split_point(self, root):