Get rid of cssselect from the conversion pipeline

2025-11-22 22:43:02 -05:00 · 2015-02-22 14:02:42 +05:30 · 2015-02-22 14:02:42 +05:30 · 3226fb0ab4
commit 3226fb0ab4
parent 1de3706bff
4 changed files with 41 additions and 137 deletions
--- a/src/calibre/ebooks/conversion/plugins/html_input.py
+++ b/src/calibre/ebooks/conversion/plugins/html_input.py
@ -299,7 +299,7 @@ class HTMLInput(InputFormatPlugin):
            return (None, None)
        try:
            raw = open(link, 'rb').read().decode('utf-8', 'replace')
-            raw = self.oeb.css_preprocessor(raw, add_namespace=True)
+            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except:
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -924,13 +924,12 @@ class Manifest(object):
            log.raiseExceptions = False
            self.oeb.log.debug('Parsing', self.href, '...')
            data = self.oeb.decode(data)
-            data = self.oeb.css_preprocessor(data, add_namespace=True)
+            data = self.oeb.css_preprocessor(data, add_namespace=False)
            parser = CSSParser(loglevel=logging.WARNING,
                               fetcher=self.override_css_fetch or self._fetch_css,
                               log=_css_logger)
            data = parser.parseString(data, href=self.href, validate=False)
            data = resolveImports(data)
-            data.namespaces['h'] = XHTML_NS
            for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                data.cssRules.remove(rule)
            return data
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -15,28 +15,23 @@ from cssutils.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
        cssproperties, CSSRule)
 from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
        cssutils_log, CSSParser, profiles, replaceUrls)
-from lxml import etree
-from cssselect import HTMLTranslator
-
-from calibre import force_unicode
+from calibre import force_unicode, as_unicode
 from calibre.ebooks import unit_convert
-from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, XPNSMAP, xpath, urlnormalize
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
 from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
+from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES

 cssutils_log.setLevel(logging.WARN)

 _html_css_stylesheet = None
-css_to_xpath = HTMLTranslator().css_to_xpath

 def html_css_stylesheet():
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        html_css = open(P('templates/html.css'), 'rb').read()
        _html_css_stylesheet = parseString(html_css, validate=False)
-        _html_css_stylesheet.namespaces['h'] = XHTML_NS
    return _html_css_stylesheet

-XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS

 INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
                 'caption-side', 'color', 'cursor', 'direction', 'elevation',
@ -53,100 +48,6 @@ INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
 FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
                       'x-large', 'xx-large'])

-def xpath_lower_case(arg):
-    'An ASCII lowercase function for XPath'
-    return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
-            "'abcdefghijklmnopqrstuvwxyz')")%arg
-is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match
-
-class CaseInsensitiveAttributesTranslator(HTMLTranslator):
-    'Treat class and id CSS selectors case-insensitively'
-
-    def xpath_class(self, class_selector):
-        """Translate a class selector."""
-        x = self.xpath(class_selector.selector)
-        if is_non_whitespace(class_selector.class_name):
-            x.add_condition(
-                "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
-                % ('@class', xpath_lower_case('@class'), self.xpath_literal(
-                    ' '+class_selector.class_name.lower()+' ')))
-        else:
-            x.add_condition('0')
-        return x
-
-    def xpath_hash(self, id_selector):
-        """Translate an ID selector."""
-        x = self.xpath(id_selector.selector)
-        return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
-                (id_selector.id.lower()))
-
-ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
-
-NULL_NAMESPACE_REGEX = re.compile(ur'''(name\(\) = ['"])h:''')
-def fix_namespace(raw):
-    '''
-    cssselect uses name() = 'h:p' to select tags for some CSS selectors (e.g.
-    h|p+h|p).
-    However, since for us the XHTML namespace is the default namespace (with no
-    prefix), name() is the same as local-name(). So this is a hack to
-    workaround the problem.
-    '''
-    return NULL_NAMESPACE_REGEX.sub(ur'\1', raw)
-
-class CSSSelector(object):
-
-    def __init__(self, css, log=None, namespaces=XPNSMAP):
-        self.namespaces = namespaces
-        self.sel = self.build_selector(css, log)
-        self.css = css
-        self.used_ci_sel = False
-
-    def build_selector(self, css, log, func=css_to_xpath):
-        try:
-            return etree.XPath(fix_namespace(func(css)), namespaces=self.namespaces)
-        except:
-            if log is not None:
-                log.exception('Failed to parse CSS selector: %r'%css)
-        return None
-
-    def __call__(self, node, log):
-        if self.sel is None:
-            return []
-        try:
-            ans = self.sel(node)
-        except:
-            log.exception(u'Failed to run CSS selector: %s'%self.css)
-            return []
-
-        if not ans:
-            # Try a case insensitive version
-            if not hasattr(self, 'ci_sel'):
-                self.ci_sel = self.build_selector(self.css, log, ci_css_to_xpath)
-                if self.ci_sel is not None:
-                    try:
-                        ans = self.ci_sel(node)
-                    except:
-                        log.exception(u'Failed to run case-insensitive CSS selector: %s'%self.css)
-                        return []
-                    if ans:
-                        if not self.used_ci_sel:
-                            log.warn('Interpreting class and id values '
-                                'case-insensitively in selector: %s'%self.css)
-                        self.used_ci_sel = True
-        return ans
-
-_selector_cache = {}
-
-MIN_SPACE_RE = re.compile(r' *([>~+]) *')
-
-def get_css_selector(raw_selector, log):
-    css = MIN_SPACE_RE.sub(r'\1', raw_selector)
-    ans = _selector_cache.get(css, None)
-    if ans is None:
-        ans = CSSSelector(css, log)
-        _selector_cache[css] = ans
-    return ans
-
 class Stylizer(object):
    STYLESHEETS = WeakKeyDictionary()

@ -195,13 +96,12 @@ class Stylizer(object):
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
-                    text = oeb.css_preprocessor(text, add_namespace=True)
+                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
-                    stylesheet.namespaces['h'] = XHTML_NS
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
@ -244,10 +144,9 @@ class Stylizer(object):
        for w, x in csses.items():
            if x:
                try:
-                    text = XHTML_CSS_NAMESPACE + x
+                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
-                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
@ -275,13 +174,17 @@ class Stylizer(object):
        rules.sort()
        self.rules = rules
        self._styles = {}
-        pseudo_pat = re.compile(ur':{1,2}(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I)
+        pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
+        select = Select(tree, ignore_inappropriate_pseudo_classes=True)
+
        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
-            if fl is not None:
-                text = text.replace(fl.group(), '')
-            selector = get_css_selector(text, self.oeb.log)
-            matches = selector(tree, self.logger)
+            try:
+                matches = select(text)
+            except SelectorError as err:  # invalid selector: report it through self.logger (the logger Stylizer uses for its other diagnostics) and skip the rule
+                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
+                continue
+
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
@ -486,9 +389,7 @@ class Style(object):
        result = None
        if name in self._style:
            result = self._style[name]
-        if (result == 'inherit'
-            or (result is None and name in INHERITED
-                and self._has_parent())):
+        if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())):
            stylizer = self._stylizer
            result = stylizer.style(self._element.getparent())._get(name)
        if result is None:
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -14,12 +14,13 @@ from collections import OrderedDict

 from lxml.etree import XPath as _XPath
 from lxml import etree
-from cssselect import HTMLTranslator

+from calibre import as_unicode
 from calibre.ebooks.epub import rules
 from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
        urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
 from calibre.ebooks.oeb.polish.split import do_split
+from css_selectors import Select, SelectorError

 XPath = functools.partial(_XPath, namespaces=NAMESPACES)

@ -75,9 +76,7 @@ class Split(object):

    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
-            from calibre.ebooks.oeb.stylizer import fix_namespace
-            css_to_xpath = HTMLTranslator().css_to_xpath
-            self.page_break_selectors = set([])
+            self.page_break_selectors = set()
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
@ -87,31 +86,37 @@ class Split(object):
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
-                        self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
-                            True))
+                        self.page_break_selectors.add((rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
-                        self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
-                            False))
+                        self.page_break_selectors.add((rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
-        page_breaks = set([])
-        for selector, before in self.page_break_selectors:
+        page_breaks = set()
+        select = Select(item.data)
+        if not self.page_break_selectors:
+            return [], []
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
-                continue
-            for elem in selector(body[0]):
-                if elem not in body:
+            return [], []
+        descendants = frozenset(body[0].iterdescendants('*'))
+
+        for selector, before in self.page_break_selectors:
+            try:
+                for elem in select(selector):
+                    if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
+            except SelectorError as err:
+                self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))

-        for i, elem in enumerate(item.data.iter()):
+        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
@ -358,8 +363,7 @@ class FlowSplitter(object):
                               len(self.split_trees), size/1024.))
            else:
                self.log.debug(
-                        '\t\t\tSplit tree still too large: %d KB' %
-                                (size/1024.))
+                        '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
                self.split_to_size(t)

    def find_split_point(self, root):