Get rid of cssselect from the conversion pipeline

2025-07-08 18:54:09 -04:00 · 2015-02-22 14:02:42 +05:30 · 2015-02-22 14:02:42 +05:30 · 3226fb0ab4
commit 3226fb0ab4
parent 1de3706bff
4 changed files with 41 additions and 137 deletions
--- a/src/calibre/ebooks/conversion/plugins/html_input.py
+++ b/src/calibre/ebooks/conversion/plugins/html_input.py
@ -299,7 +299,7 @@ class HTMLInput(InputFormatPlugin):
            return (None, None)
        try:
            raw = open(link, 'rb').read().decode('utf-8', 'replace')
-            raw = self.oeb.css_preprocessor(raw, add_namespace=True)
+            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except:
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -924,13 +924,12 @@ class Manifest(object):
            log.raiseExceptions = False
            self.oeb.log.debug('Parsing', self.href, '...')
            data = self.oeb.decode(data)
-            data = self.oeb.css_preprocessor(data, add_namespace=True)
+            data = self.oeb.css_preprocessor(data, add_namespace=False)
            parser = CSSParser(loglevel=logging.WARNING,
                               fetcher=self.override_css_fetch or self._fetch_css,
                               log=_css_logger)
            data = parser.parseString(data, href=self.href, validate=False)
            data = resolveImports(data)
            data.namespaces['h'] = XHTML_NS
            for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                data.cssRules.remove(rule)
            return data
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -15,28 +15,23 @@ from cssutils.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
        cssproperties, CSSRule)
 from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
        cssutils_log, CSSParser, profiles, replaceUrls)
-from lxml import etree
+from calibre import force_unicode, as_unicode
 from cssselect import HTMLTranslator
 from calibre import force_unicode
 from calibre.ebooks import unit_convert
-from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, XPNSMAP, xpath, urlnormalize
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
 from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
 from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
 cssutils_log.setLevel(logging.WARN)
 _html_css_stylesheet = None
 css_to_xpath = HTMLTranslator().css_to_xpath
 def html_css_stylesheet():
    '''
    Return the parsed default HTML stylesheet, loading it on first use.

    The sheet is read from the ``templates/html.css`` resource, parsed
    without validation and stored in the module level
    ``_html_css_stylesheet`` global, so later calls return the same object.
    The ``h`` prefix is bound to the XHTML namespace on the parsed sheet.
    '''
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        # Use a context manager so the file handle is closed as soon as the
        # data has been read, rather than whenever it is garbage collected.
        with open(P('templates/html.css'), 'rb') as f:
            html_css = f.read()
        sheet = parseString(html_css, validate=False)
        sheet.namespaces['h'] = XHTML_NS
        # Publish the sheet only once it is fully set up
        _html_css_stylesheet = sheet
    return _html_css_stylesheet
 XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
 INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
                 'caption-side', 'color', 'cursor', 'direction', 'elevation',
@ -53,100 +48,6 @@ INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
 FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
                       'x-large', 'xx-large'])
 def xpath_lower_case(arg):
    '''
    Build an XPath expression that evaluates to ``arg`` with its ASCII
    uppercase letters replaced by lowercase ones, via XPath translate().
    '''
    upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    lower = 'abcdefghijklmnopqrstuvwxyz'
    template = "translate(%%s, '%s', '%s')" % (upper, lower)
    return template % arg
 is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match
 class CaseInsensitiveAttributesTranslator(HTMLTranslator):
    '''
    An HTMLTranslator whose class and id selectors are compared against
    the lowercased attribute values, making them case-insensitive.
    '''

    def xpath_class(self, class_selector):
        '''Translate a class selector, lowercasing both sides of the test.'''
        xp = self.xpath(class_selector.selector)
        if not is_non_whitespace(class_selector.class_name):
            # A class name containing whitespace can never match
            xp.add_condition('0')
            return xp
        lowered_attr = xpath_lower_case('@class')
        needle = self.xpath_literal(
            ' ' + class_selector.class_name.lower() + ' ')
        condition = (
            "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
            % ('@class', lowered_attr, needle))
        xp.add_condition(condition)
        return xp

    def xpath_hash(self, id_selector):
        '''Translate an ID selector, lowercasing both sides of the test.'''
        xp = self.xpath(id_selector.selector)
        lowered_attr = xpath_lower_case('@id')
        wanted = id_selector.id.lower()
        return self.xpath_attrib_equals(xp, lowered_attr, wanted)
 ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
 NULL_NAMESPACE_REGEX = re.compile(ur'''(name\(\) = ['"])h:''')
 def fix_namespace(raw):
    '''
    cssselect uses name() = 'h:p' to select tags for some CSS selectors (e.g.
    h|p+h|p).
    However, since for us the XHTML namespace is the default namespace (with no
    prefix), name() is the same as local-name(). So this is a hack to
    workaround the problem.
    '''
    return NULL_NAMESPACE_REGEX.sub(ur'\1', raw)
 class CSSSelector(object):
    '''
    A CSS selector compiled to an lxml XPath expression.

    Matching is first attempted with the standard translation. When that
    finds nothing, a translation that compares class and id values
    case-insensitively is tried as a fallback.
    '''

    def __init__(self, css, log=None, namespaces=XPNSMAP):
        '''
        :param css: The CSS selector text
        :param log: Optional logger, used to report a selector that cannot
                    be compiled
        :param namespaces: Prefix to namespace map passed to etree.XPath
        '''
        self.namespaces = namespaces
        self.sel = self.build_selector(css, log)
        self.css = css
        self.used_ci_sel = False

    def build_selector(self, css, log, func=css_to_xpath):
        '''
        Translate ``css`` to XPath with ``func`` and compile the result.

        :return: The compiled etree.XPath object, or None if translation or
                 compilation failed (the failure is logged when ``log`` is
                 not None).
        '''
        try:
            return etree.XPath(fix_namespace(func(css)), namespaces=self.namespaces)
        except Exception:
            if log is not None:
                log.exception('Failed to parse CSS selector: %r'%css)
        return None

    def __call__(self, node, log):
        '''
        Run the selector against ``node``.

        :param node: The node to evaluate the compiled XPath on
        :param log: Logger used to report failures and the use of the
                    case-insensitive fallback
        :return: The result of the XPath evaluation, or an empty list if the
                 selector could not be compiled or failed to run.
        '''
        if self.sel is None:
            return []
        try:
            ans = self.sel(node)
        except Exception:
            log.exception(u'Failed to run CSS selector: %s'%self.css)
            return []
        if not ans:
            # Try a case insensitive version. It is compiled lazily, once.
            if not hasattr(self, 'ci_sel'):
                self.ci_sel = self.build_selector(self.css, log, ci_css_to_xpath)
            # This check must sit outside the hasattr() test above: selector
            # objects are cached and reused, so the fallback has to run on
            # every miss, not only on the call that first compiled it.
            if self.ci_sel is not None:
                try:
                    ans = self.ci_sel(node)
                except Exception:
                    log.exception(u'Failed to run case-insensitive CSS selector: %s'%self.css)
                    return []
                if ans and not self.used_ci_sel:
                    # Warn only the first time the fallback produces matches
                    log.warn('Interpreting class and id values '
                        'case-insensitively in selector: %s'%self.css)
                    self.used_ci_sel = True
        return ans
 _selector_cache = {}
 MIN_SPACE_RE = re.compile(r' *([>~+]) *')
 def get_css_selector(raw_selector, log):
    '''
    Return the CSSSelector for ``raw_selector``, reusing a cached instance
    when one exists.

    Spaces around the ``>``, ``~`` and ``+`` combinators are removed before
    the cache lookup, so differently spaced spellings share one entry.
    '''
    normalized = MIN_SPACE_RE.sub(r'\1', raw_selector)
    selector = _selector_cache.get(normalized, None)
    if selector is not None:
        return selector
    selector = CSSSelector(normalized, log)
    _selector_cache[normalized] = selector
    return selector
 class Stylizer(object):
    STYLESHEETS = WeakKeyDictionary()
@ -195,13 +96,12 @@ class Stylizer(object):
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
-                    text = oeb.css_preprocessor(text, add_namespace=True)
+                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces['h'] = XHTML_NS
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
@ -244,10 +144,9 @@ class Stylizer(object):
        for w, x in csses.items():
            if x:
                try:
-                    text = XHTML_CSS_NAMESPACE + x
+                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
@ -275,13 +174,17 @@ class Stylizer(object):
        rules.sort()
        self.rules = rules
        self._styles = {}
-        pseudo_pat = re.compile(ur':{1,2}(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I)
+        pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)
        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
-            if fl is not None:
+            try:
-                text = text.replace(fl.group(), '')
+                matches = select(text)
-            selector = get_css_selector(text, self.oeb.log)
+            except SelectorError as err:
-            matches = selector(tree, self.logger)
+                self.log.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
@ -486,9 +389,7 @@ class Style(object):
        result = None
        if name in self._style:
            result = self._style[name]
-        if (result == 'inherit'
+        if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())):
            or (result is None and name in INHERITED
                and self._has_parent())):
            stylizer = self._stylizer
            result = stylizer.style(self._element.getparent())._get(name)
        if result is None:
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -14,12 +14,13 @@ from collections import OrderedDict
 from lxml.etree import XPath as _XPath
 from lxml import etree
 from cssselect import HTMLTranslator
 from calibre import as_unicode
 from calibre.ebooks.epub import rules
 from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
        urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
 from calibre.ebooks.oeb.polish.split import do_split
 from css_selectors import Select, SelectorError
 XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -75,9 +76,7 @@ class Split(object):
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
-            from calibre.ebooks.oeb.stylizer import fix_namespace
+            self.page_break_selectors = set()
            css_to_xpath = HTMLTranslator().css_to_xpath
            self.page_break_selectors = set([])
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
@ -87,31 +86,37 @@ class Split(object):
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
-                        self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
+                        self.page_break_selectors.add((rule.selectorText, True))
                            True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
-                        self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
+                        self.page_break_selectors.add((rule.selectorText, False))
                            False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
-        page_breaks = set([])
+        page_breaks = set()
-        for selector, before in self.page_break_selectors:
+        select = Select(item.data)
-            body = item.data.xpath('//h:body', namespaces=NAMESPACES)
+        if not self.page_break_selectors:
-            if not body:
+            return [], []
-                continue
+        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
-            for elem in selector(body[0]):
+        if not body:
-                if elem not in body:
+            return [], []
-                    elem.set('pb_before', '1' if before else '0')
+        descendants = frozenset(body[0].iterdescendants('*'))
                    page_breaks.add(elem)
-        for i, elem in enumerate(item.data.iter()):
+        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
@ -358,8 +363,7 @@ class FlowSplitter(object):
                               len(self.split_trees), size/1024.))
            else:
                self.log.debug(
-                        '\t\t\tSplit tree still too large: %d KB' %
+                        '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
                                (size/1024.))
                self.split_to_size(t)
    def find_split_point(self, root):