Get rid of cssselect from Edit Book

2025-12-10 23:25:01 -05:00 · 2015-02-22 09:11:03 +05:30 · 2015-02-22 09:11:03 +05:30 · 8f6f60bca2
commit 8f6f60bca2
parent 0c4e86dcd1
3 changed files with 21 additions and 143 deletions
--- a/src/calibre/ebooks/oeb/polish/css.py
+++ b/src/calibre/ebooks/oeb/polish/css.py
@@ -6,122 +6,26 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

-import re
-
-from lxml import etree
 from cssutils.css import CSSRule
-from cssselect import HTMLTranslator, parse
-from cssselect.xpath import XPathExpr, is_safe_name
-from cssselect.parser import SelectorSyntaxError
+from css_selectors import parse, SelectorSyntaxError

 from calibre import force_unicode
-from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XPNSMAP, XHTML_NS
+from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS
 from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
-from calibre.ebooks.oeb.stylizer import MIN_SPACE_RE, is_non_whitespace, xpath_lower_case, fix_namespace
 from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style
+from css_selectors import Select

-class NamespacedTranslator(HTMLTranslator):

-    def xpath_element(self, selector):
-        element = selector.element
-        if not element:
-            element = '*'
-            safe = True
-        else:
-            safe = is_safe_name(element)
-            if safe:
-                # We use the h: prefix for the XHTML namespace
-                element = 'h:%s' % element.lower()
-        xpath = XPathExpr(element=element)
-        if not safe:
-            xpath.add_name_test()
-        return xpath
-
-class CaseInsensitiveAttributesTranslator(NamespacedTranslator):
-    'Treat class and id CSS selectors case-insensitively'
-
-    def xpath_class(self, class_selector):
-        """Translate a class selector."""
-        x = self.xpath(class_selector.selector)
-        if is_non_whitespace(class_selector.class_name):
-            x.add_condition(
-                "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
-                % ('@class', xpath_lower_case('@class'), self.xpath_literal(
-                    ' '+class_selector.class_name.lower()+' ')))
-        else:
-            x.add_condition('0')
-        return x
-
-    def xpath_hash(self, id_selector):
-        """Translate an ID selector."""
-        x = self.xpath(id_selector.selector)
-        return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
-                (id_selector.id.lower()))
-
-css_to_xpath = NamespacedTranslator().css_to_xpath
-ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
-
-def build_selector(text, case_sensitive=True):
-    func = css_to_xpath if case_sensitive else ci_css_to_xpath
-    try:
-        return etree.XPath(fix_namespace(func(text)), namespaces=XPNSMAP)
-    except Exception:
-        return None
-
-PSEUDO_PAT = r':(first-letter|first-line|link|hover|visited|active|focus|before|after)'
-
-def is_rule_used(root, selector, log, pseudo_pat, cache):
-    selector = pseudo_pat.sub('', selector)
-    selector = MIN_SPACE_RE.sub(r'\1', selector)
-    try:
-        xp = cache[(True, selector)]
-    except KeyError:
-        xp = cache[(True, selector)] = build_selector(selector)
-    try:
-        if xp(root):
-            return True
-    except Exception:
-        return True
-
-    # See if interpreting class and id selectors case-insensitively gives us
-    # matches. Strictly speaking, class and id selectors should be case
-    # sensitive for XHTML, but we err on the side of caution and not remove
-    # them, since case sensitivity depends on whether the html is rendered in
-    # quirks mode or not.
-    try:
-        xp = cache[(False, selector)]
-    except KeyError:
-        xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
-    try:
-        return bool(xp(root))
-    except Exception:
-        return True
-
-def filter_used_rules(root, rules, log, pseudo_pat, cache):
+def filter_used_rules(rules, log, select):
    for rule in rules:
        used = False
        for selector in rule.selectorList:
-            text = selector.selectorText
-            if is_rule_used(root, text, log, pseudo_pat, cache):
+            if select.has_matches(selector.selectorText):
                used = True
                break
        if not used:
            yield rule

-def process_namespaces(sheet):
-    # Find the namespace prefix (if any) for the XHTML namespace, so that we
-    # can preserve it after processing
-    for prefix in sheet.namespaces:
-        if sheet.namespaces[prefix] == XHTML_NS:
-            return prefix
-
-def preserve_htmlns_prefix(sheet, prefix):
-    if prefix is None:
-        while 'h' in sheet.namespaces:
-            del sheet.namespaces['h']
-    else:
-        sheet.namespaces[prefix] = XHTML_NS
-
 def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
    ans = set()
    sheet = sheet or sheets[name]
@@ -155,20 +59,15 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
    import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
    if remove_unused_classes:
        class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in sheets.iteritems()}
-    sheet_namespace = {}
-    for sheet in sheets.itervalues():
-        sheet_namespace[sheet] = process_namespaces(sheet)
-        sheet.namespaces['h'] = XHTML_NS
    style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in sheets.iteritems()}

    num_of_removed_rules = num_of_removed_classes = 0
-    pseudo_pat = re.compile(PSEUDO_PAT, re.I)
-    cache = {}

    for name, mt in container.mime_map.iteritems():
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
+        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
@@ -177,17 +76,14 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
                    used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
                imports = get_imported_sheets(name, container, sheets, sheet=sheet)
                for imported_sheet in imports:
-                    style_rules[imported_sheet] = tuple(filter_used_rules(root, style_rules[imported_sheet], container.log, pseudo_pat, cache))
+                    style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
-                ns = process_namespaces(sheet)
-                sheet.namespaces['h'] = XHTML_NS
                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
-                unused_rules = tuple(filter_used_rules(root, rules, container.log, pseudo_pat, cache))
+                unused_rules = tuple(filter_used_rules(rules, container.log, select))
                if unused_rules:
                    num_of_removed_rules += len(unused_rules)
                    [sheet.cssRules.remove(r) for r in unused_rules]
-                    preserve_htmlns_prefix(sheet, ns)
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)
@@ -196,12 +92,12 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
-            style_rules[sname] = tuple(filter_used_rules(root, style_rules[sname], container.log, pseudo_pat, cache))
+            style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
            if remove_unused_classes:
                used_classes |= class_map[sname]

            for iname in import_map[sname]:
-                style_rules[iname] = tuple(filter_used_rules(root, style_rules[iname], container.log, pseudo_pat, cache))
+                style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
                if remove_unused_classes:
                    used_classes |= class_map[iname]

@@ -220,7 +116,6 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
                    container.dirty(name)

    for name, sheet in sheets.iteritems():
-        preserve_htmlns_prefix(sheet, sheet_namespace[sheet])
        unused_rules = style_rules[name]
        if unused_rules:
            num_of_removed_rules += len(unused_rules)
--- a/src/calibre/ebooks/oeb/polish/report.py
+++ b/src/calibre/ebooks/oeb/polish/report.py
@@ -6,17 +6,17 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'

-import posixpath, os, time, types, re
+import posixpath, os, time, types
 from collections import namedtuple, defaultdict, Counter
 from itertools import chain

 from calibre import prepare_string_for_xml, force_unicode
 from calibre.ebooks.oeb.base import XPath, xml2text
 from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
-from calibre.ebooks.oeb.polish.css import build_selector, PSEUDO_PAT, MIN_SPACE_RE
 from calibre.ebooks.oeb.polish.spell import get_all_words
 from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr
 from calibre.utils.magick.draw import identify
+from css_selectors import Select, SelectorError

 File = namedtuple('File', 'name dir basename size category')

@@ -255,8 +255,6 @@ def css_data(container, book_locale, result_data, *args):
                        css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1))

    rule_map = defaultdict(lambda : defaultdict(list))
-    pseudo_pat = re.compile(PSEUDO_PAT, re.I)
-    cache = {}

    def rules_in_sheet(sheet):
        for rule in sheet:
@@ -285,28 +283,12 @@ def css_data(container, book_locale, result_data, *args):
                return '<%s %s>' % (tag, attribs)
            ans = tt_cache[elem] = '<%s>' % tag

-    def matches_for_selector(selector, root, class_map, rule):
-        selector = pseudo_pat.sub('', selector)
-        selector = MIN_SPACE_RE.sub(r'\1', selector)
-        try:
-            xp = cache[(True, selector)]
-        except KeyError:
-            xp = cache[(True, selector)] = build_selector(selector)
-
-        try:
-            matches = xp(root)
-        except Exception:
-            return ()
-        if not matches:
-            try:
-                xp = cache[(False, selector)]
-            except KeyError:
-                xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
-            try:
-                matches = xp(root)
-            except Exception:
-                return ()
+    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
+        try:
+            matches = tuple(select(selector))
+        except SelectorError:
+            return ()
        for elem in matches:
            for cls in elem.get('class', '').split():
                if '.' + cls.lower() in lsel:
@@ -322,9 +304,10 @@ def css_data(container, book_locale, result_data, *args):
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
+        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
-                rule_map[rule][name].extend(matches_for_selector(rule.selector, root, cmap, rule))
+                rule_map[rule][name].extend(matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in cmap.iteritems():
            class_elements = class_map[cls][name]
            for elem, usage in elem_map.iteritems():
--- a/src/calibre/gui2/tweak_book/live_css.py
+++ b/src/calibre/gui2/tweak_book/live_css.py
@@ -8,7 +8,6 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

 import json

-from cssselect import parse
 from PyQt5.Qt import (
    QWidget, QTimer, QStackedLayout, QLabel, QScrollArea, QVBoxLayout,
    QPainter, Qt, QPalette, QRect, QSize, QSizePolicy, pyqtSignal,
@@ -18,6 +17,7 @@ from calibre.constants import iswindows
 from calibre.gui2.tweak_book import editors, actions, current_container, tprefs
 from calibre.gui2.tweak_book.editor.themes import get_theme, theme_color
 from calibre.gui2.tweak_book.editor.text import default_font_family
+from css_selectors import parse, SelectorError

 class Heading(QWidget):  # {{{

@@ -434,7 +434,7 @@ class LiveCSS(QWidget):
        if selector is not None:
            try:
                specificity = [0] + list(parse(selector)[0].specificity())
-            except (AttributeError, TypeError):
+            except (AttributeError, TypeError, SelectorError):
                specificity = [0, 0, 0, 0]
        else:  # style attribute
            specificity = [1, 0, 0, 0]