Conversion: Add support for CSS 3 selectors by switching to the new cssselect module which was extracted from the lxml package and further developed

2025-07-09 03:04:10 -04:00 · 2012-10-18 14:30:51 +05:30 · 2012-10-18 14:30:51 +05:30 · c0a35f9553
commit c0a35f9553
parent ac659e7ede
6 changed files with 77 additions and 1067 deletions
--- a/setup/installer/linux/freeze2.py
+++ b/setup/installer/linux/freeze2.py
@ -16,7 +16,7 @@ SITE_PACKAGES = ['PIL', 'dateutil', 'dns', 'PyQt4', 'mechanize',
        'sip.so', 'BeautifulSoup.py', 'cssutils', 'encutils', 'lxml',
        'sipconfig.py', 'xdg', 'dbus', '_dbus_bindings.so', 'dbus_bindings.py',
        '_dbus_glib_bindings.so', 'netifaces.so', '_psutil_posix.so',
-        '_psutil_linux.so', 'psutil']
+        '_psutil_linux.so', 'psutil', 'cssselect']

 QTDIR          = '/usr/lib/qt4'
 QTDLLS         = ('QtCore', 'QtGui', 'QtNetwork', 'QtSvg', 'QtXml', 'QtWebKit', 'QtDBus')
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@ -30,7 +30,7 @@ If there are no windows binaries already compiled for the version of python you

 Run the following command to install python dependencies::

-    easy_install --always-unzip -U mechanize pyreadline python-dateutil dnspython cssutils clientform pycrypto
+    easy_install --always-unzip -U mechanize pyreadline python-dateutil dnspython cssutils clientform pycrypto cssselect

 Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly)

--- a/src/calibre/ebooks/cssselect.py
+++ b/src/calibre/ebooks/cssselect.py
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -20,16 +20,17 @@ except ImportError:
 from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
        cssutils_log, CSSParser, profiles, replaceUrls)
 from lxml import etree
-from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError
+from cssselect import HTMLTranslator
+
 from calibre import force_unicode
 from calibre.ebooks import unit_convert
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
 from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
-from calibre.ebooks.cssselect import css_to_xpath_no_case

 cssutils_log.setLevel(logging.WARN)

 _html_css_stylesheet = None
+css_to_xpath = HTMLTranslator().css_to_xpath

 def html_css_stylesheet():
    global _html_css_stylesheet
@ -96,70 +97,86 @@ DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll',
 FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
                       'x-large', 'xx-large'])

+def xpath_lower_case(arg):
+    'An ASCII lowercase function for XPath'
+    return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
+            "'abcdefghijklmnopqrstuvwxyz')")%arg
+is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match
+
+class CaseInsensitiveAttributesTranslator(HTMLTranslator):
+    'Treat class and id CSS selectors case-insensitively'
+
+    def xpath_class(self, class_selector):
+        """Translate a class selector."""
+        x = self.xpath(class_selector.selector)
+        if is_non_whitespace(class_selector.class_name):
+            x.add_condition(
+                "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
+                % ('@class', xpath_lower_case('@class'), self.xpath_literal(
+                    ' '+class_selector.class_name.lower()+' ')))
+        else:
+            x.add_condition('0')
+        return x
+
+    def xpath_hash(self, id_selector):
+        """Translate an ID selector."""
+        x = self.xpath(id_selector.selector)
+        return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
+                (id_selector.id.lower()))
+
+ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath

 class CSSSelector(object):

-    LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:")
-
-    def __init__(self, css, namespaces=XPNSMAP):
-        if isinstance(css, unicode):
-            # Workaround for bug in lxml on windows/OS X that causes a massive
-            # memory leak with non ASCII selectors
-            css = css.encode('ascii', 'ignore').decode('ascii')
-        try:
-            path = self.LOCAL_NAME_RE.sub(r"local-name() = '", css_to_xpath(css))
-            self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces)
-        except:
-            self.sel1 = lambda x: []
-        try:
-            path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
-                    css_to_xpath_no_case(css))
-            self.sel2 = etree.XPath(path, namespaces=namespaces)
-        except:
-            self.sel2 = lambda x: []
-        self.sel2_use_logged = False
+    def __init__(self, css, log=None, namespaces=XPNSMAP):
+        self.namespaces = namespaces
+        self.sel = self.build_selector(css, log)
        self.css = css
+        self.used_ci_sel = False
+
+    def build_selector(self, css, log, func=css_to_xpath):
+        try:
+            return etree.XPath(func(css), namespaces=self.namespaces)
+        except:
+            if log is not None:
+                log.exception('Failed to parse CSS selector: %r'%css)
+        return None

    def __call__(self, node, log):
+        if self.sel is None:
+            return []
        try:
-            ans = self.sel1(node)
-        except (AssertionError, ExpressionError, etree.XPathSyntaxError,
-                    NameError, # thrown on OS X instead of SelectorSyntaxError
-                    SelectorSyntaxError):
+            ans = self.sel(node)
+        except:
+            log.exception(u'Failed to run CSS selector: %s'%self.css)
            return []

        if not ans:
-            try:
-                ans = self.sel2(node)
-            except:
-                return []
-            else:
-                if ans and not self.sel2_use_logged:
-                    self.sel2_use_logged = True
-                    log.warn('Interpreting class and tag selectors case'
-                        ' insensitively in the CSS selector: %s'%self.css)
+            # Try a case insensitive version
+            if not hasattr(self, 'ci_sel'):
+                self.ci_sel = self.build_selector(self.css, log, ci_css_to_xpath)
+                if self.ci_sel is not None:
+                    try:
+                        ans = self.ci_sel(node)
+                    except:
+                        log.exception(u'Failed to run case-insensitive CSS selector: %s'%self.css)
+                        return []
+                    if ans:
+                        if not self.used_ci_sel:
+                            log.warn('Interpreting class and id values '
+                                'case-insensitively in selector: %s'%self.css)
+                        self.used_ci_sel = True
        return ans

-
-    def __repr__(self):
-        return '<%s %s for %r>' % (
-            self.__class__.__name__,
-            hex(abs(id(self)))[2:],
-            self.css)
-
 _selector_cache = {}

 MIN_SPACE_RE = re.compile(r' *([>~+]) *')

-def get_css_selector(raw_selector):
+def get_css_selector(raw_selector, log):
    css = MIN_SPACE_RE.sub(r'\1', raw_selector)
-    if isinstance(css, unicode):
-        # Workaround for bug in lxml on windows/OS X that causes a massive
-        # memory leak with non ASCII selectors
-        css = css.encode('ascii', 'ignore').decode('ascii')
    ans = _selector_cache.get(css, None)
    if ans is None:
-        ans = CSSSelector(css)
+        ans = CSSSelector(css, log)
        _selector_cache[css] = ans
    return ans

@ -272,7 +289,7 @@ class Stylizer(object):
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), '')
-            selector = get_css_selector(text)
+            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -13,7 +13,7 @@ import os, math, functools, collections, re, copy

 from lxml.etree import XPath as _XPath
 from lxml import etree
-from lxml.cssselect import CSSSelector
+from cssselect import HTMLTranslator

 from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
        urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize)
@ -73,6 +73,7 @@ class Split(object):

    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
+            css_to_xpath = HTMLTranslator().css_to_xpath
            self.page_break_selectors = set([])
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
@ -83,7 +84,7 @@ class Split(object):
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
-                        self.page_break_selectors.add((CSSSelector(rule.selectorText),
+                        self.page_break_selectors.add((XPath(css_to_xpath(rule.selectorText)),
                            True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
@ -91,7 +92,7 @@ class Split(object):
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
-                        self.page_break_selectors.add((CSSSelector(rule.selectorText),
+                        self.page_break_selectors.add((XPath(css_to_xpath(rule.selectorText)),
                            False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
--- a/src/calibre/ebooks/readability/htmls.py
+++ b/src/calibre/ebooks/readability/htmls.py
@ -64,8 +64,12 @@ def shorten_title(doc):
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
-        for e in doc.cssselect(item):
+    from cssselect import HTMLTranslator
+    css_to_xpath = HTMLTranslator().css_to_xpath
+    for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title',
+            '.title', '.head', '.heading', '.contentheading',
+            '.small_header_red'):
+        for e in doc.xpath(css_to_xpath(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():