diff --git a/src/calibre/ebooks/oeb/polish/css.py b/src/calibre/ebooks/oeb/polish/css.py index f8a7385f2f..b6e4ff0acb 100644 --- a/src/calibre/ebooks/oeb/polish/css.py +++ b/src/calibre/ebooks/oeb/polish/css.py @@ -6,122 +6,26 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2014, Kovid Goyal ' -import re - -from lxml import etree from cssutils.css import CSSRule -from cssselect import HTMLTranslator, parse -from cssselect.xpath import XPathExpr, is_safe_name -from cssselect.parser import SelectorSyntaxError +from css_selectors import parse, SelectorSyntaxError from calibre import force_unicode -from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XPNSMAP, XHTML_NS +from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers -from calibre.ebooks.oeb.stylizer import MIN_SPACE_RE, is_non_whitespace, xpath_lower_case, fix_namespace from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style +from css_selectors import Select -class NamespacedTranslator(HTMLTranslator): - def xpath_element(self, selector): - element = selector.element - if not element: - element = '*' - safe = True - else: - safe = is_safe_name(element) - if safe: - # We use the h: prefix for the XHTML namespace - element = 'h:%s' % element.lower() - xpath = XPathExpr(element=element) - if not safe: - xpath.add_name_test() - return xpath - -class CaseInsensitiveAttributesTranslator(NamespacedTranslator): - 'Treat class and id CSS selectors case-insensitively' - - def xpath_class(self, class_selector): - """Translate a class selector.""" - x = self.xpath(class_selector.selector) - if is_non_whitespace(class_selector.class_name): - x.add_condition( - "%s and contains(concat(' ', normalize-space(%s), ' '), %s)" - % ('@class', xpath_lower_case('@class'), self.xpath_literal( - ' '+class_selector.class_name.lower()+' '))) - else: - x.add_condition('0') - return x - - def xpath_hash(self, id_selector): - """Translate an ID selector.""" - x = self.xpath(id_selector.selector) - return self.xpath_attrib_equals(x, xpath_lower_case('@id'), - (id_selector.id.lower())) - -css_to_xpath = NamespacedTranslator().css_to_xpath -ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath - -def build_selector(text, case_sensitive=True): - func = css_to_xpath if case_sensitive else ci_css_to_xpath - try: - return etree.XPath(fix_namespace(func(text)), namespaces=XPNSMAP) - except Exception: - return None - -PSEUDO_PAT = r':(first-letter|first-line|link|hover|visited|active|focus|before|after)' - -def is_rule_used(root, selector, log, pseudo_pat, cache): - selector = pseudo_pat.sub('', selector) - selector = MIN_SPACE_RE.sub(r'\1', selector) - try: - xp = cache[(True, selector)] - except KeyError: - xp = cache[(True, selector)] = build_selector(selector) - try: - if xp(root): - return True - except Exception: - return True - - # See if interpreting class and id selectors case-insensitively gives us - # matches. Strictly speaking, class and id selectors should be case - # sensitive for XHTML, but we err on the side of caution and not remove - # them, since case sensitivity depends on whether the html is rendered in - # quirks mode or not. - try: - xp = cache[(False, selector)] - except KeyError: - xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False) - try: - return bool(xp(root)) - except Exception: - return True - -def filter_used_rules(root, rules, log, pseudo_pat, cache): +def filter_used_rules(rules, log, select): for rule in rules: used = False for selector in rule.selectorList: - text = selector.selectorText - if is_rule_used(root, text, log, pseudo_pat, cache): + if select.has_matches(selector.selectorText): used = True break if not used: yield rule -def process_namespaces(sheet): - # Find the namespace prefix (if any) for the XHTML namespace, so that we - # can preserve it after processing - for prefix in sheet.namespaces: - if sheet.namespaces[prefix] == XHTML_NS: - return prefix - -def preserve_htmlns_prefix(sheet, prefix): - if prefix is None: - while 'h' in sheet.namespaces: - del sheet.namespaces['h'] - else: - sheet.namespaces[prefix] = XHTML_NS - def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None): ans = set() sheet = sheet or sheets[name] @@ -155,20 +59,15 @@ def remove_unused_css(container, report=None, remove_unused_classes=False): import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets} if remove_unused_classes: class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in sheets.iteritems()} - sheet_namespace = {} - for sheet in sheets.itervalues(): - sheet_namespace[sheet] = process_namespaces(sheet) - sheet.namespaces['h'] = XHTML_NS style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in sheets.iteritems()} num_of_removed_rules = num_of_removed_classes = 0 - pseudo_pat = re.compile(PSEUDO_PAT, re.I) - cache = {} for name, mt in container.mime_map.iteritems(): if mt not in OEB_DOCS: continue root = container.parsed(name) + select = Select(root, ignore_inappropriate_pseudo_classes=True) used_classes = set() for style in root.xpath('//*[local-name()="style"]'): if style.get('type', 'text/css') == 'text/css' and style.text: @@ -177,17 +76,14 @@ def remove_unused_css(container, report=None, remove_unused_classes=False): used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} imports = get_imported_sheets(name, container, sheets, sheet=sheet) for imported_sheet in imports: - style_rules[imported_sheet] = tuple(filter_used_rules(root, style_rules[imported_sheet], container.log, pseudo_pat, cache)) + style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select)) if remove_unused_classes: used_classes |= class_map[imported_sheet] - ns = process_namespaces(sheet) - sheet.namespaces['h'] = XHTML_NS rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) - unused_rules = tuple(filter_used_rules(root, rules, container.log, pseudo_pat, cache)) + unused_rules = tuple(filter_used_rules(rules, container.log, select)) if unused_rules: num_of_removed_rules += len(unused_rules) [sheet.cssRules.remove(r) for r in unused_rules] - preserve_htmlns_prefix(sheet, ns) style.text = force_unicode(sheet.cssText, 'utf-8') pretty_script_or_style(container, style) container.dirty(name) @@ -196,12 +92,12 @@ def remove_unused_css(container, report=None, remove_unused_classes=False): sname = container.href_to_name(link.get('href'), name) if sname not in sheets: continue - style_rules[sname] = tuple(filter_used_rules(root, style_rules[sname], container.log, pseudo_pat, cache)) + style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select)) if remove_unused_classes: used_classes |= class_map[sname] for iname in import_map[sname]: - style_rules[iname] = tuple(filter_used_rules(root, style_rules[iname], container.log, pseudo_pat, cache)) + style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select)) if remove_unused_classes: used_classes |= class_map[iname] @@ -220,7 +116,6 @@ def remove_unused_css(container, report=None, remove_unused_classes=False): container.dirty(name) for name, sheet in sheets.iteritems(): - preserve_htmlns_prefix(sheet, sheet_namespace[sheet]) unused_rules = style_rules[name] if unused_rules: num_of_removed_rules += len(unused_rules) diff --git a/src/calibre/ebooks/oeb/polish/report.py b/src/calibre/ebooks/oeb/polish/report.py index f2b8a0f2fe..db39292ff1 100644 --- a/src/calibre/ebooks/oeb/polish/report.py +++ b/src/calibre/ebooks/oeb/polish/report.py @@ -6,17 +6,17 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2015, Kovid Goyal ' -import posixpath, os, time, types, re +import posixpath, os, time, types from collections import namedtuple, defaultdict, Counter from itertools import chain from calibre import prepare_string_for_xml, force_unicode from calibre.ebooks.oeb.base import XPath, xml2text from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS -from calibre.ebooks.oeb.polish.css import build_selector, PSEUDO_PAT, MIN_SPACE_RE from calibre.ebooks.oeb.polish.spell import get_all_words from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr from calibre.utils.magick.draw import identify +from css_selectors import Select, SelectorError File = namedtuple('File', 'name dir basename size category') @@ -255,8 +255,6 @@ def css_data(container, book_locale, result_data, *args): css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1)) rule_map = defaultdict(lambda : defaultdict(list)) - pseudo_pat = re.compile(PSEUDO_PAT, re.I) - cache = {} def rules_in_sheet(sheet): for rule in sheet: @@ -285,28 +283,12 @@ def css_data(container, book_locale, result_data, *args): return '<%s %s>' % (tag, attribs) ans = tt_cache[elem] = '<%s>' % tag - def matches_for_selector(selector, root, class_map, rule): - selector = pseudo_pat.sub('', selector) - selector = MIN_SPACE_RE.sub(r'\1', selector) - try: - xp = cache[(True, selector)] - except KeyError: - xp = cache[(True, selector)] = build_selector(selector) - - try: - matches = xp(root) - except Exception: - return () - if not matches: - try: - xp = cache[(False, selector)] - except KeyError: - xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False) - try: - matches = xp(root) - except Exception: - return () + def matches_for_selector(selector, select, class_map, rule): lsel = selector.lower() + try: + matches = tuple(select(selector)) + except SelectorError: + return () for elem in matches: for cls in elem.get('class', '').split(): if '.' + cls.lower() in lsel: @@ -322,9 +304,10 @@ def css_data(container, book_locale, result_data, *args): for elem in root.xpath('//*[@class]'): for cls in elem.get('class', '').split(): cmap[cls][elem] = [] + select = Select(root, ignore_inappropriate_pseudo_classes=True) for sheet in chain(sheets_for_html(name, root), inline_sheets): for rule in rules_in_sheet(sheet): - rule_map[rule][name].extend(matches_for_selector(rule.selector, root, cmap, rule)) + rule_map[rule][name].extend(matches_for_selector(rule.selector, select, cmap, rule)) for cls, elem_map in cmap.iteritems(): class_elements = class_map[cls][name] for elem, usage in elem_map.iteritems(): diff --git a/src/calibre/gui2/tweak_book/live_css.py b/src/calibre/gui2/tweak_book/live_css.py index d6d14fa2fc..0446e3c05d 100644 --- a/src/calibre/gui2/tweak_book/live_css.py +++ b/src/calibre/gui2/tweak_book/live_css.py @@ -8,7 +8,6 @@ __copyright__ = '2014, Kovid Goyal ' import json -from cssselect import parse from PyQt5.Qt import ( QWidget, QTimer, QStackedLayout, QLabel, QScrollArea, QVBoxLayout, QPainter, Qt, QPalette, QRect, QSize, QSizePolicy, pyqtSignal, @@ -18,6 +17,7 @@ from calibre.constants import iswindows from calibre.gui2.tweak_book import editors, actions, current_container, tprefs from calibre.gui2.tweak_book.editor.themes import get_theme, theme_color from calibre.gui2.tweak_book.editor.text import default_font_family +from css_selectors import parse, SelectorError class Heading(QWidget): # {{{ @@ -434,7 +434,7 @@ class LiveCSS(QWidget): if selector is not None: try: specificity = [0] + list(parse(selector)[0].specificity()) - except (AttributeError, TypeError): + except (AttributeError, TypeError, SelectorError): specificity = [0, 0, 0, 0] else: # style attribute specificity = [1, 0, 0, 0]