Get rid of cssselect from Edit Book

This commit is contained in:
Kovid Goyal 2015-02-22 09:11:03 +05:30
parent 0c4e86dcd1
commit 8f6f60bca2
3 changed files with 21 additions and 143 deletions

View File

@ -6,122 +6,26 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml import etree
from cssutils.css import CSSRule
from cssselect import HTMLTranslator, parse
from cssselect.xpath import XPathExpr, is_safe_name
from cssselect.parser import SelectorSyntaxError
from css_selectors import parse, SelectorSyntaxError
from calibre import force_unicode
from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XPNSMAP, XHTML_NS
from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS
from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from calibre.ebooks.oeb.stylizer import MIN_SPACE_RE, is_non_whitespace, xpath_lower_case, fix_namespace
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style
from css_selectors import Select
class NamespacedTranslator(HTMLTranslator):
    '''HTMLTranslator variant whose element tests use the ``h:`` prefix.'''

    def xpath_element(self, selector):
        '''Build the XPath expression for an element (type) selector.

        A missing element name becomes the wildcard ``*``. A name accepted by
        is_safe_name() is lower-cased and given the ``h:`` prefix. Any other
        name is passed through unchanged and a name test is added to the
        resulting expression.
        '''
        name = selector.element
        if not name:
            return XPathExpr(element='*')
        if is_safe_name(name):
            # We use the h: prefix for the XHTML namespace
            return XPathExpr(element='h:%s' % name.lower())
        expr = XPathExpr(element=name)
        expr.add_name_test()
        return expr
class CaseInsensitiveAttributesTranslator(NamespacedTranslator):
    'Treat class and id CSS selectors case-insensitively'

    def xpath_class(self, class_selector):
        '''Translate a class selector, comparing against the lower-cased
        value of the class attribute.

        When the class name fails the is_non_whitespace() check, the
        condition '0' is added instead.
        '''
        expr = self.xpath(class_selector.selector)
        cls_name = class_selector.class_name
        if not is_non_whitespace(cls_name):
            expr.add_condition('0')
            return expr
        lowered_attr = xpath_lower_case('@class')
        needle = self.xpath_literal(' ' + cls_name.lower() + ' ')
        template = "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
        expr.add_condition(template % ('@class', lowered_attr, needle))
        return expr

    def xpath_hash(self, id_selector):
        '''Translate an ID selector, comparing the lower-cased id attribute
        with the lower-cased id from the selector.'''
        expr = self.xpath(id_selector.selector)
        lowered_attr = xpath_lower_case('@id')
        wanted = id_selector.id.lower()
        return self.xpath_attrib_equals(expr, lowered_attr, wanted)
# Module-level translators, created once and reused for every selector
css_to_xpath = NamespacedTranslator().css_to_xpath
ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath


def build_selector(text, case_sensitive=True):
    '''Compile the CSS selector ``text`` into an etree.XPath object.

    The selector is translated with the case-sensitive translator unless
    ``case_sensitive`` is False, in which case the translator that lower-cases
    class and id comparisons is used. Returns None if translation or
    compilation raises any exception.
    '''
    if case_sensitive:
        translate = css_to_xpath
    else:
        translate = ci_css_to_xpath
    try:
        expression = fix_namespace(translate(text))
        compiled = etree.XPath(expression, namespaces=XPNSMAP)
    except Exception:
        compiled = None
    return compiled
# Pseudo-classes/elements that are stripped from a selector before matching
PSEUDO_PAT = r':(first-letter|first-line|link|hover|visited|active|focus|before|after)'


def is_rule_used(root, selector, log, pseudo_pat, cache):
    '''Return True if ``selector`` matches something in ``root``.

    The text matched by ``pseudo_pat`` is removed from the selector, which is
    then passed through MIN_SPACE_RE before being compiled. Compiled selectors
    are stored in ``cache`` under the key ``(case_sensitive, selector)``.
    Any exception raised while evaluating a compiled selector (including the
    case where compilation failed and the cached value is None) is treated
    as "used", so the rule is kept.
    '''
    selector = MIN_SPACE_RE.sub(r'\1', pseudo_pat.sub('', selector))

    # The second pass checks whether interpreting class and id selectors
    # case-insensitively gives us matches. Strictly speaking, class and id
    # selectors should be case sensitive for XHTML, but we err on the side of
    # caution and do not remove them, since case sensitivity depends on
    # whether the html is rendered in quirks mode or not.
    for case_sensitive in (True, False):
        key = (case_sensitive, selector)
        try:
            matcher = cache[key]
        except KeyError:
            matcher = build_selector(selector, case_sensitive=case_sensitive)
            cache[key] = matcher
        try:
            matched = bool(matcher(root))
        except Exception:
            return True
        if matched:
            return True
    return False
# NOTE(review): this span is diff output whose +/- markers and indentation
# were lost. The two consecutive `def` lines, and the two consecutive `if`
# tests further down, appear to be the removed and the added version of the
# same statement -- confirm against the real file before relying on this.
def filter_used_rules(root, rules, log, pseudo_pat, cache):
def filter_used_rules(rules, log, select):
# Generator: yields each rule for which none of the selectors in its
# selectorList matched, i.e. the rules that were not found to be used.
for rule in rules:
used = False
for selector in rule.selectorList:
text = selector.selectorText
if is_rule_used(root, text, log, pseudo_pat, cache):
if select.has_matches(selector.selectorText):
used = True
# One matching selector is enough to keep the rule
break
if not used:
yield rule
def process_namespaces(sheet):
    '''Return the first prefix in ``sheet.namespaces`` that is mapped to
    XHTML_NS, or None when no prefix maps to it.

    The prefix is looked up so that it can be preserved after processing.
    '''
    return next(
        (prefix for prefix in sheet.namespaces
         if sheet.namespaces[prefix] == XHTML_NS),
        None)
def preserve_htmlns_prefix(sheet, prefix):
    '''Put the XHTML namespace mapping of ``sheet`` back under ``prefix``.

    With a non-None ``prefix`` that prefix is mapped to XHTML_NS. With
    ``prefix`` None, the 'h' entry is deleted from ``sheet.namespaces``
    repeatedly until 'h' is no longer present.
    '''
    if prefix is not None:
        sheet.namespaces[prefix] = XHTML_NS
        return
    # Delete in a loop: keep going until the membership test reports that
    # 'h' is gone
    while 'h' in sheet.namespaces:
        del sheet.namespaces['h']
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
ans = set()
sheet = sheet or sheets[name]
@ -155,20 +59,15 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
if remove_unused_classes:
class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in sheets.iteritems()}
sheet_namespace = {}
for sheet in sheets.itervalues():
sheet_namespace[sheet] = process_namespaces(sheet)
sheet.namespaces['h'] = XHTML_NS
style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in sheets.iteritems()}
num_of_removed_rules = num_of_removed_classes = 0
pseudo_pat = re.compile(PSEUDO_PAT, re.I)
cache = {}
for name, mt in container.mime_map.iteritems():
if mt not in OEB_DOCS:
continue
root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True)
used_classes = set()
for style in root.xpath('//*[local-name()="style"]'):
if style.get('type', 'text/css') == 'text/css' and style.text:
@ -177,17 +76,14 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(root, style_rules[imported_sheet], container.log, pseudo_pat, cache))
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
if remove_unused_classes:
used_classes |= class_map[imported_sheet]
ns = process_namespaces(sheet)
sheet.namespaces['h'] = XHTML_NS
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(root, rules, container.log, pseudo_pat, cache))
unused_rules = tuple(filter_used_rules(rules, container.log, select))
if unused_rules:
num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules]
preserve_htmlns_prefix(sheet, ns)
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
container.dirty(name)
@ -196,12 +92,12 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
sname = container.href_to_name(link.get('href'), name)
if sname not in sheets:
continue
style_rules[sname] = tuple(filter_used_rules(root, style_rules[sname], container.log, pseudo_pat, cache))
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
if remove_unused_classes:
used_classes |= class_map[sname]
for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(root, style_rules[iname], container.log, pseudo_pat, cache))
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
if remove_unused_classes:
used_classes |= class_map[iname]
@ -220,7 +116,6 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
container.dirty(name)
for name, sheet in sheets.iteritems():
preserve_htmlns_prefix(sheet, sheet_namespace[sheet])
unused_rules = style_rules[name]
if unused_rules:
num_of_removed_rules += len(unused_rules)

View File

@ -6,17 +6,17 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath, os, time, types, re
import posixpath, os, time, types
from collections import namedtuple, defaultdict, Counter
from itertools import chain
from calibre import prepare_string_for_xml, force_unicode
from calibre.ebooks.oeb.base import XPath, xml2text
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
from calibre.ebooks.oeb.polish.css import build_selector, PSEUDO_PAT, MIN_SPACE_RE
from calibre.ebooks.oeb.polish.spell import get_all_words
from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr
from calibre.utils.magick.draw import identify
from css_selectors import Select, SelectorError
File = namedtuple('File', 'name dir basename size category')
@ -255,8 +255,6 @@ def css_data(container, book_locale, result_data, *args):
css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1))
rule_map = defaultdict(lambda : defaultdict(list))
pseudo_pat = re.compile(PSEUDO_PAT, re.I)
cache = {}
def rules_in_sheet(sheet):
for rule in sheet:
@ -285,28 +283,12 @@ def css_data(container, book_locale, result_data, *args):
return '<%s %s>' % (tag, attribs)
ans = tt_cache[elem] = '<%s>' % tag
def matches_for_selector(selector, root, class_map, rule):
selector = pseudo_pat.sub('', selector)
selector = MIN_SPACE_RE.sub(r'\1', selector)
try:
xp = cache[(True, selector)]
except KeyError:
xp = cache[(True, selector)] = build_selector(selector)
try:
matches = xp(root)
except Exception:
return ()
if not matches:
try:
xp = cache[(False, selector)]
except KeyError:
xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
try:
matches = xp(root)
except Exception:
return ()
def matches_for_selector(selector, select, class_map, rule):
lsel = selector.lower()
try:
matches = tuple(select(selector))
except SelectorError:
return ()
for elem in matches:
for cls in elem.get('class', '').split():
if '.' + cls.lower() in lsel:
@ -322,9 +304,10 @@ def css_data(container, book_locale, result_data, *args):
for elem in root.xpath('//*[@class]'):
for cls in elem.get('class', '').split():
cmap[cls][elem] = []
select = Select(root, ignore_inappropriate_pseudo_classes=True)
for sheet in chain(sheets_for_html(name, root), inline_sheets):
for rule in rules_in_sheet(sheet):
rule_map[rule][name].extend(matches_for_selector(rule.selector, root, cmap, rule))
rule_map[rule][name].extend(matches_for_selector(rule.selector, select, cmap, rule))
for cls, elem_map in cmap.iteritems():
class_elements = class_map[cls][name]
for elem, usage in elem_map.iteritems():

View File

@ -8,7 +8,6 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import json
from cssselect import parse
from PyQt5.Qt import (
QWidget, QTimer, QStackedLayout, QLabel, QScrollArea, QVBoxLayout,
QPainter, Qt, QPalette, QRect, QSize, QSizePolicy, pyqtSignal,
@ -18,6 +17,7 @@ from calibre.constants import iswindows
from calibre.gui2.tweak_book import editors, actions, current_container, tprefs
from calibre.gui2.tweak_book.editor.themes import get_theme, theme_color
from calibre.gui2.tweak_book.editor.text import default_font_family
from css_selectors import parse, SelectorError
class Heading(QWidget): # {{{
@ -434,7 +434,7 @@ class LiveCSS(QWidget):
if selector is not None:
try:
specificity = [0] + list(parse(selector)[0].specificity())
except (AttributeError, TypeError):
except (AttributeError, TypeError, SelectorError):
specificity = [0, 0, 0, 0]
else: # style attribute
specificity = [1, 0, 0, 0]