Get rid of cssselect from Edit Book

This commit is contained in:
Kovid Goyal 2015-02-22 09:11:03 +05:30
parent 0c4e86dcd1
commit 8f6f60bca2
3 changed files with 21 additions and 143 deletions

View File

@ -6,122 +6,26 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml import etree
from cssutils.css import CSSRule from cssutils.css import CSSRule
from cssselect import HTMLTranslator, parse from css_selectors import parse, SelectorSyntaxError
from cssselect.xpath import XPathExpr, is_safe_name
from cssselect.parser import SelectorSyntaxError
from calibre import force_unicode from calibre import force_unicode
from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XPNSMAP, XHTML_NS from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS
from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from calibre.ebooks.oeb.stylizer import MIN_SPACE_RE, is_non_whitespace, xpath_lower_case, fix_namespace
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style
from css_selectors import Select
class NamespacedTranslator(HTMLTranslator):
def xpath_element(self, selector): def filter_used_rules(rules, log, select):
element = selector.element
if not element:
element = '*'
safe = True
else:
safe = is_safe_name(element)
if safe:
# We use the h: prefix for the XHTML namespace
element = 'h:%s' % element.lower()
xpath = XPathExpr(element=element)
if not safe:
xpath.add_name_test()
return xpath
class CaseInsensitiveAttributesTranslator(NamespacedTranslator):
'Treat class and id CSS selectors case-insensitively'
def xpath_class(self, class_selector):
"""Translate a class selector."""
x = self.xpath(class_selector.selector)
if is_non_whitespace(class_selector.class_name):
x.add_condition(
"%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
% ('@class', xpath_lower_case('@class'), self.xpath_literal(
' '+class_selector.class_name.lower()+' ')))
else:
x.add_condition('0')
return x
def xpath_hash(self, id_selector):
"""Translate an ID selector."""
x = self.xpath(id_selector.selector)
return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
(id_selector.id.lower()))
css_to_xpath = NamespacedTranslator().css_to_xpath
ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
def build_selector(text, case_sensitive=True):
func = css_to_xpath if case_sensitive else ci_css_to_xpath
try:
return etree.XPath(fix_namespace(func(text)), namespaces=XPNSMAP)
except Exception:
return None
PSEUDO_PAT = r':(first-letter|first-line|link|hover|visited|active|focus|before|after)'
def is_rule_used(root, selector, log, pseudo_pat, cache):
selector = pseudo_pat.sub('', selector)
selector = MIN_SPACE_RE.sub(r'\1', selector)
try:
xp = cache[(True, selector)]
except KeyError:
xp = cache[(True, selector)] = build_selector(selector)
try:
if xp(root):
return True
except Exception:
return True
# See if interpreting class and id selectors case-insensitively gives us
# matches. Strictly speaking, class and id selectors should be case
# sensitive for XHTML, but we err on the side of caution and not remove
# them, since case sensitivity depends on whether the html is rendered in
# quirks mode or not.
try:
xp = cache[(False, selector)]
except KeyError:
xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
try:
return bool(xp(root))
except Exception:
return True
def filter_used_rules(root, rules, log, pseudo_pat, cache):
for rule in rules: for rule in rules:
used = False used = False
for selector in rule.selectorList: for selector in rule.selectorList:
text = selector.selectorText if select.has_matches(selector.selectorText):
if is_rule_used(root, text, log, pseudo_pat, cache):
used = True used = True
break break
if not used: if not used:
yield rule yield rule
def process_namespaces(sheet):
# Find the namespace prefix (if any) for the XHTML namespace, so that we
# can preserve it after processing
for prefix in sheet.namespaces:
if sheet.namespaces[prefix] == XHTML_NS:
return prefix
def preserve_htmlns_prefix(sheet, prefix):
if prefix is None:
while 'h' in sheet.namespaces:
del sheet.namespaces['h']
else:
sheet.namespaces[prefix] = XHTML_NS
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None): def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
ans = set() ans = set()
sheet = sheet or sheets[name] sheet = sheet or sheets[name]
@ -155,20 +59,15 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets} import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
if remove_unused_classes: if remove_unused_classes:
class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in sheets.iteritems()} class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in sheets.iteritems()}
sheet_namespace = {}
for sheet in sheets.itervalues():
sheet_namespace[sheet] = process_namespaces(sheet)
sheet.namespaces['h'] = XHTML_NS
style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in sheets.iteritems()} style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in sheets.iteritems()}
num_of_removed_rules = num_of_removed_classes = 0 num_of_removed_rules = num_of_removed_classes = 0
pseudo_pat = re.compile(PSEUDO_PAT, re.I)
cache = {}
for name, mt in container.mime_map.iteritems(): for name, mt in container.mime_map.iteritems():
if mt not in OEB_DOCS: if mt not in OEB_DOCS:
continue continue
root = container.parsed(name) root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True)
used_classes = set() used_classes = set()
for style in root.xpath('//*[local-name()="style"]'): for style in root.xpath('//*[local-name()="style"]'):
if style.get('type', 'text/css') == 'text/css' and style.text: if style.get('type', 'text/css') == 'text/css' and style.text:
@ -177,17 +76,14 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet) imports = get_imported_sheets(name, container, sheets, sheet=sheet)
for imported_sheet in imports: for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(root, style_rules[imported_sheet], container.log, pseudo_pat, cache)) style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
if remove_unused_classes: if remove_unused_classes:
used_classes |= class_map[imported_sheet] used_classes |= class_map[imported_sheet]
ns = process_namespaces(sheet)
sheet.namespaces['h'] = XHTML_NS
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(root, rules, container.log, pseudo_pat, cache)) unused_rules = tuple(filter_used_rules(rules, container.log, select))
if unused_rules: if unused_rules:
num_of_removed_rules += len(unused_rules) num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules] [sheet.cssRules.remove(r) for r in unused_rules]
preserve_htmlns_prefix(sheet, ns)
style.text = force_unicode(sheet.cssText, 'utf-8') style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style) pretty_script_or_style(container, style)
container.dirty(name) container.dirty(name)
@ -196,12 +92,12 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
sname = container.href_to_name(link.get('href'), name) sname = container.href_to_name(link.get('href'), name)
if sname not in sheets: if sname not in sheets:
continue continue
style_rules[sname] = tuple(filter_used_rules(root, style_rules[sname], container.log, pseudo_pat, cache)) style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
if remove_unused_classes: if remove_unused_classes:
used_classes |= class_map[sname] used_classes |= class_map[sname]
for iname in import_map[sname]: for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(root, style_rules[iname], container.log, pseudo_pat, cache)) style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
if remove_unused_classes: if remove_unused_classes:
used_classes |= class_map[iname] used_classes |= class_map[iname]
@ -220,7 +116,6 @@ def remove_unused_css(container, report=None, remove_unused_classes=False):
container.dirty(name) container.dirty(name)
for name, sheet in sheets.iteritems(): for name, sheet in sheets.iteritems():
preserve_htmlns_prefix(sheet, sheet_namespace[sheet])
unused_rules = style_rules[name] unused_rules = style_rules[name]
if unused_rules: if unused_rules:
num_of_removed_rules += len(unused_rules) num_of_removed_rules += len(unused_rules)

View File

@ -6,17 +6,17 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath, os, time, types, re import posixpath, os, time, types
from collections import namedtuple, defaultdict, Counter from collections import namedtuple, defaultdict, Counter
from itertools import chain from itertools import chain
from calibre import prepare_string_for_xml, force_unicode from calibre import prepare_string_for_xml, force_unicode
from calibre.ebooks.oeb.base import XPath, xml2text from calibre.ebooks.oeb.base import XPath, xml2text
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
from calibre.ebooks.oeb.polish.css import build_selector, PSEUDO_PAT, MIN_SPACE_RE
from calibre.ebooks.oeb.polish.spell import get_all_words from calibre.ebooks.oeb.polish.spell import get_all_words
from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr
from calibre.utils.magick.draw import identify from calibre.utils.magick.draw import identify
from css_selectors import Select, SelectorError
File = namedtuple('File', 'name dir basename size category') File = namedtuple('File', 'name dir basename size category')
@ -255,8 +255,6 @@ def css_data(container, book_locale, result_data, *args):
css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1)) css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1))
rule_map = defaultdict(lambda : defaultdict(list)) rule_map = defaultdict(lambda : defaultdict(list))
pseudo_pat = re.compile(PSEUDO_PAT, re.I)
cache = {}
def rules_in_sheet(sheet): def rules_in_sheet(sheet):
for rule in sheet: for rule in sheet:
@ -285,28 +283,12 @@ def css_data(container, book_locale, result_data, *args):
return '<%s %s>' % (tag, attribs) return '<%s %s>' % (tag, attribs)
ans = tt_cache[elem] = '<%s>' % tag ans = tt_cache[elem] = '<%s>' % tag
def matches_for_selector(selector, root, class_map, rule): def matches_for_selector(selector, select, class_map, rule):
selector = pseudo_pat.sub('', selector)
selector = MIN_SPACE_RE.sub(r'\1', selector)
try:
xp = cache[(True, selector)]
except KeyError:
xp = cache[(True, selector)] = build_selector(selector)
try:
matches = xp(root)
except Exception:
return ()
if not matches:
try:
xp = cache[(False, selector)]
except KeyError:
xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
try:
matches = xp(root)
except Exception:
return ()
lsel = selector.lower() lsel = selector.lower()
try:
matches = tuple(select(selector))
except SelectorError:
return ()
for elem in matches: for elem in matches:
for cls in elem.get('class', '').split(): for cls in elem.get('class', '').split():
if '.' + cls.lower() in lsel: if '.' + cls.lower() in lsel:
@ -322,9 +304,10 @@ def css_data(container, book_locale, result_data, *args):
for elem in root.xpath('//*[@class]'): for elem in root.xpath('//*[@class]'):
for cls in elem.get('class', '').split(): for cls in elem.get('class', '').split():
cmap[cls][elem] = [] cmap[cls][elem] = []
select = Select(root, ignore_inappropriate_pseudo_classes=True)
for sheet in chain(sheets_for_html(name, root), inline_sheets): for sheet in chain(sheets_for_html(name, root), inline_sheets):
for rule in rules_in_sheet(sheet): for rule in rules_in_sheet(sheet):
rule_map[rule][name].extend(matches_for_selector(rule.selector, root, cmap, rule)) rule_map[rule][name].extend(matches_for_selector(rule.selector, select, cmap, rule))
for cls, elem_map in cmap.iteritems(): for cls, elem_map in cmap.iteritems():
class_elements = class_map[cls][name] class_elements = class_map[cls][name]
for elem, usage in elem_map.iteritems(): for elem, usage in elem_map.iteritems():

View File

@ -8,7 +8,6 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import json import json
from cssselect import parse
from PyQt5.Qt import ( from PyQt5.Qt import (
QWidget, QTimer, QStackedLayout, QLabel, QScrollArea, QVBoxLayout, QWidget, QTimer, QStackedLayout, QLabel, QScrollArea, QVBoxLayout,
QPainter, Qt, QPalette, QRect, QSize, QSizePolicy, pyqtSignal, QPainter, Qt, QPalette, QRect, QSize, QSizePolicy, pyqtSignal,
@ -18,6 +17,7 @@ from calibre.constants import iswindows
from calibre.gui2.tweak_book import editors, actions, current_container, tprefs from calibre.gui2.tweak_book import editors, actions, current_container, tprefs
from calibre.gui2.tweak_book.editor.themes import get_theme, theme_color from calibre.gui2.tweak_book.editor.themes import get_theme, theme_color
from calibre.gui2.tweak_book.editor.text import default_font_family from calibre.gui2.tweak_book.editor.text import default_font_family
from css_selectors import parse, SelectorError
class Heading(QWidget): # {{{ class Heading(QWidget): # {{{
@ -434,7 +434,7 @@ class LiveCSS(QWidget):
if selector is not None: if selector is not None:
try: try:
specificity = [0] + list(parse(selector)[0].specificity()) specificity = [0] + list(parse(selector)[0].specificity())
except (AttributeError, TypeError): except (AttributeError, TypeError, SelectorError):
specificity = [0, 0, 0, 0] specificity = [0, 0, 0, 0]
else: # style attribute else: # style attribute
specificity = [1, 0, 0, 0] specificity = [1, 0, 0, 0]