Get rid of cssselect from the conversion pipeline

This commit is contained in:
Kovid Goyal 2015-02-22 14:02:42 +05:30
parent 1de3706bff
commit 3226fb0ab4
4 changed files with 41 additions and 137 deletions

View File

@ -299,7 +299,7 @@ class HTMLInput(InputFormatPlugin):
return (None, None) return (None, None)
try: try:
raw = open(link, 'rb').read().decode('utf-8', 'replace') raw = open(link, 'rb').read().decode('utf-8', 'replace')
raw = self.oeb.css_preprocessor(raw, add_namespace=True) raw = self.oeb.css_preprocessor(raw, add_namespace=False)
except: except:
self.log.exception('Failed to read CSS file: %r'%link) self.log.exception('Failed to read CSS file: %r'%link)
return (None, None) return (None, None)

View File

@ -924,13 +924,12 @@ class Manifest(object):
log.raiseExceptions = False log.raiseExceptions = False
self.oeb.log.debug('Parsing', self.href, '...') self.oeb.log.debug('Parsing', self.href, '...')
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = self.oeb.css_preprocessor(data, add_namespace=True) data = self.oeb.css_preprocessor(data, add_namespace=False)
parser = CSSParser(loglevel=logging.WARNING, parser = CSSParser(loglevel=logging.WARNING,
fetcher=self.override_css_fetch or self._fetch_css, fetcher=self.override_css_fetch or self._fetch_css,
log=_css_logger) log=_css_logger)
data = parser.parseString(data, href=self.href, validate=False) data = parser.parseString(data, href=self.href, validate=False)
data = resolveImports(data) data = resolveImports(data)
data.namespaces['h'] = XHTML_NS
for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)): for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
data.cssRules.remove(rule) data.cssRules.remove(rule)
return data return data

View File

@ -15,28 +15,23 @@ from cssutils.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
cssproperties, CSSRule) cssproperties, CSSRule)
from cssutils import (profile as cssprofiles, parseString, parseStyle, log as from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
cssutils_log, CSSParser, profiles, replaceUrls) cssutils_log, CSSParser, profiles, replaceUrls)
from lxml import etree from calibre import force_unicode, as_unicode
from cssselect import HTMLTranslator
from calibre import force_unicode
from calibre.ebooks import unit_convert from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, XPNSMAP, xpath, urlnormalize from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
cssutils_log.setLevel(logging.WARN) cssutils_log.setLevel(logging.WARN)
_html_css_stylesheet = None _html_css_stylesheet = None
css_to_xpath = HTMLTranslator().css_to_xpath
def html_css_stylesheet(): def html_css_stylesheet():
global _html_css_stylesheet global _html_css_stylesheet
if _html_css_stylesheet is None: if _html_css_stylesheet is None:
html_css = open(P('templates/html.css'), 'rb').read() html_css = open(P('templates/html.css'), 'rb').read()
_html_css_stylesheet = parseString(html_css, validate=False) _html_css_stylesheet = parseString(html_css, validate=False)
_html_css_stylesheet.namespaces['h'] = XHTML_NS
return _html_css_stylesheet return _html_css_stylesheet
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
INHERITED = set(['azimuth', 'border-collapse', 'border-spacing', INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
'caption-side', 'color', 'cursor', 'direction', 'elevation', 'caption-side', 'color', 'cursor', 'direction', 'elevation',
@ -53,100 +48,6 @@ INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
'x-large', 'xx-large']) 'x-large', 'xx-large'])
def xpath_lower_case(arg):
'An ASCII lowercase function for XPath'
return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz')")%arg
is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match
class CaseInsensitiveAttributesTranslator(HTMLTranslator):
'Treat class and id CSS selectors case-insensitively'
def xpath_class(self, class_selector):
"""Translate a class selector."""
x = self.xpath(class_selector.selector)
if is_non_whitespace(class_selector.class_name):
x.add_condition(
"%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
% ('@class', xpath_lower_case('@class'), self.xpath_literal(
' '+class_selector.class_name.lower()+' ')))
else:
x.add_condition('0')
return x
def xpath_hash(self, id_selector):
"""Translate an ID selector."""
x = self.xpath(id_selector.selector)
return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
(id_selector.id.lower()))
ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
NULL_NAMESPACE_REGEX = re.compile(ur'''(name\(\) = ['"])h:''')
def fix_namespace(raw):
'''
cssselect uses name() = 'h:p' to select tags for some CSS selectors (e.g.
h|p+h|p).
However, since for us the XHTML namespace is the default namespace (with no
prefix), name() is the same as local-name(). So this is a hack to
workaround the problem.
'''
return NULL_NAMESPACE_REGEX.sub(ur'\1', raw)
class CSSSelector(object):
def __init__(self, css, log=None, namespaces=XPNSMAP):
self.namespaces = namespaces
self.sel = self.build_selector(css, log)
self.css = css
self.used_ci_sel = False
def build_selector(self, css, log, func=css_to_xpath):
try:
return etree.XPath(fix_namespace(func(css)), namespaces=self.namespaces)
except:
if log is not None:
log.exception('Failed to parse CSS selector: %r'%css)
return None
def __call__(self, node, log):
if self.sel is None:
return []
try:
ans = self.sel(node)
except:
log.exception(u'Failed to run CSS selector: %s'%self.css)
return []
if not ans:
# Try a case insensitive version
if not hasattr(self, 'ci_sel'):
self.ci_sel = self.build_selector(self.css, log, ci_css_to_xpath)
if self.ci_sel is not None:
try:
ans = self.ci_sel(node)
except:
log.exception(u'Failed to run case-insensitive CSS selector: %s'%self.css)
return []
if ans:
if not self.used_ci_sel:
log.warn('Interpreting class and id values '
'case-insensitively in selector: %s'%self.css)
self.used_ci_sel = True
return ans
_selector_cache = {}
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
def get_css_selector(raw_selector, log):
css = MIN_SPACE_RE.sub(r'\1', raw_selector)
ans = _selector_cache.get(css, None)
if ans is None:
ans = CSSSelector(css, log)
_selector_cache[css] = ans
return ans
class Stylizer(object): class Stylizer(object):
STYLESHEETS = WeakKeyDictionary() STYLESHEETS = WeakKeyDictionary()
@ -195,13 +96,12 @@ class Stylizer(object):
if t: if t:
text += u'\n\n' + force_unicode(t, u'utf-8') text += u'\n\n' + force_unicode(t, u'utf-8')
if text: if text:
text = oeb.css_preprocessor(text, add_namespace=True) text = oeb.css_preprocessor(text)
# We handle @import rules separately # We handle @import rules separately
parser.setFetcher(lambda x: ('utf-8', b'')) parser.setFetcher(lambda x: ('utf-8', b''))
stylesheet = parser.parseString(text, href=cssname, stylesheet = parser.parseString(text, href=cssname,
validate=False) validate=False)
parser.setFetcher(self._fetch_css_file) parser.setFetcher(self._fetch_css_file)
stylesheet.namespaces['h'] = XHTML_NS
for rule in stylesheet.cssRules: for rule in stylesheet.cssRules:
if rule.type == rule.IMPORT_RULE: if rule.type == rule.IMPORT_RULE:
ihref = item.abshref(rule.href) ihref = item.abshref(rule.href)
@ -244,10 +144,9 @@ class Stylizer(object):
for w, x in csses.items(): for w, x in csses.items():
if x: if x:
try: try:
text = XHTML_CSS_NAMESPACE + x text = x
stylesheet = parser.parseString(text, href=cssname, stylesheet = parser.parseString(text, href=cssname,
validate=False) validate=False)
stylesheet.namespaces['h'] = XHTML_NS
stylesheets.append(stylesheet) stylesheets.append(stylesheet)
except: except:
self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.exception('Failed to parse %s, ignoring.'%w)
@ -275,13 +174,17 @@ class Stylizer(object):
rules.sort() rules.sort()
self.rules = rules self.rules = rules
self._styles = {} self._styles = {}
pseudo_pat = re.compile(ur':{1,2}(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I) pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
select = Select(tree, ignore_inappropriate_pseudo_classes=True)
for _, _, cssdict, text, _ in rules: for _, _, cssdict, text, _ in rules:
fl = pseudo_pat.search(text) fl = pseudo_pat.search(text)
if fl is not None: try:
text = text.replace(fl.group(), '') matches = select(text)
selector = get_css_selector(text, self.oeb.log) except SelectorError as err:
matches = selector(tree, self.logger) self.log.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
continue
if fl is not None: if fl is not None:
fl = fl.group(1) fl = fl.group(1)
if fl == 'first-letter' and getattr(self.oeb, if fl == 'first-letter' and getattr(self.oeb,
@ -486,9 +389,7 @@ class Style(object):
result = None result = None
if name in self._style: if name in self._style:
result = self._style[name] result = self._style[name]
if (result == 'inherit' if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())):
or (result is None and name in INHERITED
and self._has_parent())):
stylizer = self._stylizer stylizer = self._stylizer
result = stylizer.style(self._element.getparent())._get(name) result = stylizer.style(self._element.getparent())._get(name)
if result is None: if result is None:

View File

@ -14,12 +14,13 @@ from collections import OrderedDict
from lxml.etree import XPath as _XPath from lxml.etree import XPath as _XPath
from lxml import etree from lxml import etree
from cssselect import HTMLTranslator
from calibre import as_unicode
from calibre.ebooks.epub import rules from calibre.ebooks.epub import rules
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize) urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
from calibre.ebooks.oeb.polish.split import do_split from calibre.ebooks.oeb.polish.split import do_split
from css_selectors import Select, SelectorError
XPath = functools.partial(_XPath, namespaces=NAMESPACES) XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -75,9 +76,7 @@ class Split(object):
def find_page_breaks(self, item): def find_page_breaks(self, item):
if self.page_break_selectors is None: if self.page_break_selectors is None:
from calibre.ebooks.oeb.stylizer import fix_namespace self.page_break_selectors = set()
css_to_xpath = HTMLTranslator().css_to_xpath
self.page_break_selectors = set([])
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES] OEB_STYLES]
for rule in rules(stylesheets): for rule in rules(stylesheets):
@ -87,31 +86,37 @@ class Split(object):
'page-break-after'), 'cssText', '').strip().lower() 'page-break-after'), 'cssText', '').strip().lower()
try: try:
if before and before not in {'avoid', 'auto', 'inherit'}: if before and before not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))), self.page_break_selectors.add((rule.selectorText, True))
True))
if self.remove_css_pagebreaks: if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-before') rule.style.removeProperty('page-break-before')
except: except:
pass pass
try: try:
if after and after not in {'avoid', 'auto', 'inherit'}: if after and after not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))), self.page_break_selectors.add((rule.selectorText, False))
False))
if self.remove_css_pagebreaks: if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-after') rule.style.removeProperty('page-break-after')
except: except:
pass pass
page_breaks = set([]) page_breaks = set()
for selector, before in self.page_break_selectors: select = Select(item.data)
body = item.data.xpath('//h:body', namespaces=NAMESPACES) if not self.page_break_selectors:
if not body: return [], []
continue body = item.data.xpath('//h:body', namespaces=NAMESPACES)
for elem in selector(body[0]): if not body:
if elem not in body: return [], []
elem.set('pb_before', '1' if before else '0') descendants = frozenset(body[0].iterdescendants('*'))
page_breaks.add(elem)
for i, elem in enumerate(item.data.iter()): for selector, before in self.page_break_selectors:
try:
for elem in select(selector):
if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
elem.set('pb_before', '1' if before else '0')
page_breaks.add(elem)
except SelectorError as err:
self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
for i, elem in enumerate(item.data.iter('*')):
try: try:
elem.set('pb_order', str(i)) elem.set('pb_order', str(i))
except TypeError: # Cant set attributes on comment nodes etc. except TypeError: # Cant set attributes on comment nodes etc.
@ -358,8 +363,7 @@ class FlowSplitter(object):
len(self.split_trees), size/1024.)) len(self.split_trees), size/1024.))
else: else:
self.log.debug( self.log.debug(
'\t\t\tSplit tree still too large: %d KB' % '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
(size/1024.))
self.split_to_size(t) self.split_to_size(t)
def find_split_point(self, root): def find_split_point(self, root):