mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Get rid of cssselect from the conversion pipeline
This commit is contained in:
parent
1de3706bff
commit
3226fb0ab4
@ -299,7 +299,7 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
return (None, None)
|
return (None, None)
|
||||||
try:
|
try:
|
||||||
raw = open(link, 'rb').read().decode('utf-8', 'replace')
|
raw = open(link, 'rb').read().decode('utf-8', 'replace')
|
||||||
raw = self.oeb.css_preprocessor(raw, add_namespace=True)
|
raw = self.oeb.css_preprocessor(raw, add_namespace=False)
|
||||||
except:
|
except:
|
||||||
self.log.exception('Failed to read CSS file: %r'%link)
|
self.log.exception('Failed to read CSS file: %r'%link)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
|
@ -924,13 +924,12 @@ class Manifest(object):
|
|||||||
log.raiseExceptions = False
|
log.raiseExceptions = False
|
||||||
self.oeb.log.debug('Parsing', self.href, '...')
|
self.oeb.log.debug('Parsing', self.href, '...')
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
data = self.oeb.css_preprocessor(data, add_namespace=True)
|
data = self.oeb.css_preprocessor(data, add_namespace=False)
|
||||||
parser = CSSParser(loglevel=logging.WARNING,
|
parser = CSSParser(loglevel=logging.WARNING,
|
||||||
fetcher=self.override_css_fetch or self._fetch_css,
|
fetcher=self.override_css_fetch or self._fetch_css,
|
||||||
log=_css_logger)
|
log=_css_logger)
|
||||||
data = parser.parseString(data, href=self.href, validate=False)
|
data = parser.parseString(data, href=self.href, validate=False)
|
||||||
data = resolveImports(data)
|
data = resolveImports(data)
|
||||||
data.namespaces['h'] = XHTML_NS
|
|
||||||
for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
|
for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
|
||||||
data.cssRules.remove(rule)
|
data.cssRules.remove(rule)
|
||||||
return data
|
return data
|
||||||
|
@ -15,28 +15,23 @@ from cssutils.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
|
|||||||
cssproperties, CSSRule)
|
cssproperties, CSSRule)
|
||||||
from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
|
from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
|
||||||
cssutils_log, CSSParser, profiles, replaceUrls)
|
cssutils_log, CSSParser, profiles, replaceUrls)
|
||||||
from lxml import etree
|
from calibre import force_unicode, as_unicode
|
||||||
from cssselect import HTMLTranslator
|
|
||||||
|
|
||||||
from calibre import force_unicode
|
|
||||||
from calibre.ebooks import unit_convert
|
from calibre.ebooks import unit_convert
|
||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, XPNSMAP, xpath, urlnormalize
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
|
||||||
from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
|
from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
|
||||||
|
from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
|
||||||
|
|
||||||
cssutils_log.setLevel(logging.WARN)
|
cssutils_log.setLevel(logging.WARN)
|
||||||
|
|
||||||
_html_css_stylesheet = None
|
_html_css_stylesheet = None
|
||||||
css_to_xpath = HTMLTranslator().css_to_xpath
|
|
||||||
|
|
||||||
def html_css_stylesheet():
|
def html_css_stylesheet():
|
||||||
global _html_css_stylesheet
|
global _html_css_stylesheet
|
||||||
if _html_css_stylesheet is None:
|
if _html_css_stylesheet is None:
|
||||||
html_css = open(P('templates/html.css'), 'rb').read()
|
html_css = open(P('templates/html.css'), 'rb').read()
|
||||||
_html_css_stylesheet = parseString(html_css, validate=False)
|
_html_css_stylesheet = parseString(html_css, validate=False)
|
||||||
_html_css_stylesheet.namespaces['h'] = XHTML_NS
|
|
||||||
return _html_css_stylesheet
|
return _html_css_stylesheet
|
||||||
|
|
||||||
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
|
|
||||||
|
|
||||||
INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
|
INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
|
||||||
'caption-side', 'color', 'cursor', 'direction', 'elevation',
|
'caption-side', 'color', 'cursor', 'direction', 'elevation',
|
||||||
@ -53,100 +48,6 @@ INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
|
|||||||
FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
|
FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
|
||||||
'x-large', 'xx-large'])
|
'x-large', 'xx-large'])
|
||||||
|
|
||||||
def xpath_lower_case(arg):
|
|
||||||
'An ASCII lowercase function for XPath'
|
|
||||||
return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
|
|
||||||
"'abcdefghijklmnopqrstuvwxyz')")%arg
|
|
||||||
is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match
|
|
||||||
|
|
||||||
class CaseInsensitiveAttributesTranslator(HTMLTranslator):
|
|
||||||
'Treat class and id CSS selectors case-insensitively'
|
|
||||||
|
|
||||||
def xpath_class(self, class_selector):
|
|
||||||
"""Translate a class selector."""
|
|
||||||
x = self.xpath(class_selector.selector)
|
|
||||||
if is_non_whitespace(class_selector.class_name):
|
|
||||||
x.add_condition(
|
|
||||||
"%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
|
|
||||||
% ('@class', xpath_lower_case('@class'), self.xpath_literal(
|
|
||||||
' '+class_selector.class_name.lower()+' ')))
|
|
||||||
else:
|
|
||||||
x.add_condition('0')
|
|
||||||
return x
|
|
||||||
|
|
||||||
def xpath_hash(self, id_selector):
|
|
||||||
"""Translate an ID selector."""
|
|
||||||
x = self.xpath(id_selector.selector)
|
|
||||||
return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
|
|
||||||
(id_selector.id.lower()))
|
|
||||||
|
|
||||||
ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
|
|
||||||
|
|
||||||
NULL_NAMESPACE_REGEX = re.compile(ur'''(name\(\) = ['"])h:''')
|
|
||||||
def fix_namespace(raw):
|
|
||||||
'''
|
|
||||||
cssselect uses name() = 'h:p' to select tags for some CSS selectors (e.g.
|
|
||||||
h|p+h|p).
|
|
||||||
However, since for us the XHTML namespace is the default namespace (with no
|
|
||||||
prefix), name() is the same as local-name(). So this is a hack to
|
|
||||||
workaround the problem.
|
|
||||||
'''
|
|
||||||
return NULL_NAMESPACE_REGEX.sub(ur'\1', raw)
|
|
||||||
|
|
||||||
class CSSSelector(object):
|
|
||||||
|
|
||||||
def __init__(self, css, log=None, namespaces=XPNSMAP):
|
|
||||||
self.namespaces = namespaces
|
|
||||||
self.sel = self.build_selector(css, log)
|
|
||||||
self.css = css
|
|
||||||
self.used_ci_sel = False
|
|
||||||
|
|
||||||
def build_selector(self, css, log, func=css_to_xpath):
|
|
||||||
try:
|
|
||||||
return etree.XPath(fix_namespace(func(css)), namespaces=self.namespaces)
|
|
||||||
except:
|
|
||||||
if log is not None:
|
|
||||||
log.exception('Failed to parse CSS selector: %r'%css)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __call__(self, node, log):
|
|
||||||
if self.sel is None:
|
|
||||||
return []
|
|
||||||
try:
|
|
||||||
ans = self.sel(node)
|
|
||||||
except:
|
|
||||||
log.exception(u'Failed to run CSS selector: %s'%self.css)
|
|
||||||
return []
|
|
||||||
|
|
||||||
if not ans:
|
|
||||||
# Try a case insensitive version
|
|
||||||
if not hasattr(self, 'ci_sel'):
|
|
||||||
self.ci_sel = self.build_selector(self.css, log, ci_css_to_xpath)
|
|
||||||
if self.ci_sel is not None:
|
|
||||||
try:
|
|
||||||
ans = self.ci_sel(node)
|
|
||||||
except:
|
|
||||||
log.exception(u'Failed to run case-insensitive CSS selector: %s'%self.css)
|
|
||||||
return []
|
|
||||||
if ans:
|
|
||||||
if not self.used_ci_sel:
|
|
||||||
log.warn('Interpreting class and id values '
|
|
||||||
'case-insensitively in selector: %s'%self.css)
|
|
||||||
self.used_ci_sel = True
|
|
||||||
return ans
|
|
||||||
|
|
||||||
_selector_cache = {}
|
|
||||||
|
|
||||||
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
|
|
||||||
|
|
||||||
def get_css_selector(raw_selector, log):
|
|
||||||
css = MIN_SPACE_RE.sub(r'\1', raw_selector)
|
|
||||||
ans = _selector_cache.get(css, None)
|
|
||||||
if ans is None:
|
|
||||||
ans = CSSSelector(css, log)
|
|
||||||
_selector_cache[css] = ans
|
|
||||||
return ans
|
|
||||||
|
|
||||||
class Stylizer(object):
|
class Stylizer(object):
|
||||||
STYLESHEETS = WeakKeyDictionary()
|
STYLESHEETS = WeakKeyDictionary()
|
||||||
|
|
||||||
@ -195,13 +96,12 @@ class Stylizer(object):
|
|||||||
if t:
|
if t:
|
||||||
text += u'\n\n' + force_unicode(t, u'utf-8')
|
text += u'\n\n' + force_unicode(t, u'utf-8')
|
||||||
if text:
|
if text:
|
||||||
text = oeb.css_preprocessor(text, add_namespace=True)
|
text = oeb.css_preprocessor(text)
|
||||||
# We handle @import rules separately
|
# We handle @import rules separately
|
||||||
parser.setFetcher(lambda x: ('utf-8', b''))
|
parser.setFetcher(lambda x: ('utf-8', b''))
|
||||||
stylesheet = parser.parseString(text, href=cssname,
|
stylesheet = parser.parseString(text, href=cssname,
|
||||||
validate=False)
|
validate=False)
|
||||||
parser.setFetcher(self._fetch_css_file)
|
parser.setFetcher(self._fetch_css_file)
|
||||||
stylesheet.namespaces['h'] = XHTML_NS
|
|
||||||
for rule in stylesheet.cssRules:
|
for rule in stylesheet.cssRules:
|
||||||
if rule.type == rule.IMPORT_RULE:
|
if rule.type == rule.IMPORT_RULE:
|
||||||
ihref = item.abshref(rule.href)
|
ihref = item.abshref(rule.href)
|
||||||
@ -244,10 +144,9 @@ class Stylizer(object):
|
|||||||
for w, x in csses.items():
|
for w, x in csses.items():
|
||||||
if x:
|
if x:
|
||||||
try:
|
try:
|
||||||
text = XHTML_CSS_NAMESPACE + x
|
text = x
|
||||||
stylesheet = parser.parseString(text, href=cssname,
|
stylesheet = parser.parseString(text, href=cssname,
|
||||||
validate=False)
|
validate=False)
|
||||||
stylesheet.namespaces['h'] = XHTML_NS
|
|
||||||
stylesheets.append(stylesheet)
|
stylesheets.append(stylesheet)
|
||||||
except:
|
except:
|
||||||
self.logger.exception('Failed to parse %s, ignoring.'%w)
|
self.logger.exception('Failed to parse %s, ignoring.'%w)
|
||||||
@ -275,13 +174,17 @@ class Stylizer(object):
|
|||||||
rules.sort()
|
rules.sort()
|
||||||
self.rules = rules
|
self.rules = rules
|
||||||
self._styles = {}
|
self._styles = {}
|
||||||
pseudo_pat = re.compile(ur':{1,2}(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I)
|
pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
|
||||||
|
select = Select(tree, ignore_inappropriate_pseudo_classes=True)
|
||||||
|
|
||||||
for _, _, cssdict, text, _ in rules:
|
for _, _, cssdict, text, _ in rules:
|
||||||
fl = pseudo_pat.search(text)
|
fl = pseudo_pat.search(text)
|
||||||
if fl is not None:
|
try:
|
||||||
text = text.replace(fl.group(), '')
|
matches = select(text)
|
||||||
selector = get_css_selector(text, self.oeb.log)
|
except SelectorError as err:
|
||||||
matches = selector(tree, self.logger)
|
self.log.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
|
||||||
|
continue
|
||||||
|
|
||||||
if fl is not None:
|
if fl is not None:
|
||||||
fl = fl.group(1)
|
fl = fl.group(1)
|
||||||
if fl == 'first-letter' and getattr(self.oeb,
|
if fl == 'first-letter' and getattr(self.oeb,
|
||||||
@ -486,9 +389,7 @@ class Style(object):
|
|||||||
result = None
|
result = None
|
||||||
if name in self._style:
|
if name in self._style:
|
||||||
result = self._style[name]
|
result = self._style[name]
|
||||||
if (result == 'inherit'
|
if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())):
|
||||||
or (result is None and name in INHERITED
|
|
||||||
and self._has_parent())):
|
|
||||||
stylizer = self._stylizer
|
stylizer = self._stylizer
|
||||||
result = stylizer.style(self._element.getparent())._get(name)
|
result = stylizer.style(self._element.getparent())._get(name)
|
||||||
if result is None:
|
if result is None:
|
||||||
|
@ -14,12 +14,13 @@ from collections import OrderedDict
|
|||||||
|
|
||||||
from lxml.etree import XPath as _XPath
|
from lxml.etree import XPath as _XPath
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from cssselect import HTMLTranslator
|
|
||||||
|
|
||||||
|
from calibre import as_unicode
|
||||||
from calibre.ebooks.epub import rules
|
from calibre.ebooks.epub import rules
|
||||||
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
|
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
|
||||||
urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
|
urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
|
||||||
from calibre.ebooks.oeb.polish.split import do_split
|
from calibre.ebooks.oeb.polish.split import do_split
|
||||||
|
from css_selectors import Select, SelectorError
|
||||||
|
|
||||||
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
|
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
|
||||||
|
|
||||||
@ -75,9 +76,7 @@ class Split(object):
|
|||||||
|
|
||||||
def find_page_breaks(self, item):
|
def find_page_breaks(self, item):
|
||||||
if self.page_break_selectors is None:
|
if self.page_break_selectors is None:
|
||||||
from calibre.ebooks.oeb.stylizer import fix_namespace
|
self.page_break_selectors = set()
|
||||||
css_to_xpath = HTMLTranslator().css_to_xpath
|
|
||||||
self.page_break_selectors = set([])
|
|
||||||
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
|
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
|
||||||
OEB_STYLES]
|
OEB_STYLES]
|
||||||
for rule in rules(stylesheets):
|
for rule in rules(stylesheets):
|
||||||
@ -87,31 +86,37 @@ class Split(object):
|
|||||||
'page-break-after'), 'cssText', '').strip().lower()
|
'page-break-after'), 'cssText', '').strip().lower()
|
||||||
try:
|
try:
|
||||||
if before and before not in {'avoid', 'auto', 'inherit'}:
|
if before and before not in {'avoid', 'auto', 'inherit'}:
|
||||||
self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
|
self.page_break_selectors.add((rule.selectorText, True))
|
||||||
True))
|
|
||||||
if self.remove_css_pagebreaks:
|
if self.remove_css_pagebreaks:
|
||||||
rule.style.removeProperty('page-break-before')
|
rule.style.removeProperty('page-break-before')
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
if after and after not in {'avoid', 'auto', 'inherit'}:
|
if after and after not in {'avoid', 'auto', 'inherit'}:
|
||||||
self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
|
self.page_break_selectors.add((rule.selectorText, False))
|
||||||
False))
|
|
||||||
if self.remove_css_pagebreaks:
|
if self.remove_css_pagebreaks:
|
||||||
rule.style.removeProperty('page-break-after')
|
rule.style.removeProperty('page-break-after')
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
page_breaks = set([])
|
page_breaks = set()
|
||||||
for selector, before in self.page_break_selectors:
|
select = Select(item.data)
|
||||||
body = item.data.xpath('//h:body', namespaces=NAMESPACES)
|
if not self.page_break_selectors:
|
||||||
if not body:
|
return [], []
|
||||||
continue
|
body = item.data.xpath('//h:body', namespaces=NAMESPACES)
|
||||||
for elem in selector(body[0]):
|
if not body:
|
||||||
if elem not in body:
|
return [], []
|
||||||
elem.set('pb_before', '1' if before else '0')
|
descendants = frozenset(body[0].iterdescendants('*'))
|
||||||
page_breaks.add(elem)
|
|
||||||
|
|
||||||
for i, elem in enumerate(item.data.iter()):
|
for selector, before in self.page_break_selectors:
|
||||||
|
try:
|
||||||
|
for elem in select(selector):
|
||||||
|
if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
|
||||||
|
elem.set('pb_before', '1' if before else '0')
|
||||||
|
page_breaks.add(elem)
|
||||||
|
except SelectorError as err:
|
||||||
|
self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
|
||||||
|
|
||||||
|
for i, elem in enumerate(item.data.iter('*')):
|
||||||
try:
|
try:
|
||||||
elem.set('pb_order', str(i))
|
elem.set('pb_order', str(i))
|
||||||
except TypeError: # Cant set attributes on comment nodes etc.
|
except TypeError: # Cant set attributes on comment nodes etc.
|
||||||
@ -358,8 +363,7 @@ class FlowSplitter(object):
|
|||||||
len(self.split_trees), size/1024.))
|
len(self.split_trees), size/1024.))
|
||||||
else:
|
else:
|
||||||
self.log.debug(
|
self.log.debug(
|
||||||
'\t\t\tSplit tree still too large: %d KB' %
|
'\t\t\tSplit tree still too large: %d KB' % (size/1024.))
|
||||||
(size/1024.))
|
|
||||||
self.split_to_size(t)
|
self.split_to_size(t)
|
||||||
|
|
||||||
def find_split_point(self, root):
|
def find_split_point(self, root):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user