mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: More robust handling of case insensitive tag and class css selectors
This commit is contained in:
parent
ac30f8edd4
commit
2afef9211e
1007
src/calibre/ebooks/cssselect.py
Normal file
1007
src/calibre/ebooks/cssselect.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -27,6 +27,7 @@ from calibre import force_unicode
|
|||||||
from calibre.ebooks import unit_convert
|
from calibre.ebooks import unit_convert
|
||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
|
||||||
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
|
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
|
||||||
|
from calibre.ebooks.cssselect import css_to_xpath_no_case
|
||||||
|
|
||||||
cssutils_log.setLevel(logging.WARN)
|
cssutils_log.setLevel(logging.WARN)
|
||||||
|
|
||||||
@ -98,32 +99,72 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
|
|||||||
'x-large', 'xx-large'])
|
'x-large', 'xx-large'])
|
||||||
|
|
||||||
|
|
||||||
class CSSSelector(etree.XPath):
|
class CSSSelector(object):
|
||||||
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
|
|
||||||
LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:")
|
LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:")
|
||||||
|
|
||||||
def __init__(self, css, namespaces=XPNSMAP):
|
def __init__(self, css, namespaces=XPNSMAP):
|
||||||
css = self.MIN_SPACE_RE.sub(r'\1', css)
|
|
||||||
if isinstance(css, unicode):
|
if isinstance(css, unicode):
|
||||||
# Workaround for bug in lxml on windows/OS X that causes a massive
|
# Workaround for bug in lxml on windows/OS X that causes a massive
|
||||||
# memory leak with non ASCII selectors
|
# memory leak with non ASCII selectors
|
||||||
css = css.encode('ascii', 'ignore').decode('ascii')
|
css = css.encode('ascii', 'ignore').decode('ascii')
|
||||||
try:
|
try:
|
||||||
path = css_to_xpath(css)
|
path = self.LOCAL_NAME_RE.sub(r"local-name() = '", css_to_xpath(css))
|
||||||
except UnicodeEncodeError: # Bug in css_to_xpath
|
self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces)
|
||||||
path = '/'
|
except:
|
||||||
except NotImplementedError: # Probably a subselect like :hover
|
self.sel1 = lambda x: []
|
||||||
path = '/'
|
try:
|
||||||
path = self.LOCAL_NAME_RE.sub(r"local-name() = '", path)
|
path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
|
||||||
etree.XPath.__init__(self, path, namespaces=namespaces)
|
css_to_xpath_no_case(css))
|
||||||
|
self.sel2 = etree.XPath(path, namespaces=namespaces)
|
||||||
|
except:
|
||||||
|
raise
|
||||||
|
self.sel2 = lambda x: []
|
||||||
|
self.sel2_use_logged = False
|
||||||
self.css = css
|
self.css = css
|
||||||
|
|
||||||
|
def __call__(self, node, log):
|
||||||
|
try:
|
||||||
|
ans = self.sel1(node)
|
||||||
|
except (AssertionError, ExpressionError, etree.XPathSyntaxError,
|
||||||
|
NameError, # thrown on OS X instead of SelectorSyntaxError
|
||||||
|
SelectorSyntaxError):
|
||||||
|
return []
|
||||||
|
|
||||||
|
if not ans:
|
||||||
|
try:
|
||||||
|
ans = self.sel2(node)
|
||||||
|
except:
|
||||||
|
return []
|
||||||
|
else:
|
||||||
|
if ans and not self.sel2_use_logged:
|
||||||
|
self.sel2_use_logged = True
|
||||||
|
log.warn('Interpreting class and tag selectors case'
|
||||||
|
' insensitively in the CSS selector: %s'%self.css)
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<%s %s for %r>' % (
|
return '<%s %s for %r>' % (
|
||||||
self.__class__.__name__,
|
self.__class__.__name__,
|
||||||
hex(abs(id(self)))[2:],
|
hex(abs(id(self)))[2:],
|
||||||
self.css)
|
self.css)
|
||||||
|
|
||||||
|
_selector_cache = {}
|
||||||
|
|
||||||
|
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
|
||||||
|
|
||||||
|
def get_css_selector(raw_selector):
|
||||||
|
css = MIN_SPACE_RE.sub(r'\1', raw_selector)
|
||||||
|
if isinstance(css, unicode):
|
||||||
|
# Workaround for bug in lxml on windows/OS X that causes a massive
|
||||||
|
# memory leak with non ASCII selectors
|
||||||
|
css = css.encode('ascii', 'ignore').decode('ascii')
|
||||||
|
ans = _selector_cache.get(css, None)
|
||||||
|
if ans is None:
|
||||||
|
ans = CSSSelector(css)
|
||||||
|
_selector_cache[css] = ans
|
||||||
|
return ans
|
||||||
|
|
||||||
class Stylizer(object):
|
class Stylizer(object):
|
||||||
STYLESHEETS = WeakKeyDictionary()
|
STYLESHEETS = WeakKeyDictionary()
|
||||||
@ -223,41 +264,12 @@ class Stylizer(object):
|
|||||||
rules.sort()
|
rules.sort()
|
||||||
self.rules = rules
|
self.rules = rules
|
||||||
self._styles = {}
|
self._styles = {}
|
||||||
class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE)
|
|
||||||
capital_sel_pat = re.compile(r'h|[A-Z]+')
|
|
||||||
for _, _, cssdict, text, _ in rules:
|
for _, _, cssdict, text, _ in rules:
|
||||||
fl = ':first-letter' in text
|
fl = ':first-letter' in text
|
||||||
if fl:
|
if fl:
|
||||||
text = text.replace(':first-letter', '')
|
text = text.replace(':first-letter', '')
|
||||||
try:
|
selector = get_css_selector(text)
|
||||||
selector = CSSSelector(text)
|
matches = selector(tree, self.logger)
|
||||||
except (AssertionError, ExpressionError, etree.XPathSyntaxError,
|
|
||||||
NameError, # thrown on OS X instead of SelectorSyntaxError
|
|
||||||
SelectorSyntaxError):
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
matches = selector(tree)
|
|
||||||
except etree.XPathEvalError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not matches:
|
|
||||||
ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text)
|
|
||||||
if ntext != text:
|
|
||||||
self.logger.warn('Transformed CSS selector', text, 'to',
|
|
||||||
ntext)
|
|
||||||
selector = CSSSelector(ntext)
|
|
||||||
matches = selector(tree)
|
|
||||||
|
|
||||||
if not matches and class_sel_pat.match(text) and text.lower() != text:
|
|
||||||
found = False
|
|
||||||
ltext = text.lower()
|
|
||||||
for x in tree.xpath('//*[@class]'):
|
|
||||||
if ltext.endswith('.'+x.get('class').lower()):
|
|
||||||
matches.append(x)
|
|
||||||
found = True
|
|
||||||
if found:
|
|
||||||
self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s'
|
|
||||||
%(text, item.href))
|
|
||||||
if fl:
|
if fl:
|
||||||
from lxml.builder import ElementMaker
|
from lxml.builder import ElementMaker
|
||||||
E = ElementMaker(namespace=XHTML_NS)
|
E = ElementMaker(namespace=XHTML_NS)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user