Revert CSS pipeline changes, as python functions in lxml are broken, badly

This commit is contained in:
Kovid Goyal 2011-08-24 14:45:21 -06:00
parent 23f4463f27
commit 5119925922
3 changed files with 44 additions and 1059 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -504,6 +504,9 @@ class Indexer(object): # {{{
else: else:
self.indices = self.create_book_index() self.indices = self.create_book_index()
if not self.indices:
raise ValueError('No valid entries in TOC, cannot generate index')
self.records.append(self.create_index_record()) self.records.append(self.create_index_record())
self.records.insert(0, self.create_header()) self.records.insert(0, self.create_header())
self.records.extend(self.cncx.records) self.records.extend(self.cncx.records)

View File

@@ -27,7 +27,6 @@ from calibre import force_unicode
from calibre.ebooks import unit_convert from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
from calibre.ebooks.cssselect import css_to_xpath_no_case
cssutils_log.setLevel(logging.WARN) cssutils_log.setLevel(logging.WARN)
@@ -99,71 +98,32 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
'x-large', 'xx-large']) 'x-large', 'xx-large'])
class CSSSelector(object): class CSSSelector(etree.XPath):
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:") LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:")
def __init__(self, css, namespaces=XPNSMAP): def __init__(self, css, namespaces=XPNSMAP):
css = self.MIN_SPACE_RE.sub(r'\1', css)
if isinstance(css, unicode): if isinstance(css, unicode):
# Workaround for bug in lxml on windows/OS X that causes a massive # Workaround for bug in lxml on windows/OS X that causes a massive
# memory leak with non ASCII selectors # memory leak with non ASCII selectors
css = css.encode('ascii', 'ignore').decode('ascii') css = css.encode('ascii', 'ignore').decode('ascii')
try: try:
path = self.LOCAL_NAME_RE.sub(r"local-name() = '", css_to_xpath(css)) path = css_to_xpath(css)
self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces) except UnicodeEncodeError: # Bug in css_to_xpath
except: path = '/'
self.sel1 = lambda x: [] except NotImplementedError: # Probably a subselect like :hover
try: path = '/'
path = self.LOCAL_NAME_RE.sub(r"local-name() = '", path = self.LOCAL_NAME_RE.sub(r"local-name() = '", path)
css_to_xpath_no_case(css)) etree.XPath.__init__(self, path, namespaces=namespaces)
self.sel2 = etree.XPath(path, namespaces=namespaces)
except:
self.sel2 = lambda x: []
self.sel2_use_logged = False
self.css = css self.css = css
def __call__(self, node, log):
try:
ans = self.sel1(node)
except (AssertionError, ExpressionError, etree.XPathSyntaxError,
NameError, # thrown on OS X instead of SelectorSyntaxError
SelectorSyntaxError):
return []
if not ans:
try:
ans = self.sel2(node)
except:
return []
else:
if ans and not self.sel2_use_logged:
self.sel2_use_logged = True
log.warn('Interpreting class and tag selectors case'
' insensitively in the CSS selector: %s'%self.css)
return ans
def __repr__(self): def __repr__(self):
return '<%s %s for %r>' % ( return '<%s %s for %r>' % (
self.__class__.__name__, self.__class__.__name__,
hex(abs(id(self)))[2:], hex(abs(id(self)))[2:],
self.css) self.css)
_selector_cache = {}
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
def get_css_selector(raw_selector):
css = MIN_SPACE_RE.sub(r'\1', raw_selector)
if isinstance(css, unicode):
# Workaround for bug in lxml on windows/OS X that causes a massive
# memory leak with non ASCII selectors
css = css.encode('ascii', 'ignore').decode('ascii')
ans = _selector_cache.get(css, None)
if ans is None:
ans = CSSSelector(css)
_selector_cache[css] = ans
return ans
class Stylizer(object): class Stylizer(object):
STYLESHEETS = WeakKeyDictionary() STYLESHEETS = WeakKeyDictionary()
@@ -263,12 +223,41 @@ class Stylizer(object):
rules.sort() rules.sort()
self.rules = rules self.rules = rules
self._styles = {} self._styles = {}
class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE)
capital_sel_pat = re.compile(r'h|[A-Z]+')
for _, _, cssdict, text, _ in rules: for _, _, cssdict, text, _ in rules:
fl = ':first-letter' in text fl = ':first-letter' in text
if fl: if fl:
text = text.replace(':first-letter', '') text = text.replace(':first-letter', '')
selector = get_css_selector(text) try:
matches = selector(tree, self.logger) selector = CSSSelector(text)
except (AssertionError, ExpressionError, etree.XPathSyntaxError,
NameError, # thrown on OS X instead of SelectorSyntaxError
SelectorSyntaxError):
continue
try:
matches = selector(tree)
except etree.XPathEvalError:
continue
if not matches:
ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text)
if ntext != text:
self.logger.warn('Transformed CSS selector', text, 'to',
ntext)
selector = CSSSelector(ntext)
matches = selector(tree)
if not matches and class_sel_pat.match(text) and text.lower() != text:
found = False
ltext = text.lower()
for x in tree.xpath('//*[@class]'):
if ltext.endswith('.'+x.get('class').lower()):
matches.append(x)
found = True
if found:
self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s'
%(text, item.href))
if fl: if fl:
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
E = ElementMaker(namespace=XHTML_NS) E = ElementMaker(namespace=XHTML_NS)