diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py new file mode 100644 index 0000000000..c4167a8e4d --- /dev/null +++ b/src/calibre/ebooks/cssselect.py @@ -0,0 +1,1007 @@ +"""CSS Selectors based on XPath. + +This module supports selecting XML/HTML tags based on CSS selectors. +See the `CSSSelector` class for details. +""" + +import re +from lxml import etree + +__all__ = ['SelectorSyntaxError', 'ExpressionError', + 'CSSSelector'] + +try: + _basestring = basestring +except NameError: + _basestring = str + +class SelectorSyntaxError(SyntaxError): + pass + +class ExpressionError(RuntimeError): + pass + +class CSSSelector(etree.XPath): + """A CSS selector. + + Usage:: + + >>> from lxml import etree, cssselect + >>> select = cssselect.CSSSelector("a tag > child") + + >>> root = etree.XML("TEXT") + >>> [ el.tag for el in select(root) ] + ['child'] + + To use CSS namespaces, you need to pass a prefix-to-namespace + mapping as ``namespaces`` keyword argument:: + + >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + >>> select_ns = cssselect.CSSSelector('root > rdf|Description', + ... namespaces={'rdf': rdfns}) + + >>> rdf = etree.XML(( + ... '' + ... 'blah' + ... '') % rdfns) + >>> [(el.tag, el.text) for el in select_ns(rdf)] + [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] + """ + def __init__(self, css, namespaces=None): + path = css_to_xpath_no_case(css) + etree.XPath.__init__(self, path, namespaces=namespaces) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) + +############################## +## Token objects: + +try: + _unicode = unicode + _unichr = unichr +except NameError: + # Python 3 + _unicode = str + _unichr = chr + +class _UniToken(_unicode): + def __new__(cls, contents, pos): + obj = _unicode.__new__(cls, contents) + obj.pos = pos + return obj + + def __repr__(self): + return '%s(%s, %r)' % ( + self.__class__.__name__, + _unicode.__repr__(self), + self.pos) + +class Symbol(_UniToken): + pass + +class String(_UniToken): + pass + +class Token(_UniToken): + pass + +############################################################ +## Parsing +############################################################ + +############################## +## Syntax objects: + +class Class(object): + """ + Represents selector.class_name + """ + + def __init__(self, selector, class_name): + self.selector = selector + # Kovid: Lowercased + self.class_name = class_name.lower() + + def __repr__(self): + return '%s[%r.%s]' % ( + self.__class__.__name__, + self.selector, + self.class_name) + + def xpath(self): + sel_xpath = self.selector.xpath() + # Kovid: Lowercased + sel_xpath.add_condition( + "contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), %s)" % xpath_literal(' '+self.class_name+' ')) + return sel_xpath + +class Function(object): + """ + Represents selector:name(expr) + """ + + unsupported = [ + 'target', 'lang', 'enabled', 'disabled',] + + def __init__(self, selector, type, name, expr): + self.selector = selector + self.type = type + self.name = name + self.expr = expr + + def __repr__(self): + return '%s[%r%s%s(%r)]' % ( + self.__class__.__name__, + self.selector, + self.type, self.name, self.expr) + + def xpath(self): + sel_path = self.selector.xpath() + if self.name in self.unsupported: + raise ExpressionError( + "The pseudo-class %r is not supported" % self.name) + method = '_xpath_' + self.name.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The pseudo-class %r is unknown" % self.name) + method = getattr(self, method) + return method(sel_path, self.expr) + + def _xpath_nth_child(self, xpath, expr, last=False, + add_name_test=True): + a, b = parse_series(expr) + if not a and not b and not last: + # a=0 means nothing is returned... + xpath.add_condition('false() and position() = 0') + return xpath + if add_name_test: + xpath.add_name_test() + xpath.add_star_prefix() + if a == 0: + if last: + b = 'last() - %s' % b + xpath.add_condition('position() = %s' % b) + return xpath + if last: + # FIXME: I'm not sure if this is right + a = -a + b = -b + if b > 0: + b_neg = str(-b) + else: + b_neg = '+%s' % (-b) + if a != 1: + expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + else: + expr = [] + if b >= 0: + expr.append('position() >= %s' % b) + elif b < 0 and last: + expr.append('position() < (last() %s)' % b) + expr = ' and '.join(expr) + if expr: + xpath.add_condition(expr) + return xpath + # FIXME: handle an+b, odd, even + # an+b means every-a, plus b, e.g., 2n+1 means odd + # 0n+b means b + # n+0 means a=1, i.e., all elements + # an means every a elements, i.e., 2n means even + # -n means -1n + # -1n+6 means elements 6 and previous + + def _xpath_nth_last_child(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True) + + def _xpath_nth_of_type(self, xpath, expr): + if xpath.element == '*': + raise NotImplementedError( + "*:nth-of-type() is not implemented") + return self._xpath_nth_child(xpath, expr, add_name_test=False) + + def _xpath_nth_last_of_type(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False) + + def _xpath_contains(self, xpath, expr): + # text content, minus tags, must contain expr + if isinstance(expr, Element): + expr = expr._format_element() + xpath.add_condition('contains(css:lower-case(string(.)), %s)' + % xpath_literal(expr.lower())) + # FIXME: Currently case insensitive matching doesn't seem to be happening + return xpath + + def _xpath_not(self, xpath, expr): + # everything for which not expr applies + expr = expr.xpath() + cond = expr.condition + # FIXME: should I do something about element_path? + xpath.add_condition('not(%s)' % cond) + return xpath + +def _make_lower_case(context, s): + return s.lower() + +ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') +ns.prefix = 'css' +ns['lower-case'] = _make_lower_case + +class Pseudo(object): + """ + Represents selector:ident + """ + + unsupported = ['indeterminate', 'first-line', 'first-letter', + 'selection', 'before', 'after', 'link', 'visited', + 'active', 'focus', 'hover'] + + def __init__(self, element, type, ident): + self.element = element + assert type in (':', '::') + self.type = type + self.ident = ident + + def __repr__(self): + return '%s[%r%s%s]' % ( + self.__class__.__name__, + self.element, + self.type, self.ident) + + def xpath(self): + el_xpath = self.element.xpath() + if self.ident in self.unsupported: + raise ExpressionError( + "The pseudo-class %r is unsupported" % self.ident) + method = '_xpath_' + self.ident.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The pseudo-class %r is unknown" % self.ident) + method = getattr(self, method) + el_xpath = method(el_xpath) + return el_xpath + + def _xpath_checked(self, xpath): + # FIXME: is this really all the elements? + xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')") + return xpath + + def _xpath_root(self, xpath): + # if this element is the root element + raise NotImplementedError + + def _xpath_first_child(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_last_child(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = last()') + return xpath + + def _xpath_first_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:first-of-type is not implemented") + xpath.add_star_prefix() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_last_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:last-of-type is not implemented") + xpath.add_star_prefix() + xpath.add_condition('position() = last()') + return xpath + + def _xpath_only_child(self, xpath): + xpath.add_name_test() + xpath.add_star_prefix() + xpath.add_condition('last() = 1') + return xpath + + def _xpath_only_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:only-of-type is not implemented") + xpath.add_condition('last() = 1') + return xpath + + def _xpath_empty(self, xpath): + xpath.add_condition("not(*) and not(normalize-space())") + return xpath + +class Attrib(object): + """ + Represents selector[namespace|attrib operator value] + """ + + def __init__(self, selector, namespace, attrib, operator, value): + self.selector = selector + self.namespace = namespace + self.attrib = attrib + self.operator = operator + self.value = value + + def __repr__(self): + if self.operator == 'exists': + return '%s[%r[%s]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib()) + else: + return '%s[%r[%s %s %r]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib(), + self.operator, + self.value) + + def _format_attrib(self): + if self.namespace == '*': + return self.attrib + else: + return '%s|%s' % (self.namespace, self.attrib) + + def _xpath_attrib(self): + # FIXME: if attrib is *? + if self.namespace == '*': + return '@' + self.attrib + else: + return '@%s:%s' % (self.namespace, self.attrib) + + def xpath(self): + path = self.selector.xpath() + attrib = self._xpath_attrib() + value = self.value + if self.operator == 'exists': + assert not value + path.add_condition(attrib) + elif self.operator == '=': + path.add_condition('%s = %s' % (attrib, + xpath_literal(value))) + elif self.operator == '!=': + # FIXME: this seems like a weird hack... + if value: + path.add_condition('not(%s) or %s != %s' + % (attrib, attrib, xpath_literal(value))) + else: + path.add_condition('%s != %s' + % (attrib, xpath_literal(value))) + #path.add_condition('%s != %s' % (attrib, xpath_literal(value))) + elif self.operator == '~=': + path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' '))) + elif self.operator == '|=': + # Weird, but true... + path.add_condition('%s = %s or starts-with(%s, %s)' % ( + attrib, xpath_literal(value), + attrib, xpath_literal(value + '-'))) + elif self.operator == '^=': + path.add_condition('starts-with(%s, %s)' % ( + attrib, xpath_literal(value))) + elif self.operator == '$=': + # Oddly there is a starts-with in XPath 1.0, but not ends-with + path.add_condition('substring(%s, string-length(%s)-%s) = %s' + % (attrib, attrib, len(value)-1, xpath_literal(value))) + elif self.operator == '*=': + # FIXME: case sensitive? + path.add_condition('contains(%s, %s)' % ( + attrib, xpath_literal(value))) + else: + assert 0, ("Unknown operator: %r" % self.operator) + return path + +class Element(object): + """ + Represents namespace|element + """ + + def __init__(self, namespace, element): + self.namespace = namespace + self.element = element + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, + self._format_element()) + + def _format_element(self): + if self.namespace == '*': + return self.element + else: + return '%s|%s' % (self.namespace, self.element) + + def xpath(self): + if self.namespace == '*': + el = self.element.lower() + else: + # Kovid: Lowercased + el = '%s:%s' % (self.namespace, self.element.lower()) + return XPathExpr(element=el) + +class Hash(object): + """ + Represents selector#id + """ + + def __init__(self, selector, id): + self.selector = selector + self.id = id + + def __repr__(self): + return '%s[%r#%s]' % ( + self.__class__.__name__, + self.selector, self.id) + + def xpath(self): + path = self.selector.xpath() + path.add_condition('@id = %s' % xpath_literal(self.id)) + return path + +class Or(object): + + def __init__(self, items): + self.items = items + def __repr__(self): + return '%s(%r)' % ( + self.__class__.__name__, + self.items) + + def xpath(self): + paths = [item.xpath() for item in self.items] + return XPathExprOr(paths) + +class CombinedSelector(object): + + _method_mapping = { + ' ': 'descendant', + '>': 'child', + '+': 'direct_adjacent', + '~': 'indirect_adjacent', + } + + def __init__(self, selector, combinator, subselector): + assert selector is not None + self.selector = selector + self.combinator = combinator + self.subselector = subselector + + def __repr__(self): + if self.combinator == ' ': + comb = '' + else: + comb = self.combinator + return '%s[%r %s %r]' % ( + self.__class__.__name__, + self.selector, + comb, + self.subselector) + + def xpath(self): + if self.combinator not in self._method_mapping: + raise ExpressionError( + "Unknown combinator: %r" % self.combinator) + method = '_xpath_' + self._method_mapping[self.combinator] + method = getattr(self, method) + path = self.selector.xpath() + return method(path, self.subselector) + + def _xpath_descendant(self, xpath, sub): + # when sub is a descendant in any way of xpath + xpath.join('/descendant::', sub.xpath()) + return xpath + + def _xpath_child(self, xpath, sub): + # when sub is an immediate child of xpath + xpath.join('/', sub.xpath()) + return xpath + + def _xpath_direct_adjacent(self, xpath, sub): + # when sub immediately follows xpath + xpath.join('/following-sibling::', sub.xpath()) + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_indirect_adjacent(self, xpath, sub): + # when sub comes somewhere after xpath as a sibling + xpath.join('/following-sibling::', sub.xpath()) + return xpath + +############################## +## XPathExpr objects: + +_el_re = re.compile(r'^\w+\s*$', re.UNICODE) +_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) +_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) + +def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'): + if isinstance(css_expr, _basestring): + match = _el_re.search(css_expr) + if match is not None: + # Kovid: Lowercased + return '%s%s' % (prefix, match.group(0).strip().lower()) + match = _id_re.search(css_expr) + if match is not None: + return "%s%s[@id = '%s']" % ( + prefix, match.group(1) or '*', match.group(2)) + match = _class_re.search(css_expr) + if match is not None: + # Kovid: lowercased + return "%s%s[contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), ' %s ')]" % ( + prefix, match.group(1).lower() or '*', match.group(2).lower()) + css_expr = parse(css_expr) + expr = css_expr.xpath() + assert expr is not None, ( + "Got None for xpath expression from %s" % repr(css_expr)) + if prefix: + expr.add_prefix(prefix) + return _unicode(expr) + +class XPathExpr(object): + + def __init__(self, prefix=None, path=None, element='*', condition=None, + star_prefix=False): + self.prefix = prefix + self.path = path + self.element = element + self.condition = condition + self.star_prefix = star_prefix + + def __str__(self): + path = '' + if self.prefix is not None: + path += _unicode(self.prefix) + if self.path is not None: + path += _unicode(self.path) + path += _unicode(self.element) + if self.condition: + path += '[%s]' % self.condition + return path + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, self) + + def add_condition(self, condition): + if self.condition: + self.condition = '%s and (%s)' % (self.condition, condition) + else: + self.condition = condition + + def add_path(self, part): + if self.path is None: + self.path = self.element + else: + self.path += self.element + self.element = part + + def add_prefix(self, prefix): + if self.prefix: + self.prefix = prefix + self.prefix + else: + self.prefix = prefix + + def add_name_test(self): + if self.element == '*': + # We weren't doing a test anyway + return + self.add_condition("name() = %s" % xpath_literal(self.element)) + self.element = '*' + + def add_star_prefix(self): + """ + Adds a /* prefix if there is no prefix. This is when you need + to keep context's constrained to a single parent. + """ + if self.path: + self.path += '*/' + else: + self.path = '*/' + self.star_prefix = True + + def join(self, combiner, other): + prefix = _unicode(self) + prefix += combiner + path = (other.prefix or '') + (other.path or '') + # We don't need a star prefix if we are joining to this other + # prefix; so we'll get rid of it + if other.star_prefix and path == '*/': + path = '' + self.prefix = prefix + self.path = path + self.element = other.element + self.condition = other.condition + +class XPathExprOr(XPathExpr): + """ + Represents |'d expressions. Note that unfortunately it isn't + the union, it's the sum, so duplicate elements will appear. + """ + + def __init__(self, items, prefix=None): + for item in items: + assert item is not None + self.items = items + self.prefix = prefix + + def __str__(self): + prefix = self.prefix or '' + return ' | '.join(["%s%s" % (prefix,i) for i in self.items]) + +split_at_single_quotes = re.compile("('+)").split + +def xpath_literal(s): + if isinstance(s, Element): + # This is probably a symbol that looks like an expression... + s = s._format_element() + else: + s = _unicode(s) + if "'" not in s: + s = "'%s'" % s + elif '"' not in s: + s = '"%s"' % s + else: + s = "concat(%s)" % ','.join([ + (("'" in part) and '"%s"' or "'%s'") % part + for part in split_at_single_quotes(s) if part + ]) + return s + +############################## +## Parsing functions + +def parse(string): + stream = TokenStream(tokenize(string)) + stream.source = string + try: + return parse_selector_group(stream) + except SelectorSyntaxError: + import sys + e = sys.exc_info()[1] + message = "%s at %s -> %r" % ( + e, stream.used, stream.peek()) + e.msg = message + if sys.version_info < (2,6): + e.message = message + e.args = tuple([message]) + raise + +def parse_selector_group(stream): + result = [] + while 1: + result.append(parse_selector(stream)) + if stream.peek() == ',': + stream.next() + else: + break + if len(result) == 1: + return result[0] + else: + return Or(result) + +def parse_selector(stream): + result = parse_simple_selector(stream) + while 1: + peek = stream.peek() + if peek == ',' or peek is None: + return result + elif peek in ('+', '>', '~'): + # A combinator + combinator = stream.next() + else: + combinator = ' ' + consumed = len(stream.used) + next_selector = parse_simple_selector(stream) + if consumed == len(stream.used): + raise SelectorSyntaxError( + "Expected selector, got '%s'" % stream.peek()) + result = CombinedSelector(result, combinator, next_selector) + return result + +def parse_simple_selector(stream): + peek = stream.peek() + if peek != '*' and not isinstance(peek, Symbol): + element = namespace = '*' + else: + next = stream.next() + if next != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % next) + if stream.peek() == '|': + namespace = next + stream.next() + element = stream.next() + if element != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % next) + else: + namespace = '*' + element = next + result = Element(namespace, element) + has_hash = False + while 1: + peek = stream.peek() + if peek == '#': + if has_hash: + # You can't have two hashes + # (FIXME: is there some more general rule I'm missing?) + break + stream.next() + result = Hash(result, stream.next()) + has_hash = True + continue + elif peek == '.': + stream.next() + result = Class(result, stream.next()) + continue + elif peek == '[': + stream.next() + result = parse_attrib(result, stream) + next = stream.next() + if not next == ']': + raise SelectorSyntaxError( + "] expected, got '%s'" % next) + continue + elif peek == ':' or peek == '::': + type = stream.next() + ident = stream.next() + if not isinstance(ident, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % ident) + if stream.peek() == '(': + stream.next() + peek = stream.peek() + if isinstance(peek, String): + selector = stream.next() + elif isinstance(peek, Symbol) and is_int(peek): + selector = int(stream.next()) + else: + # FIXME: parse_simple_selector, or selector, or...? + selector = parse_simple_selector(stream) + next = stream.next() + if not next == ')': + raise SelectorSyntaxError( + "Expected ')', got '%s' and '%s'" + % (next, selector)) + result = Function(result, type, ident, selector) + else: + result = Pseudo(result, type, ident) + continue + else: + if peek == ' ': + stream.next() + break + # FIXME: not sure what "negation" is + return result + +def is_int(v): + try: + int(v) + except ValueError: + return False + else: + return True + +def parse_attrib(selector, stream): + attrib = stream.next() + if stream.peek() == '|': + namespace = attrib + stream.next() + attrib = stream.next() + else: + namespace = '*' + if stream.peek() == ']': + return Attrib(selector, namespace, attrib, 'exists', None) + op = stream.next() + if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): + raise SelectorSyntaxError( + "Operator expected, got '%s'" % op) + value = stream.next() + if not isinstance(value, (Symbol, String)): + raise SelectorSyntaxError( + "Expected string or symbol, got '%s'" % value) + return Attrib(selector, namespace, attrib, op, value) + +def parse_series(s): + """ + Parses things like '1n+2', or 'an+b' generally, returning (a, b) + """ + if isinstance(s, Element): + s = s._format_element() + if not s or s == '*': + # Happens when there's nothing, which the CSS parser thinks of as * + return (0, 0) + if isinstance(s, int): + # Happens when you just get a number + return (0, s) + if s == 'odd': + return (2, 1) + elif s == 'even': + return (2, 0) + elif s == 'n': + return (1, 0) + if 'n' not in s: + # Just a b + return (0, int(s)) + a, b = s.split('n', 1) + if not a: + a = 1 + elif a == '-' or a == '+': + a = int(a+'1') + else: + a = int(a) + if not b: + b = 0 + elif b == '-' or b == '+': + b = int(b+'1') + else: + b = int(b) + return (a, b) + + +############################################################ +## Tokenizing +############################################################ + +_match_whitespace = re.compile(r'\s+', re.UNICODE).match + +_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub + +_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match + +def tokenize(s): + pos = 0 + s = _replace_comments('', s) + while 1: + match = _match_whitespace(s, pos=pos) + if match: + preceding_whitespace_pos = pos + pos = match.end() + else: + preceding_whitespace_pos = 0 + if pos >= len(s): + return + match = _match_count_number(s, pos=pos) + if match and match.group() != 'n': + sym = s[pos:match.end()] + yield Symbol(sym, pos) + pos = match.end() + continue + c = s[pos] + c2 = s[pos:pos+2] + if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): + yield Token(c2, pos) + pos += 2 + continue + if c in '>+~,.*=[]()|:#': + if c in '.#[' and preceding_whitespace_pos > 0: + yield Token(' ', preceding_whitespace_pos) + yield Token(c, pos) + pos += 1 + continue + if c == '"' or c == "'": + # Quoted string + old_pos = pos + sym, pos = tokenize_escaped_string(s, pos) + yield String(sym, old_pos) + continue + old_pos = pos + sym, pos = tokenize_symbol(s, pos) + yield Symbol(sym, old_pos) + continue + +split_at_string_escapes = re.compile(r'(\\(?:%s))' + % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', + '[^A-Fa-f0-9]'])).split + +def unescape_string_literal(literal): + substrings = [] + for substring in split_at_string_escapes(literal): + if not substring: + continue + elif '\\' in substring: + if substring[0] == '\\' and len(substring) > 1: + substring = substring[1:] + if substring[0] in '0123456789ABCDEFabcdef': + # int() correctly ignores the potentially trailing whitespace + substring = _unichr(int(substring, 16)) + else: + raise SelectorSyntaxError( + "Invalid escape sequence %r in string %r" + % (substring.split('\\')[1], literal)) + substrings.append(substring) + return ''.join(substrings) + +def tokenize_escaped_string(s, pos): + quote = s[pos] + assert quote in ('"', "'") + pos = pos+1 + start = pos + while 1: + next = s.find(quote, pos) + if next == -1: + raise SelectorSyntaxError( + "Expected closing %s for string in: %r" + % (quote, s[start:])) + result = s[start:next] + if result.endswith('\\'): + # next quote character is escaped + pos = next+1 + continue + if '\\' in result: + result = unescape_string_literal(result) + return result, next+1 + +_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) + +def tokenize_symbol(s, pos): + start = pos + match = _illegal_symbol.search(s, pos=pos) + if not match: + # Goes to end of s + return s[start:], len(s) + if match.start() == pos: + assert 0, ( + "Unexpected symbol: %r at %s" % (s[pos], pos)) + if not match: + result = s[start:] + pos = len(s) + else: + result = s[start:match.start()] + pos = match.start() + try: + result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') + except UnicodeDecodeError: + import sys + e = sys.exc_info()[1] + raise SelectorSyntaxError( + "Bad symbol %r: %s" % (result, e)) + return result, pos + +class TokenStream(object): + + def __init__(self, tokens, source=None): + self.used = [] + self.tokens = iter(tokens) + self.source = source + self.peeked = None + self._peeking = False + try: + self.next_token = self.tokens.next + except AttributeError: + # Python 3 + self.next_token = self.tokens.__next__ + + def next(self): + if self._peeking: + self._peeking = False + self.used.append(self.peeked) + return self.peeked + else: + try: + next = self.next_token() + self.used.append(next) + return next + except StopIteration: + return None + + def __iter__(self): + return iter(self.next, None) + + def peek(self): + if not self._peeking: + try: + self.peeked = self.next_token() + except StopIteration: + return None + self._peeking = True + return self.peeked diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index f6ff594701..88e074320d 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -27,6 +27,7 @@ from calibre import force_unicode from calibre.ebooks import unit_convert from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize +from calibre.ebooks.cssselect import css_to_xpath_no_case cssutils_log.setLevel(logging.WARN) @@ -98,32 +99,72 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large']) -class CSSSelector(etree.XPath): - MIN_SPACE_RE = re.compile(r' *([>~+]) *') +class CSSSelector(object): + LOCAL_NAME_RE = re.compile(r"(?' % ( self.__class__.__name__, hex(abs(id(self)))[2:], self.css) +_selector_cache = {} + +MIN_SPACE_RE = re.compile(r' *([>~+]) *') + +def get_css_selector(raw_selector): + css = MIN_SPACE_RE.sub(r'\1', raw_selector) + if isinstance(css, unicode): + # Workaround for bug in lxml on windows/OS X that causes a massive + # memory leak with non ASCII selectors + css = css.encode('ascii', 'ignore').decode('ascii') + ans = _selector_cache.get(css, None) + if ans is None: + ans = CSSSelector(css) + _selector_cache[css] = ans + return ans class Stylizer(object): STYLESHEETS = WeakKeyDictionary() @@ -223,41 +264,12 @@ class Stylizer(object): rules.sort() self.rules = rules self._styles = {} - class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE) - capital_sel_pat = re.compile(r'h|[A-Z]+') for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') - try: - selector = CSSSelector(text) - except (AssertionError, ExpressionError, etree.XPathSyntaxError, - NameError, # thrown on OS X instead of SelectorSyntaxError - SelectorSyntaxError): - continue - try: - matches = selector(tree) - except etree.XPathEvalError: - continue - - if not matches: - ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text) - if ntext != text: - self.logger.warn('Transformed CSS selector', text, 'to', - ntext) - selector = CSSSelector(ntext) - matches = selector(tree) - - if not matches and class_sel_pat.match(text) and text.lower() != text: - found = False - ltext = text.lower() - for x in tree.xpath('//*[@class]'): - if ltext.endswith('.'+x.get('class').lower()): - matches.append(x) - found = True - if found: - self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s' - %(text, item.href)) + selector = get_css_selector(text) + matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS)