diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py
new file mode 100644
index 0000000000..c4167a8e4d
--- /dev/null
+++ b/src/calibre/ebooks/cssselect.py
@@ -0,0 +1,1007 @@
+"""CSS Selectors based on XPath.
+
+This module supports selecting XML/HTML tags based on CSS selectors.
+See the `CSSSelector` class for details.
+"""
+
+import re
+from lxml import etree
+
+__all__ = ['SelectorSyntaxError', 'ExpressionError',
+ 'CSSSelector']
+
+# Python 2/3 compatibility shim: ``basestring`` only exists on Python 2, so
+# fall back to ``str`` when looking the name up raises NameError.
+try:
+ _basestring = basestring
+except NameError:
+ _basestring = str
+
+class SelectorSyntaxError(SyntaxError):
+    """Raised by the tokenizer and parser when a CSS selector is malformed."""
+
+class ExpressionError(RuntimeError):
+    """Raised when a parsed selector cannot be translated into XPath."""
+
+class CSSSelector(etree.XPath):
+ """A CSS selector compiled to an XPath expression.
+
+ The selector is translated by ``css_to_xpath_no_case``, which
+ lower-cases class names and (in most code paths) element names, so
+ upper-case tag names in a document will generally not match.
+
+ Usage::
+
+ >>> from lxml import etree
+ >>> from calibre.ebooks import cssselect
+ >>> select = cssselect.CSSSelector("a tag > child")
+
+ >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
+ >>> [ el.tag for el in select(root) ]
+ ['child']
+
+ To use CSS namespaces, you need to pass a prefix-to-namespace
+ mapping as ``namespaces`` keyword argument::
+
+ >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+ >>> select_ns = cssselect.CSSSelector('root > rdf|description',
+ ... namespaces={'rdf': rdfns})
+
+ >>> rdf = etree.XML((
+ ... '<root xmlns:rdf="%s">'
+ ... '<rdf:description>blah</rdf:description>'
+ ... '</root>') % rdfns)
+ >>> [(el.tag, el.text) for el in select_ns(rdf)]
+ [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}description', 'blah')]
+ """
+ def __init__(self, css, namespaces=None):
+ # Translate the CSS text to an XPath string, then let etree.XPath
+ # compile it; the original CSS is kept only for __repr__.
+ path = css_to_xpath_no_case(css)
+ etree.XPath.__init__(self, path, namespaces=namespaces)
+ self.css = css
+
+ def __repr__(self):
+ # Class name, the object's id in hex (without the '0x'), and the CSS.
+ return '<%s %s for %r>' % (
+ self.__class__.__name__,
+ hex(abs(id(self)))[2:],
+ self.css)
+
+##############################
+## Token objects:
+
+# Python 2/3 compatibility shim for the text type and its code-point
+# constructor: use ``unicode``/``unichr`` when they exist, otherwise the
+# Python 3 equivalents ``str``/``chr``.
+try:
+ _unicode = unicode
+ _unichr = unichr
+except NameError:
+ # Python 3
+ _unicode = str
+ _unichr = chr
+
+class _UniToken(_unicode):
+    """Text token that also remembers its offset (``pos``) in the source."""
+
+    def __new__(cls, contents, pos):
+        token = _unicode.__new__(cls, contents)
+        token.pos = pos
+        return token
+
+    def __repr__(self):
+        text = _unicode.__repr__(self)
+        return '%s(%s, %r)' % (self.__class__.__name__, text, self.pos)
+
+class Symbol(_UniToken):
+    """Token for an identifier or an ``an+b`` count expression."""
+
+class String(_UniToken):
+    """Token holding the unescaped contents of a quoted string."""
+
+class Token(_UniToken):
+    """Token for punctuation and operators (``>``, ``~=``, ``::`` ...)."""
+
+############################################################
+## Parsing
+############################################################
+
+##############################
+## Syntax objects:
+
+class Class(object):
+    """
+    Represents selector.class_name
+    """
+
+    def __init__(self, selector, class_name):
+        self.selector = selector
+        # Kovid: Lowercased
+        self.class_name = class_name.lower()
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        return '%s[%r.%s]' % (name, self.selector, self.class_name)
+
+    def xpath(self):
+        """Add a case-insensitive whole-word test on ``@class``."""
+        path = self.selector.xpath()
+        # Pad with spaces so only complete class names match.
+        needle = xpath_literal(' ' + self.class_name + ' ')
+        # Kovid: Lowercased
+        condition = (
+            "contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), %s)"
+            % needle)
+        path.add_condition(condition)
+        return path
+
+class Function(object):
+    """
+    Represents selector:name(expr)
+
+    ``expr`` is whatever the parser found between the parentheses: a
+    ``String``, an ``int``, or a parsed simple selector.
+    """
+
+    unsupported = [
+        'target', 'lang', 'enabled', 'disabled',]
+
+    def __init__(self, selector, type, name, expr):
+        self.selector = selector
+        self.type = type
+        self.name = name
+        self.expr = expr
+
+    def __repr__(self):
+        return '%s[%r%s%s(%r)]' % (
+            self.__class__.__name__,
+            self.selector,
+            self.type, self.name, self.expr)
+
+    def xpath(self):
+        """Dispatch to the ``_xpath_<name>`` handler for this pseudo-class.
+
+        Raises ExpressionError for unsupported or unknown names.
+        """
+        sel_path = self.selector.xpath()
+        if self.name in self.unsupported:
+            raise ExpressionError(
+                "The pseudo-class %r is not supported" % self.name)
+        method = '_xpath_' + self.name.replace('-', '_')
+        if not hasattr(self, method):
+            raise ExpressionError(
+                "The pseudo-class %r is unknown" % self.name)
+        method = getattr(self, method)
+        return method(sel_path, self.expr)
+
+    def _xpath_nth_child(self, xpath, expr, last=False,
+                         add_name_test=True):
+        """Add the positional test for ``:nth-*(an+b)``.
+
+        An element matches when its 1-based index equals ``a*n + b`` for
+        some integer ``n >= 0``.  With ``last=True`` the index is counted
+        from the end of the sibling list instead of the start.
+
+        Examples: ``2n+1`` is every odd element, ``0n+b`` is just element
+        ``b``, ``-n+6`` is element 6 and everything before it.
+        """
+        a, b = parse_series(expr)
+        if not a and not b and not last:
+            # a=0 means nothing is returned...
+            xpath.add_condition('false() and position() = 0')
+            return xpath
+        if add_name_test:
+            xpath.add_name_test()
+        xpath.add_star_prefix()
+        # 1-based index of the element, counted from whichever end applies.
+        # The last sibling has index 1 when counting from the end.
+        if last:
+            pos = '(last() - position() + 1)'
+        else:
+            pos = 'position()'
+        if a == 0:
+            xpath.add_condition('%s = %s' % (pos, b))
+            return xpath
+        conditions = []
+        if abs(a) != 1:
+            # (index - b) must be a multiple of the step size.
+            if b > 0:
+                offset = str(-b)
+            else:
+                offset = '+%s' % (-b)
+            conditions.append('(%s %s) mod %s = 0' % (pos, offset, abs(a)))
+        if a > 0:
+            # Counting upwards from b; a negative b needs no lower bound
+            # because indexes start at 1.
+            if b >= 0:
+                conditions.append('%s >= %s' % (pos, b))
+        else:
+            # Negative step: counting downwards from b.
+            conditions.append('%s <= %s' % (pos, b))
+        if conditions:
+            xpath.add_condition(' and '.join(conditions))
+        return xpath
+
+    def _xpath_nth_last_child(self, xpath, expr):
+        return self._xpath_nth_child(xpath, expr, last=True)
+
+    def _xpath_nth_of_type(self, xpath, expr):
+        if xpath.element == '*':
+            raise NotImplementedError(
+                "*:nth-of-type() is not implemented")
+        return self._xpath_nth_child(xpath, expr, add_name_test=False)
+
+    def _xpath_nth_last_of_type(self, xpath, expr):
+        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)
+
+    def _xpath_contains(self, xpath, expr):
+        # text content, minus tags, must contain expr
+        if isinstance(expr, Element):
+            expr = expr._format_element()
+        else:
+            # The parser hands over an int for e.g. :contains(123)
+            expr = _unicode(expr)
+        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
+                            % xpath_literal(expr.lower()))
+        # FIXME: Currently case insensitive matching doesn't seem to be happening
+        return xpath
+
+    def _xpath_not(self, xpath, expr):
+        # everything for which not expr applies
+        expr = expr.xpath()
+        cond = expr.condition
+        # FIXME: should I do something about element_path?
+        xpath.add_condition('not(%s)' % cond)
+        return xpath
+
+def _make_lower_case(context, s):
+    """XPath extension function: return ``s`` lower-cased (context unused)."""
+    lowered = s.lower()
+    return lowered
+
+# Register _make_lower_case as the XPath extension function
+# ``css:lower-case`` that the generated expressions call.
+ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
+ns.prefix = 'css'
+ns['lower-case'] = _make_lower_case
+
+class Pseudo(object):
+    """
+    Represents selector:ident
+    """
+
+    unsupported = ['indeterminate', 'first-line', 'first-letter',
+                   'selection', 'before', 'after', 'link', 'visited',
+                   'active', 'focus', 'hover']
+
+    def __init__(self, element, type, ident):
+        self.element = element
+        assert type in (':', '::')
+        self.type = type
+        self.ident = ident
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        return '%s[%r%s%s]' % (name, self.element, self.type, self.ident)
+
+    def xpath(self):
+        """Dispatch to the ``_xpath_<ident>`` handler for this pseudo-class.
+
+        Raises ExpressionError for unsupported or unknown identifiers.
+        """
+        el_xpath = self.element.xpath()
+        if self.ident in self.unsupported:
+            raise ExpressionError(
+                "The pseudo-class %r is unsupported" % self.ident)
+        handler = getattr(self, '_xpath_' + self.ident.replace('-', '_'), None)
+        if handler is None:
+            raise ExpressionError(
+                "The pseudo-class %r is unknown" % self.ident)
+        return handler(el_xpath)
+
+    def _xpath_checked(self, xpath):
+        # FIXME: is this really all the elements?
+        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
+        return xpath
+
+    def _xpath_root(self, xpath):
+        # if this element is the root element
+        raise NotImplementedError
+
+    def _xpath_first_child(self, xpath):
+        """Match the element only when it is the first child."""
+        xpath.add_star_prefix()
+        xpath.add_name_test()
+        xpath.add_condition('position() = 1')
+        return xpath
+
+    def _xpath_last_child(self, xpath):
+        """Match the element only when it is the last child."""
+        xpath.add_star_prefix()
+        xpath.add_name_test()
+        xpath.add_condition('position() = last()')
+        return xpath
+
+    def _xpath_first_of_type(self, xpath):
+        """Match the first sibling with this element name."""
+        if xpath.element == '*':
+            raise NotImplementedError(
+                "*:first-of-type is not implemented")
+        xpath.add_star_prefix()
+        xpath.add_condition('position() = 1')
+        return xpath
+
+    def _xpath_last_of_type(self, xpath):
+        """Match the last sibling with this element name."""
+        if xpath.element == '*':
+            raise NotImplementedError(
+                "*:last-of-type is not implemented")
+        xpath.add_star_prefix()
+        xpath.add_condition('position() = last()')
+        return xpath
+
+    def _xpath_only_child(self, xpath):
+        """Match the element only when it has no siblings."""
+        xpath.add_name_test()
+        xpath.add_star_prefix()
+        xpath.add_condition('last() = 1')
+        return xpath
+
+    def _xpath_only_of_type(self, xpath):
+        if xpath.element == '*':
+            raise NotImplementedError(
+                "*:only-of-type is not implemented")
+        xpath.add_condition('last() = 1')
+        return xpath
+
+    def _xpath_empty(self, xpath):
+        """Match elements with no child elements and no non-blank text."""
+        xpath.add_condition("not(*) and not(normalize-space())")
+        return xpath
+
+class Attrib(object):
+    """
+    Represents selector[namespace|attrib operator value]
+    """
+
+    def __init__(self, selector, namespace, attrib, operator, value):
+        self.selector = selector
+        self.namespace = namespace
+        self.attrib = attrib
+        self.operator = operator
+        self.value = value
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        if self.operator == 'exists':
+            return '%s[%r[%s]]' % (name, self.selector, self._format_attrib())
+        return '%s[%r[%s %s %r]]' % (
+            name, self.selector, self._format_attrib(),
+            self.operator, self.value)
+
+    def _format_attrib(self):
+        """CSS spelling of the attribute (``ns|attr``, or bare ``attr``)."""
+        if self.namespace == '*':
+            return self.attrib
+        return '%s|%s' % (self.namespace, self.attrib)
+
+    def _xpath_attrib(self):
+        """XPath spelling of the attribute (``@ns:attr``, or ``@attr``)."""
+        # FIXME: if attrib is *?
+        if self.namespace == '*':
+            return '@' + self.attrib
+        return '@%s:%s' % (self.namespace, self.attrib)
+
+    def xpath(self):
+        """Add the XPath condition matching this attribute operator."""
+        path = self.selector.xpath()
+        attrib = self._xpath_attrib()
+        value = self.value
+        op = self.operator
+        if op == 'exists':
+            assert not value
+            condition = attrib
+        elif op == '=':
+            condition = '%s = %s' % (attrib, xpath_literal(value))
+        elif op == '!=':
+            # FIXME: this seems like a weird hack...
+            if value:
+                condition = 'not(%s) or %s != %s' % (
+                    attrib, attrib, xpath_literal(value))
+            else:
+                condition = '%s != %s' % (attrib, xpath_literal(value))
+        elif op == '~=':
+            condition = "contains(concat(' ', normalize-space(%s), ' '), %s)" % (
+                attrib, xpath_literal(' '+value+' '))
+        elif op == '|=':
+            # Weird, but true...
+            condition = '%s = %s or starts-with(%s, %s)' % (
+                attrib, xpath_literal(value),
+                attrib, xpath_literal(value + '-'))
+        elif op == '^=':
+            condition = 'starts-with(%s, %s)' % (attrib, xpath_literal(value))
+        elif op == '$=':
+            # Oddly there is a starts-with in XPath 1.0, but not ends-with
+            condition = 'substring(%s, string-length(%s)-%s) = %s' % (
+                attrib, attrib, len(value)-1, xpath_literal(value))
+        elif op == '*=':
+            # FIXME: case sensitive?
+            condition = 'contains(%s, %s)' % (attrib, xpath_literal(value))
+        else:
+            assert 0, ("Unknown operator: %r" % self.operator)
+        path.add_condition(condition)
+        return path
+
+class Element(object):
+    """
+    Represents namespace|element
+    """
+
+    def __init__(self, namespace, element):
+        self.namespace = namespace
+        self.element = element
+
+    def __repr__(self):
+        return '%s[%s]' % (self.__class__.__name__, self._format_element())
+
+    def _format_element(self):
+        """CSS spelling of the element (``ns|name``, or bare ``name``)."""
+        if self.namespace == '*':
+            return self.element
+        return '%s|%s' % (self.namespace, self.element)
+
+    def xpath(self):
+        """Return an XPathExpr whose node test is the lower-cased name."""
+        # Kovid: Lowercased
+        name = self.element.lower()
+        if self.namespace != '*':
+            name = '%s:%s' % (self.namespace, name)
+        return XPathExpr(element=name)
+
+class Hash(object):
+    """
+    Represents selector#id
+    """
+
+    def __init__(self, selector, id):
+        self.selector = selector
+        self.id = id
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        return '%s[%r#%s]' % (name, self.selector, self.id)
+
+    def xpath(self):
+        """Add an exact ``@id`` comparison to the wrapped selector."""
+        result = self.selector.xpath()
+        id_literal = xpath_literal(self.id)
+        result.add_condition('@id = %s' % id_literal)
+        return result
+
+class Or(object):
+    """Represents a comma-separated group of selectors."""
+
+    def __init__(self, items):
+        self.items = items
+
+    def __repr__(self):
+        return '%s(%r)' % (self.__class__.__name__, self.items)
+
+    def xpath(self):
+        """Translate every alternative and wrap them in an XPathExprOr."""
+        translated = []
+        for item in self.items:
+            translated.append(item.xpath())
+        return XPathExprOr(translated)
+
+class CombinedSelector(object):
+    """Represents ``selector <combinator> subselector``."""
+
+    _method_mapping = {
+        ' ': 'descendant',
+        '>': 'child',
+        '+': 'direct_adjacent',
+        '~': 'indirect_adjacent',
+        }
+
+    def __init__(self, selector, combinator, subselector):
+        assert selector is not None
+        self.selector = selector
+        self.combinator = combinator
+        self.subselector = subselector
+
+    def __repr__(self):
+        # The descendant combinator is a space; show it as nothing.
+        comb = self.combinator
+        if comb == ' ':
+            comb = ''
+        return '%s[%r %s %r]' % (
+            self.__class__.__name__, self.selector, comb, self.subselector)
+
+    def xpath(self):
+        """Join both sides with the XPath axis for this combinator.
+
+        Raises ExpressionError for a combinator missing from the mapping.
+        """
+        try:
+            suffix = self._method_mapping[self.combinator]
+        except KeyError:
+            raise ExpressionError(
+                "Unknown combinator: %r" % self.combinator)
+        handler = getattr(self, '_xpath_' + suffix)
+        left = self.selector.xpath()
+        return handler(left, self.subselector)
+
+    def _xpath_descendant(self, xpath, sub):
+        # when sub is a descendant in any way of xpath
+        right = sub.xpath()
+        xpath.join('/descendant::', right)
+        return xpath
+
+    def _xpath_child(self, xpath, sub):
+        # when sub is an immediate child of xpath
+        right = sub.xpath()
+        xpath.join('/', right)
+        return xpath
+
+    def _xpath_direct_adjacent(self, xpath, sub):
+        # when sub immediately follows xpath
+        right = sub.xpath()
+        xpath.join('/following-sibling::', right)
+        xpath.add_name_test()
+        xpath.add_condition('position() = 1')
+        return xpath
+
+    def _xpath_indirect_adjacent(self, xpath, sub):
+        # when sub comes somewhere after xpath as a sibling
+        right = sub.xpath()
+        xpath.join('/following-sibling::', right)
+        return xpath
+
+##############################
+## XPathExpr objects:
+
+# Fast paths for the three most common selector shapes: ``tag``,
+# ``tag#id`` / ``#id`` and ``tag.class`` / ``.class``.
+_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
+_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
+_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
+
+def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'):
+    """Translate a CSS selector into an XPath string.
+
+    Element names and class names are lower-cased on every path; ids keep
+    their case.  ``css_expr`` may be a selector string or an already
+    parsed selector object.  ``prefix`` is prepended to the expression; a
+    false value adds no prefix on the parsed path.
+
+    The ``tag#id`` fast path now lower-cases the element name too, so that
+    ``DIV#x`` gives the same node test as ``DIV`` and ``DIV.x`` do.
+    """
+    if isinstance(css_expr, _basestring):
+        match = _el_re.search(css_expr)
+        if match is not None:
+            # Kovid: Lowercased
+            return '%s%s' % (prefix, match.group(0).strip().lower())
+        match = _id_re.search(css_expr)
+        if match is not None:
+            return "%s%s[@id = '%s']" % (
+                prefix, match.group(1).lower() or '*', match.group(2))
+        match = _class_re.search(css_expr)
+        if match is not None:
+            # Kovid: lowercased
+            return "%s%s[contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), ' %s ')]" % (
+                prefix, match.group(1).lower() or '*', match.group(2).lower())
+        # Anything more complex goes through the real parser.
+        css_expr = parse(css_expr)
+    expr = css_expr.xpath()
+    assert expr is not None, (
+        "Got None for xpath expression from %s" % repr(css_expr))
+    if prefix:
+        expr.add_prefix(prefix)
+    return _unicode(expr)
+
+class XPathExpr(object):
+    """An XPath step under construction: prefix + path + element[condition]."""
+
+    def __init__(self, prefix=None, path=None, element='*', condition=None,
+                 star_prefix=False):
+        self.prefix = prefix
+        self.path = path
+        self.element = element
+        self.condition = condition
+        self.star_prefix = star_prefix
+
+    def __str__(self):
+        pieces = []
+        if self.prefix is not None:
+            pieces.append(_unicode(self.prefix))
+        if self.path is not None:
+            pieces.append(_unicode(self.path))
+        pieces.append(_unicode(self.element))
+        if self.condition:
+            pieces.append('[%s]' % self.condition)
+        return ''.join(pieces)
+
+    def __repr__(self):
+        return '%s[%s]' % (self.__class__.__name__, self)
+
+    def add_condition(self, condition):
+        """AND ``condition`` onto the predicate (or set it if none yet)."""
+        if not self.condition:
+            self.condition = condition
+        else:
+            self.condition = '%s and (%s)' % (self.condition, condition)
+
+    def add_path(self, part):
+        """Move the current element into the path and make ``part`` the element."""
+        if self.path is None:
+            self.path = self.element
+        else:
+            self.path += self.element
+        self.element = part
+
+    def add_prefix(self, prefix):
+        """Put ``prefix`` in front of any existing prefix."""
+        if not self.prefix:
+            self.prefix = prefix
+        else:
+            self.prefix = prefix + self.prefix
+
+    def add_name_test(self):
+        """Turn the node test into ``*`` plus a ``name() = ...`` condition."""
+        if self.element == '*':
+            # We weren't doing a test anyway
+            return
+        self.add_condition("name() = %s" % xpath_literal(self.element))
+        self.element = '*'
+
+    def add_star_prefix(self):
+        """
+        Adds a /* prefix if there is no prefix. This is when you need
+        to keep context's constrained to a single parent.
+        """
+        if not self.path:
+            self.path = '*/'
+        else:
+            self.path += '*/'
+        self.star_prefix = True
+
+    def join(self, combiner, other):
+        """Append ``other`` after this expression, separated by ``combiner``.
+
+        Everything built so far becomes the prefix; path, element and
+        condition are taken from ``other``.
+        """
+        new_prefix = _unicode(self) + combiner
+        new_path = (other.prefix or '') + (other.path or '')
+        # We don't need a star prefix if we are joining to this other
+        # prefix; so we'll get rid of it
+        if other.star_prefix and new_path == '*/':
+            new_path = ''
+        self.prefix = new_prefix
+        self.path = new_path
+        self.element = other.element
+        self.condition = other.condition
+
+class XPathExprOr(XPathExpr):
+    """
+    Represents |'d expressions. Note that unfortunately it isn't
+    the union, it's the sum, so duplicate elements will appear.
+    """
+
+    def __init__(self, items, prefix=None):
+        for entry in items:
+            assert entry is not None
+        self.items = items
+        self.prefix = prefix
+
+    def __str__(self):
+        # The shared prefix is repeated in front of every alternative.
+        lead = self.prefix or ''
+        return ' | '.join("%s%s" % (lead, entry) for entry in self.items)
+
+split_at_single_quotes = re.compile("('+)").split
+
+def xpath_literal(s):
+    """Return ``s`` quoted as an XPath string literal.
+
+    XPath has no escape syntax, so a value containing both quote kinds is
+    built with ``concat()`` from pieces that each use the other quote.
+    """
+    if isinstance(s, Element):
+        # This is probably a symbol that looks like an expression...
+        s = s._format_element()
+    else:
+        s = _unicode(s)
+    if "'" not in s:
+        return "'%s'" % s
+    if '"' not in s:
+        return '"%s"' % s
+    pieces = []
+    for part in split_at_single_quotes(s):
+        if not part:
+            continue
+        if "'" in part:
+            pieces.append('"%s"' % part)
+        else:
+            pieces.append("'%s'" % part)
+    return "concat(%s)" % ','.join(pieces)
+
+##############################
+## Parsing functions
+
+def parse(string):
+ """Tokenize and parse a CSS selector string into a selector object.
+
+ A SelectorSyntaxError raised while parsing is re-raised after its
+ message has been extended with the tokens consumed so far and the
+ next pending token.
+ """
+ stream = TokenStream(tokenize(string))
+ stream.source = string
+ try:
+ return parse_selector_group(stream)
+ except SelectorSyntaxError:
+ # sys.exc_info() instead of "except ... as e" keeps the syntax valid
+ # on the old Python versions checked for below.
+ import sys
+ e = sys.exc_info()[1]
+ message = "%s at %s -> %r" % (
+ e, stream.used, stream.peek())
+ e.msg = message
+ if sys.version_info < (2,6):
+ e.message = message
+ e.args = tuple([message])
+ # Bare raise re-raises the same (now annotated) exception.
+ raise
+
+def parse_selector_group(stream):
+    """Parse comma-separated selectors; return one selector or an ``Or``."""
+    selectors = [parse_selector(stream)]
+    while stream.peek() == ',':
+        stream.next()
+        selectors.append(parse_selector(stream))
+    if len(selectors) == 1:
+        return selectors[0]
+    return Or(selectors)
+
+def parse_selector(stream):
+    """Parse simple selectors joined by combinators, up to ``,`` or the end.
+
+    When no explicit combinator (``+``, ``>``, ``~``) is present, the
+    descendant combinator ``' '`` is assumed.  Raises SelectorSyntaxError
+    when the right-hand side consumes no tokens.
+    """
+    result = parse_simple_selector(stream)
+    while True:
+        peek = stream.peek()
+        if peek is None or peek == ',':
+            return result
+        if peek in ('+', '>', '~'):
+            # A combinator
+            combinator = stream.next()
+        else:
+            combinator = ' '
+        used_before = len(stream.used)
+        next_selector = parse_simple_selector(stream)
+        if len(stream.used) == used_before:
+            raise SelectorSyntaxError(
+                "Expected selector, got '%s'" % stream.peek())
+        result = CombinedSelector(result, combinator, next_selector)
+
+def parse_simple_selector(stream):
+    """Parse one simple selector: an optional element name followed by any
+    number of ``#id``, ``.class``, ``[attr]`` and ``:pseudo`` parts.
+
+    Returns the nested selector object (Element wrapped in Hash, Class,
+    Attrib, Pseudo or Function).  Raises SelectorSyntaxError when a name
+    or a closing bracket is missing.
+    """
+    peek = stream.peek()
+    if peek != '*' and not isinstance(peek, Symbol):
+        element = namespace = '*'
+    else:
+        first = stream.next()
+        if first != '*' and not isinstance(first, Symbol):
+            raise SelectorSyntaxError(
+                "Expected symbol, got '%s'" % first)
+        if stream.peek() == '|':
+            namespace = first
+            stream.next()
+            element = stream.next()
+            # Validate the element name that follows "ns|", not the
+            # namespace that has already been checked above.
+            if element != '*' and not isinstance(element, Symbol):
+                raise SelectorSyntaxError(
+                    "Expected symbol, got '%s'" % element)
+        else:
+            namespace = '*'
+            element = first
+    result = Element(namespace, element)
+    has_hash = False
+    while 1:
+        peek = stream.peek()
+        if peek == '#':
+            if has_hash:
+                # You can't have two hashes
+                # (FIXME: is there some more general rule I'm missing?)
+                break
+            stream.next()
+            ident = stream.next()
+            # A selector ending in "#" would otherwise hand None on.
+            if not isinstance(ident, Symbol):
+                raise SelectorSyntaxError(
+                    "Expected symbol, got '%s'" % ident)
+            result = Hash(result, ident)
+            has_hash = True
+            continue
+        elif peek == '.':
+            stream.next()
+            class_name = stream.next()
+            # A selector ending in "." would otherwise fail later with
+            # AttributeError inside Class.
+            if not isinstance(class_name, Symbol):
+                raise SelectorSyntaxError(
+                    "Expected symbol, got '%s'" % class_name)
+            result = Class(result, class_name)
+            continue
+        elif peek == '[':
+            stream.next()
+            result = parse_attrib(result, stream)
+            closing = stream.next()
+            if not closing == ']':
+                raise SelectorSyntaxError(
+                    "] expected, got '%s'" % closing)
+            continue
+        elif peek == ':' or peek == '::':
+            type = stream.next()
+            ident = stream.next()
+            if not isinstance(ident, Symbol):
+                raise SelectorSyntaxError(
+                    "Expected symbol, got '%s'" % ident)
+            if stream.peek() == '(':
+                stream.next()
+                peek = stream.peek()
+                if isinstance(peek, String):
+                    selector = stream.next()
+                elif isinstance(peek, Symbol) and is_int(peek):
+                    selector = int(stream.next())
+                else:
+                    # FIXME: parse_simple_selector, or selector, or...?
+                    selector = parse_simple_selector(stream)
+                closing = stream.next()
+                if not closing == ')':
+                    raise SelectorSyntaxError(
+                        "Expected ')', got '%s' and '%s'"
+                        % (closing, selector))
+                result = Function(result, type, ident, selector)
+            else:
+                result = Pseudo(result, type, ident)
+            continue
+        else:
+            # Swallow the whitespace token that separates this simple
+            # selector from the next one.
+            if peek == ' ':
+                stream.next()
+            break
+        # FIXME: not sure what "negation" is
+    return result
+
+def is_int(v):
+    """Return True when ``int(v)`` succeeds, False on ValueError."""
+    try:
+        int(v)
+        return True
+    except ValueError:
+        return False
+
+def parse_attrib(selector, stream):
+    """Parse the inside of ``[...]`` (the brackets are handled by the caller).
+
+    Returns an Attrib; the operator is ``'exists'`` for a bare ``[attr]``.
+    Raises SelectorSyntaxError for an unknown operator or a value that is
+    neither a Symbol nor a String.
+    """
+    namespace = '*'
+    attrib = stream.next()
+    if stream.peek() == '|':
+        namespace = attrib
+        stream.next()
+        attrib = stream.next()
+    if stream.peek() == ']':
+        return Attrib(selector, namespace, attrib, 'exists', None)
+    op = stream.next()
+    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
+        raise SelectorSyntaxError(
+            "Operator expected, got '%s'" % op)
+    value = stream.next()
+    if isinstance(value, (Symbol, String)):
+        return Attrib(selector, namespace, attrib, op, value)
+    raise SelectorSyntaxError(
+        "Expected string or symbol, got '%s'" % value)
+
+def parse_series(s):
+    """
+    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
+    """
+    if isinstance(s, Element):
+        s = s._format_element()
+    if not s or s == '*':
+        # Happens when there's nothing, which the CSS parser thinks of as *
+        return (0, 0)
+    if isinstance(s, int):
+        # Happens when you just get a number
+        return (0, s)
+    for keyword, series in (('odd', (2, 1)), ('even', (2, 0)), ('n', (1, 0))):
+        if s == keyword:
+            return series
+    if 'n' not in s:
+        # Just a b
+        return (0, int(s))
+
+    def to_int(text, when_empty):
+        # '' -> default, a bare sign -> +1 / -1, anything else -> int()
+        if not text:
+            return when_empty
+        if text == '-' or text == '+':
+            return int(text+'1')
+        return int(text)
+
+    step, offset = s.split('n', 1)
+    a = to_int(step, 1)
+    b = to_int(offset, 0)
+    return (a, b)
+
+
+############################################################
+## Tokenizing
+############################################################
+
+# Matches a run of whitespace at a given position.
+_match_whitespace = re.compile(r'\s+', re.UNICODE).match
+
+# Removes /* ... */ comments (non-greedy, may span lines).
+_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub
+
+# Matches an ``an+b`` style count such as ``2n+1``, ``-n`` or ``n-3``.
+_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match
+
+def tokenize(s):
+ """Generate Symbol, String and Token objects for the selector ``s``.
+
+ Comments are stripped first.  Whitespace is skipped, except that a
+ ``' '`` Token is emitted when whitespace directly precedes ``.``,
+ ``#`` or ``[``, so the parser can tell ``a .b`` from ``a.b``.
+ """
+ pos = 0
+ s = _replace_comments('', s)
+ while 1:
+ match = _match_whitespace(s, pos=pos)
+ if match:
+ preceding_whitespace_pos = pos
+ pos = match.end()
+ else:
+ # 0 doubles as "no whitespace seen"; as a result whitespace at
+ # the very start of the string is not reported either.
+ preceding_whitespace_pos = 0
+ if pos >= len(s):
+ return
+ match = _match_count_number(s, pos=pos)
+ # A bare 'n' is left to tokenize_symbol below, so identifiers that
+ # merely start with "n" are not cut short here.
+ if match and match.group() != 'n':
+ sym = s[pos:match.end()]
+ yield Symbol(sym, pos)
+ pos = match.end()
+ continue
+ c = s[pos]
+ c2 = s[pos:pos+2]
+ # Two-character operators are tried before single characters.
+ if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
+ yield Token(c2, pos)
+ pos += 2
+ continue
+ if c in '>+~,.*=[]()|:#':
+ if c in '.#[' and preceding_whitespace_pos > 0:
+ yield Token(' ', preceding_whitespace_pos)
+ yield Token(c, pos)
+ pos += 1
+ continue
+ if c == '"' or c == "'":
+ # Quoted string
+ old_pos = pos
+ sym, pos = tokenize_escaped_string(s, pos)
+ yield String(sym, old_pos)
+ continue
+ # Anything else is read as a symbol.
+ old_pos = pos
+ sym, pos = tokenize_symbol(s, pos)
+ yield Symbol(sym, old_pos)
+ continue
+
+split_at_string_escapes = re.compile(r'(\\(?:%s))'
+                                     % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
+                                                 '[^A-Fa-f0-9]'])).split
+
+def unescape_string_literal(literal):
+    """Resolve backslash escapes inside the body of a quoted string.
+
+    ``\\`` followed by hex digits becomes that code point; ``\\`` followed
+    by any other character becomes that character.  Raises
+    SelectorSyntaxError for a backslash that starts no valid escape.
+    """
+    pieces = []
+    for chunk in split_at_string_escapes(literal):
+        if not chunk:
+            continue
+        if '\\' in chunk:
+            if chunk[0] != '\\' or len(chunk) <= 1:
+                raise SelectorSyntaxError(
+                    "Invalid escape sequence %r in string %r"
+                    % (chunk.split('\\')[1], literal))
+            chunk = chunk[1:]
+            if chunk[0] in '0123456789ABCDEFabcdef':
+                # int() correctly ignores the potentially trailing whitespace
+                chunk = _unichr(int(chunk, 16))
+        pieces.append(chunk)
+    return ''.join(pieces)
+
+def tokenize_escaped_string(s, pos):
+    """Read the quoted string that starts at ``s[pos]``.
+
+    Returns ``(contents, new_pos)``: the contents with escapes resolved,
+    and the index just past the closing quote.  Raises
+    SelectorSyntaxError when the closing quote is missing.
+    """
+    quote = s[pos]
+    assert quote in ('"', "'")
+    pos = pos+1
+    start = pos
+    while 1:
+        next = s.find(quote, pos)
+        if next == -1:
+            raise SelectorSyntaxError(
+                "Expected closing %s for string in: %r"
+                % (quote, s[start:]))
+        result = s[start:next]
+        # The quote is escaped only when an odd number of backslashes
+        # precedes it; an even run is itself a series of escaped
+        # backslashes, so "a\\" really ends here.
+        trailing_backslashes = len(result) - len(result.rstrip('\\'))
+        if trailing_backslashes % 2:
+            # next quote character is escaped
+            pos = next+1
+            continue
+        if '\\' in result:
+            result = unescape_string_literal(result)
+        return result, next+1
+
+# Any character that cannot be part of a symbol (word characters,
+# backslash and hyphen are allowed).
+_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
+
+def tokenize_symbol(s, pos):
+    """Read the symbol that starts at ``s[pos]``.
+
+    Returns ``(symbol, new_pos)``.  A symbol that runs to the end of ``s``
+    is returned as-is; otherwise backslash escapes in it are decoded.
+
+    Raises AssertionError when ``s[pos]`` cannot start a symbol, and
+    SelectorSyntaxError when the escapes cannot be decoded.
+    """
+    start = pos
+    match = _illegal_symbol.search(s, pos=pos)
+    if not match:
+        # Goes to end of s
+        return s[start:], len(s)
+    if match.start() == pos:
+        # Raised explicitly rather than with "assert 0": under python -O
+        # the assert disappears, an empty symbol is returned without
+        # advancing, and the tokenizer never terminates.
+        raise AssertionError(
+            "Unexpected symbol: %r at %s" % (s[pos], pos))
+    result = s[start:match.start()]
+    pos = match.start()
+    try:
+        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
+    except UnicodeDecodeError:
+        import sys
+        e = sys.exc_info()[1]
+        raise SelectorSyntaxError(
+            "Bad symbol %r: %s" % (result, e))
+    return result, pos
+
+class TokenStream(object):
+    """Token iterator with one token of lookahead.
+
+    ``used`` lists every token handed out by ``next()``; both ``next()``
+    and ``peek()`` return None once the tokens are exhausted.
+    """
+
+    def __init__(self, tokens, source=None):
+        self.used = []
+        self.tokens = iter(tokens)
+        self.source = source
+        self.peeked = None
+        self._peeking = False
+        try:
+            fetch = self.tokens.next
+        except AttributeError:
+            # Python 3
+            fetch = self.tokens.__next__
+        self.next_token = fetch
+
+    def next(self):
+        """Consume and return the next token, or None at the end."""
+        if not self._peeking:
+            try:
+                token = self.next_token()
+                self.used.append(token)
+                return token
+            except StopIteration:
+                return None
+        # A token was already fetched by peek(); hand that one out.
+        self._peeking = False
+        self.used.append(self.peeked)
+        return self.peeked
+
+    def __iter__(self):
+        return iter(self.next, None)
+
+    def peek(self):
+        """Return the next token without consuming it, or None at the end."""
+        if self._peeking:
+            return self.peeked
+        try:
+            self.peeked = self.next_token()
+        except StopIteration:
+            return None
+        self._peeking = True
+        return self.peeked
diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py
index f6ff594701..88e074320d 100644
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@@ -27,6 +27,7 @@ from calibre import force_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
+from calibre.ebooks.cssselect import css_to_xpath_no_case
cssutils_log.setLevel(logging.WARN)
@@ -98,32 +99,72 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
'x-large', 'xx-large'])
-class CSSSelector(etree.XPath):
- MIN_SPACE_RE = re.compile(r' *([>~+]) *')
+class CSSSelector(object):
+
LOCAL_NAME_RE = re.compile(r"(?' % (
self.__class__.__name__,
hex(abs(id(self)))[2:],
self.css)
+# Compiled selectors, keyed by the normalised selector text.
+_selector_cache = {}
+
+MIN_SPACE_RE = re.compile(r' *([>~+]) *')
+
+def get_css_selector(raw_selector):
+    """Return the cached CSSSelector for ``raw_selector``, creating it if needed.
+
+    Spaces around the ``>``, ``~`` and ``+`` combinators are removed and
+    non-ASCII characters are dropped before the cache lookup.
+    """
+    css = MIN_SPACE_RE.sub(r'\1', raw_selector)
+    if isinstance(css, unicode):
+        # Workaround for bug in lxml on windows/OS X that causes a massive
+        # memory leak with non ASCII selectors
+        css = css.encode('ascii', 'ignore').decode('ascii')
+    selector = _selector_cache.get(css, None)
+    if selector is not None:
+        return selector
+    selector = CSSSelector(css)
+    _selector_cache[css] = selector
+    return selector
class Stylizer(object):
STYLESHEETS = WeakKeyDictionary()
@@ -223,41 +264,12 @@ class Stylizer(object):
rules.sort()
self.rules = rules
self._styles = {}
- class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE)
- capital_sel_pat = re.compile(r'h|[A-Z]+')
for _, _, cssdict, text, _ in rules:
fl = ':first-letter' in text
if fl:
text = text.replace(':first-letter', '')
- try:
- selector = CSSSelector(text)
- except (AssertionError, ExpressionError, etree.XPathSyntaxError,
- NameError, # thrown on OS X instead of SelectorSyntaxError
- SelectorSyntaxError):
- continue
- try:
- matches = selector(tree)
- except etree.XPathEvalError:
- continue
-
- if not matches:
- ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text)
- if ntext != text:
- self.logger.warn('Transformed CSS selector', text, 'to',
- ntext)
- selector = CSSSelector(ntext)
- matches = selector(tree)
-
- if not matches and class_sel_pat.match(text) and text.lower() != text:
- found = False
- ltext = text.lower()
- for x in tree.xpath('//*[@class]'):
- if ltext.endswith('.'+x.get('class').lower()):
- matches.append(x)
- found = True
- if found:
- self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s'
- %(text, item.href))
+ selector = get_css_selector(text)
+ matches = selector(tree, self.logger)
if fl:
from lxml.builder import ElementMaker
E = ElementMaker(namespace=XHTML_NS)