diff --git a/setup/installer/linux/freeze2.py b/setup/installer/linux/freeze2.py
index 0ed49c9fef..2a91876d58 100644
--- a/setup/installer/linux/freeze2.py
+++ b/setup/installer/linux/freeze2.py
@@ -16,7 +16,7 @@ SITE_PACKAGES = ['PIL', 'dateutil', 'dns', 'PyQt4', 'mechanize',
'sip.so', 'BeautifulSoup.py', 'cssutils', 'encutils', 'lxml',
'sipconfig.py', 'xdg', 'dbus', '_dbus_bindings.so', 'dbus_bindings.py',
'_dbus_glib_bindings.so', 'netifaces.so', '_psutil_posix.so',
- '_psutil_linux.so', 'psutil']
+ '_psutil_linux.so', 'psutil', 'cssselect']
QTDIR = '/usr/lib/qt4'
QTDLLS = ('QtCore', 'QtGui', 'QtNetwork', 'QtSvg', 'QtXml', 'QtWebKit', 'QtDBus')
diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst
index c70e6f3010..9502135f6e 100644
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@@ -30,7 +30,7 @@ If there are no windows binaries already compiled for the version of python you
Run the following command to install python dependencies::
- easy_install --always-unzip -U mechanize pyreadline python-dateutil dnspython cssutils clientform pycrypto
+ easy_install --always-unzip -U mechanize pyreadline python-dateutil dnspython cssutils clientform pycrypto cssselect
Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly)
diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py
deleted file mode 100644
index 1c2bfcc4fa..0000000000
--- a/src/calibre/ebooks/cssselect.py
+++ /dev/null
@@ -1,1012 +0,0 @@
-"""CSS Selectors based on XPath.
-
-This module supports selecting XML/HTML tags based on CSS selectors.
-See the `CSSSelector` class for details.
-"""
-
-import re
-from lxml import etree
-
-__all__ = ['SelectorSyntaxError', 'ExpressionError',
- 'CSSSelector']
-
-try:
- _basestring = basestring
-except NameError:
- _basestring = str
-
-class SelectorSyntaxError(SyntaxError):
- pass
-
-class ExpressionError(RuntimeError):
- pass
-
-class CSSSelector(etree.XPath):
- """A CSS selector.
-
- Usage::
-
- >>> from lxml import etree, cssselect
- >>> select = cssselect.CSSSelector("a tag > child")
-
- >>> root = etree.XML("TEXT")
- >>> [ el.tag for el in select(root) ]
- ['child']
-
- To use CSS namespaces, you need to pass a prefix-to-namespace
- mapping as ``namespaces`` keyword argument::
-
- >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
- >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
- ... namespaces={'rdf': rdfns})
-
- >>> rdf = etree.XML((
- ... ''
- ... 'blah'
- ... '') % rdfns)
- >>> [(el.tag, el.text) for el in select_ns(rdf)]
- [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
- """
- def __init__(self, css, namespaces=None):
- path = css_to_xpath_no_case(css)
- etree.XPath.__init__(self, path, namespaces=namespaces)
- self.css = css
-
- def __repr__(self):
- return '<%s %s for %r>' % (
- self.__class__.__name__,
- hex(abs(id(self)))[2:],
- self.css)
-
-##############################
-## Token objects:
-
-try:
- _unicode = unicode
- _unichr = unichr
-except NameError:
- # Python 3
- _unicode = str
- _unichr = chr
-
-class _UniToken(_unicode):
- def __new__(cls, contents, pos):
- obj = _unicode.__new__(cls, contents)
- obj.pos = pos
- return obj
-
- def __repr__(self):
- return '%s(%s, %r)' % (
- self.__class__.__name__,
- _unicode.__repr__(self),
- self.pos)
-
-class Symbol(_UniToken):
- pass
-
-class String(_UniToken):
- pass
-
-class Token(_UniToken):
- pass
-
-############################################################
-## Parsing
-############################################################
-
-##############################
-## Syntax objects:
-
-class Class(object):
- """
- Represents selector.class_name
- """
-
- def __init__(self, selector, class_name):
- self.selector = selector
- # Kovid: Lowercased
- self.class_name = class_name.lower()
-
- def __repr__(self):
- return '%s[%r.%s]' % (
- self.__class__.__name__,
- self.selector,
- self.class_name)
-
- def xpath(self):
- sel_xpath = self.selector.xpath()
- # Kovid: Lowercased
- sel_xpath.add_condition(
- "contains(concat(' ', normalize-space(%s), ' '), %s)" % (
- lower_case('@class'),
- xpath_literal(' '+self.class_name+' ')))
- return sel_xpath
-
-class Function(object):
- """
- Represents selector:name(expr)
- """
-
- unsupported = [
- 'target', 'lang', 'enabled', 'disabled',]
-
- def __init__(self, selector, type, name, expr):
- self.selector = selector
- self.type = type
- self.name = name
- self.expr = expr
-
- def __repr__(self):
- return '%s[%r%s%s(%r)]' % (
- self.__class__.__name__,
- self.selector,
- self.type, self.name, self.expr)
-
- def xpath(self):
- sel_path = self.selector.xpath()
- if self.name in self.unsupported:
- raise ExpressionError(
- "The pseudo-class %r is not supported" % self.name)
- method = '_xpath_' + self.name.replace('-', '_')
- if not hasattr(self, method):
- raise ExpressionError(
- "The pseudo-class %r is unknown" % self.name)
- method = getattr(self, method)
- return method(sel_path, self.expr)
-
- def _xpath_nth_child(self, xpath, expr, last=False,
- add_name_test=True):
- a, b = parse_series(expr)
- if not a and not b and not last:
- # a=0 means nothing is returned...
- xpath.add_condition('false() and position() = 0')
- return xpath
- if add_name_test:
- xpath.add_name_test()
- xpath.add_star_prefix()
- if a == 0:
- if last:
- b = 'last() - %s' % b
- xpath.add_condition('position() = %s' % b)
- return xpath
- if last:
- # FIXME: I'm not sure if this is right
- a = -a
- b = -b
- if b > 0:
- b_neg = str(-b)
- else:
- b_neg = '+%s' % (-b)
- if a != 1:
- expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
- else:
- expr = []
- if b >= 0:
- expr.append('position() >= %s' % b)
- elif b < 0 and last:
- expr.append('position() < (last() %s)' % b)
- expr = ' and '.join(expr)
- if expr:
- xpath.add_condition(expr)
- return xpath
- # FIXME: handle an+b, odd, even
- # an+b means every-a, plus b, e.g., 2n+1 means odd
- # 0n+b means b
- # n+0 means a=1, i.e., all elements
- # an means every a elements, i.e., 2n means even
- # -n means -1n
- # -1n+6 means elements 6 and previous
-
- def _xpath_nth_last_child(self, xpath, expr):
- return self._xpath_nth_child(xpath, expr, last=True)
-
- def _xpath_nth_of_type(self, xpath, expr):
- if xpath.element == '*':
- raise NotImplementedError(
- "*:nth-of-type() is not implemented")
- return self._xpath_nth_child(xpath, expr, add_name_test=False)
-
- def _xpath_nth_last_of_type(self, xpath, expr):
- return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)
-
- def _xpath_contains(self, xpath, expr):
- # text content, minus tags, must contain expr
- if isinstance(expr, Element):
- expr = expr._format_element()
- # Kovid: Use ASCII lower case that works
- xpath.add_condition('contains(%s), %s)' % (
- lower_case('string(.)'),
- xpath_literal(expr.lower())))
- return xpath
-
- def _xpath_not(self, xpath, expr):
- # everything for which not expr applies
- expr = expr.xpath()
- cond = expr.condition
- # FIXME: should I do something about element_path?
- xpath.add_condition('not(%s)' % cond)
- return xpath
-
-# Kovid: Python functions dont work in lxml, so use translate()
-# instead of the python lowercase function
-def lower_case(arg):
- 'An ASCII lowercase function'
- return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
- "'abcdefghijklmnopqrstuvwxyz')")%arg
-
-class Pseudo(object):
- """
- Represents selector:ident
- """
-
- unsupported = ['indeterminate', 'first-line', 'first-letter',
- 'selection', 'before', 'after', 'link', 'visited',
- 'active', 'focus', 'hover']
-
- def __init__(self, element, type, ident):
- self.element = element
- assert type in (':', '::')
- self.type = type
- self.ident = ident
-
- def __repr__(self):
- return '%s[%r%s%s]' % (
- self.__class__.__name__,
- self.element,
- self.type, self.ident)
-
- def xpath(self):
- el_xpath = self.element.xpath()
- if self.ident in self.unsupported:
- raise ExpressionError(
- "The pseudo-class %r is unsupported" % self.ident)
- method = '_xpath_' + self.ident.replace('-', '_')
- if not hasattr(self, method):
- raise ExpressionError(
- "The pseudo-class %r is unknown" % self.ident)
- method = getattr(self, method)
- el_xpath = method(el_xpath)
- return el_xpath
-
- def _xpath_checked(self, xpath):
- # FIXME: is this really all the elements?
- xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
- return xpath
-
- def _xpath_root(self, xpath):
- # if this element is the root element
- raise NotImplementedError
-
- def _xpath_first_child(self, xpath):
- xpath.add_star_prefix()
- xpath.add_name_test()
- xpath.add_condition('position() = 1')
- return xpath
-
- def _xpath_last_child(self, xpath):
- xpath.add_star_prefix()
- xpath.add_name_test()
- xpath.add_condition('position() = last()')
- return xpath
-
- def _xpath_first_of_type(self, xpath):
- if xpath.element == '*':
- raise NotImplementedError(
- "*:first-of-type is not implemented")
- xpath.add_star_prefix()
- xpath.add_condition('position() = 1')
- return xpath
-
- def _xpath_last_of_type(self, xpath):
- if xpath.element == '*':
- raise NotImplementedError(
- "*:last-of-type is not implemented")
- xpath.add_star_prefix()
- xpath.add_condition('position() = last()')
- return xpath
-
- def _xpath_only_child(self, xpath):
- xpath.add_name_test()
- xpath.add_star_prefix()
- xpath.add_condition('last() = 1')
- return xpath
-
- def _xpath_only_of_type(self, xpath):
- if xpath.element == '*':
- raise NotImplementedError(
- "*:only-of-type is not implemented")
- xpath.add_condition('last() = 1')
- return xpath
-
- def _xpath_empty(self, xpath):
- xpath.add_condition("not(*) and not(normalize-space())")
- return xpath
-
-class Attrib(object):
- """
- Represents selector[namespace|attrib operator value]
- """
-
- def __init__(self, selector, namespace, attrib, operator, value):
- self.selector = selector
- self.namespace = namespace
- self.attrib = attrib
- self.operator = operator
- self.value = value
-
- def __repr__(self):
- if self.operator == 'exists':
- return '%s[%r[%s]]' % (
- self.__class__.__name__,
- self.selector,
- self._format_attrib())
- else:
- return '%s[%r[%s %s %r]]' % (
- self.__class__.__name__,
- self.selector,
- self._format_attrib(),
- self.operator,
- self.value)
-
- def _format_attrib(self):
- if self.namespace == '*':
- return self.attrib
- else:
- return '%s|%s' % (self.namespace, self.attrib)
-
- def _xpath_attrib(self):
- # FIXME: if attrib is *?
- if self.namespace == '*':
- return '@' + self.attrib
- else:
- return '@%s:%s' % (self.namespace, self.attrib)
-
- def xpath(self):
- path = self.selector.xpath()
- attrib = self._xpath_attrib()
- value = self.value
- if self.operator == 'exists':
- assert not value
- path.add_condition(attrib)
- elif self.operator == '=':
- path.add_condition('%s = %s' % (attrib,
- xpath_literal(value)))
- elif self.operator == '!=':
- # FIXME: this seems like a weird hack...
- if value:
- path.add_condition('not(%s) or %s != %s'
- % (attrib, attrib, xpath_literal(value)))
- else:
- path.add_condition('%s != %s'
- % (attrib, xpath_literal(value)))
- #path.add_condition('%s != %s' % (attrib, xpath_literal(value)))
- elif self.operator == '~=':
- path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' ')))
- elif self.operator == '|=':
- # Weird, but true...
- path.add_condition('%s = %s or starts-with(%s, %s)' % (
- attrib, xpath_literal(value),
- attrib, xpath_literal(value + '-')))
- elif self.operator == '^=':
- path.add_condition('starts-with(%s, %s)' % (
- attrib, xpath_literal(value)))
- elif self.operator == '$=':
- # Oddly there is a starts-with in XPath 1.0, but not ends-with
- path.add_condition('substring(%s, string-length(%s)-%s) = %s'
- % (attrib, attrib, len(value)-1, xpath_literal(value)))
- elif self.operator == '*=':
- # FIXME: case sensitive?
- path.add_condition('contains(%s, %s)' % (
- attrib, xpath_literal(value)))
- else:
- assert 0, ("Unknown operator: %r" % self.operator)
- return path
-
-class Element(object):
- """
- Represents namespace|element
- """
-
- def __init__(self, namespace, element):
- self.namespace = namespace
- self.element = element
-
- def __repr__(self):
- return '%s[%s]' % (
- self.__class__.__name__,
- self._format_element())
-
- def _format_element(self):
- if self.namespace == '*':
- return self.element
- else:
- return '%s|%s' % (self.namespace, self.element)
-
- def xpath(self):
- if self.namespace == '*':
- el = self.element.lower()
- else:
- # Kovid: Lowercased
- el = '%s:%s' % (self.namespace, self.element.lower())
- return XPathExpr(element=el)
-
-class Hash(object):
- """
- Represents selector#id
- """
-
- def __init__(self, selector, id):
- self.selector = selector
- self.id = id
-
- def __repr__(self):
- return '%s[%r#%s]' % (
- self.__class__.__name__,
- self.selector, self.id)
-
- def xpath(self):
- path = self.selector.xpath()
- path.add_condition('@id = %s' % xpath_literal(self.id))
- return path
-
-class Or(object):
-
- def __init__(self, items):
- self.items = items
- def __repr__(self):
- return '%s(%r)' % (
- self.__class__.__name__,
- self.items)
-
- def xpath(self):
- paths = [item.xpath() for item in self.items]
- return XPathExprOr(paths)
-
-class CombinedSelector(object):
-
- _method_mapping = {
- ' ': 'descendant',
- '>': 'child',
- '+': 'direct_adjacent',
- '~': 'indirect_adjacent',
- }
-
- def __init__(self, selector, combinator, subselector):
- assert selector is not None
- self.selector = selector
- self.combinator = combinator
- self.subselector = subselector
-
- def __repr__(self):
- if self.combinator == ' ':
- comb = ''
- else:
- comb = self.combinator
- return '%s[%r %s %r]' % (
- self.__class__.__name__,
- self.selector,
- comb,
- self.subselector)
-
- def xpath(self):
- if self.combinator not in self._method_mapping:
- raise ExpressionError(
- "Unknown combinator: %r" % self.combinator)
- method = '_xpath_' + self._method_mapping[self.combinator]
- method = getattr(self, method)
- path = self.selector.xpath()
- return method(path, self.subselector)
-
- def _xpath_descendant(self, xpath, sub):
- # when sub is a descendant in any way of xpath
- xpath.join('/descendant::', sub.xpath())
- return xpath
-
- def _xpath_child(self, xpath, sub):
- # when sub is an immediate child of xpath
- xpath.join('/', sub.xpath())
- return xpath
-
- def _xpath_direct_adjacent(self, xpath, sub):
- # when sub immediately follows xpath
- xpath.join('/following-sibling::', sub.xpath())
- xpath.add_name_test()
- xpath.add_condition('position() = 1')
- return xpath
-
- def _xpath_indirect_adjacent(self, xpath, sub):
- # when sub comes somewhere after xpath as a sibling
- xpath.join('/following-sibling::', sub.xpath())
- return xpath
-
-##############################
-## XPathExpr objects:
-
-_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
-_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
-_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
-
-
-def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'):
- if isinstance(css_expr, _basestring):
- match = _el_re.search(css_expr)
- if match is not None:
- # Kovid: Lowercased
- return '%s%s' % (prefix, match.group(0).strip().lower())
- match = _id_re.search(css_expr)
- if match is not None:
- return "%s%s[@id = '%s']" % (
- prefix, match.group(1) or '*', match.group(2))
- match = _class_re.search(css_expr)
- if match is not None:
- # Kovid: lowercased
- return "%s%s[contains(concat(' ', normalize-space(%s), ' '), ' %s ')]" % (
- prefix, match.group(1).lower() or '*',
- lower_case('@class'), match.group(2).lower())
- css_expr = parse(css_expr)
- expr = css_expr.xpath()
- assert expr is not None, (
- "Got None for xpath expression from %s" % repr(css_expr))
- if prefix:
- expr.add_prefix(prefix)
- return _unicode(expr)
-
-class XPathExpr(object):
-
- def __init__(self, prefix=None, path=None, element='*', condition=None,
- star_prefix=False):
- self.prefix = prefix
- self.path = path
- self.element = element
- self.condition = condition
- self.star_prefix = star_prefix
-
- def __str__(self):
- path = ''
- if self.prefix is not None:
- path += _unicode(self.prefix)
- if self.path is not None:
- path += _unicode(self.path)
- path += _unicode(self.element)
- if self.condition:
- path += '[%s]' % self.condition
- return path
-
- def __repr__(self):
- return '%s[%s]' % (
- self.__class__.__name__, self)
-
- def add_condition(self, condition):
- if self.condition:
- self.condition = '%s and (%s)' % (self.condition, condition)
- else:
- self.condition = condition
-
- def add_path(self, part):
- if self.path is None:
- self.path = self.element
- else:
- self.path += self.element
- self.element = part
-
- def add_prefix(self, prefix):
- if self.prefix:
- self.prefix = prefix + self.prefix
- else:
- self.prefix = prefix
-
- def add_name_test(self):
- if self.element == '*':
- # We weren't doing a test anyway
- return
- self.add_condition("name() = %s" % xpath_literal(self.element))
- self.element = '*'
-
- def add_star_prefix(self):
- """
- Adds a /* prefix if there is no prefix. This is when you need
- to keep context's constrained to a single parent.
- """
- if self.path:
- self.path += '*/'
- else:
- self.path = '*/'
- self.star_prefix = True
-
- def join(self, combiner, other):
- prefix = _unicode(self)
- prefix += combiner
- path = (other.prefix or '') + (other.path or '')
- # We don't need a star prefix if we are joining to this other
- # prefix; so we'll get rid of it
- if other.star_prefix and path == '*/':
- path = ''
- self.prefix = prefix
- self.path = path
- self.element = other.element
- self.condition = other.condition
-
-class XPathExprOr(XPathExpr):
- """
- Represents |'d expressions. Note that unfortunately it isn't
- the union, it's the sum, so duplicate elements will appear.
- """
-
- def __init__(self, items, prefix=None):
- for item in items:
- assert item is not None
- self.items = items
- self.prefix = prefix
-
- def __str__(self):
- prefix = self.prefix or ''
- return ' | '.join(["%s%s" % (prefix,i) for i in self.items])
-
-split_at_single_quotes = re.compile("('+)").split
-
-def xpath_literal(s):
- if isinstance(s, Element):
- # This is probably a symbol that looks like an expression...
- s = s._format_element()
- else:
- s = _unicode(s)
- if "'" not in s:
- s = "'%s'" % s
- elif '"' not in s:
- s = '"%s"' % s
- else:
- s = "concat(%s)" % ','.join([
- (("'" in part) and '"%s"' or "'%s'") % part
- for part in split_at_single_quotes(s) if part
- ])
- return s
-
-##############################
-## Parsing functions
-
-def parse(string):
- stream = TokenStream(tokenize(string))
- stream.source = string
- try:
- return parse_selector_group(stream)
- except SelectorSyntaxError:
- import sys
- e = sys.exc_info()[1]
- message = "%s at %s -> %r" % (
- e, stream.used, stream.peek())
- e.msg = message
- if sys.version_info < (2,6):
- e.message = message
- e.args = tuple([message])
- raise
-
-def parse_selector_group(stream):
- result = []
- while 1:
- result.append(parse_selector(stream))
- if stream.peek() == ',':
- stream.next()
- else:
- break
- if len(result) == 1:
- return result[0]
- else:
- return Or(result)
-
-def parse_selector(stream):
- result = parse_simple_selector(stream)
- while 1:
- peek = stream.peek()
- if peek == ',' or peek is None:
- return result
- elif peek in ('+', '>', '~'):
- # A combinator
- combinator = stream.next()
- else:
- combinator = ' '
- consumed = len(stream.used)
- next_selector = parse_simple_selector(stream)
- if consumed == len(stream.used):
- raise SelectorSyntaxError(
- "Expected selector, got '%s'" % stream.peek())
- result = CombinedSelector(result, combinator, next_selector)
- return result
-
-def parse_simple_selector(stream):
- peek = stream.peek()
- if peek != '*' and not isinstance(peek, Symbol):
- element = namespace = '*'
- else:
- next = stream.next()
- if next != '*' and not isinstance(next, Symbol):
- raise SelectorSyntaxError(
- "Expected symbol, got '%s'" % next)
- if stream.peek() == '|':
- namespace = next
- stream.next()
- element = stream.next()
- if element != '*' and not isinstance(next, Symbol):
- raise SelectorSyntaxError(
- "Expected symbol, got '%s'" % next)
- else:
- namespace = '*'
- element = next
- result = Element(namespace, element)
- has_hash = False
- while 1:
- peek = stream.peek()
- if peek == '#':
- if has_hash:
- # You can't have two hashes
- # (FIXME: is there some more general rule I'm missing?)
- break
- stream.next()
- result = Hash(result, stream.next())
- has_hash = True
- continue
- elif peek == '.':
- stream.next()
- result = Class(result, stream.next())
- continue
- elif peek == '[':
- stream.next()
- result = parse_attrib(result, stream)
- next = stream.next()
- if not next == ']':
- raise SelectorSyntaxError(
- "] expected, got '%s'" % next)
- continue
- elif peek == ':' or peek == '::':
- type = stream.next()
- ident = stream.next()
- if not isinstance(ident, Symbol):
- raise SelectorSyntaxError(
- "Expected symbol, got '%s'" % ident)
- if stream.peek() == '(':
- stream.next()
- peek = stream.peek()
- if isinstance(peek, String):
- selector = stream.next()
- elif isinstance(peek, Symbol) and is_int(peek):
- selector = int(stream.next())
- else:
- # FIXME: parse_simple_selector, or selector, or...?
- selector = parse_simple_selector(stream)
- next = stream.next()
- if not next == ')':
- raise SelectorSyntaxError(
- "Expected ')', got '%s' and '%s'"
- % (next, selector))
- result = Function(result, type, ident, selector)
- else:
- result = Pseudo(result, type, ident)
- continue
- else:
- if peek == ' ':
- stream.next()
- break
- # FIXME: not sure what "negation" is
- return result
-
-def is_int(v):
- try:
- int(v)
- except ValueError:
- return False
- else:
- return True
-
-def parse_attrib(selector, stream):
- attrib = stream.next()
- if stream.peek() == '|':
- namespace = attrib
- stream.next()
- attrib = stream.next()
- else:
- namespace = '*'
- if stream.peek() == ']':
- return Attrib(selector, namespace, attrib, 'exists', None)
- op = stream.next()
- if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
- raise SelectorSyntaxError(
- "Operator expected, got '%s'" % op)
- value = stream.next()
- if not isinstance(value, (Symbol, String)):
- raise SelectorSyntaxError(
- "Expected string or symbol, got '%s'" % value)
- return Attrib(selector, namespace, attrib, op, value)
-
-def parse_series(s):
- """
- Parses things like '1n+2', or 'an+b' generally, returning (a, b)
- """
- if isinstance(s, Element):
- s = s._format_element()
- if not s or s == '*':
- # Happens when there's nothing, which the CSS parser thinks of as *
- return (0, 0)
- if isinstance(s, int):
- # Happens when you just get a number
- return (0, s)
- if s == 'odd':
- return (2, 1)
- elif s == 'even':
- return (2, 0)
- elif s == 'n':
- return (1, 0)
- if 'n' not in s:
- # Just a b
- return (0, int(s))
- a, b = s.split('n', 1)
- if not a:
- a = 1
- elif a == '-' or a == '+':
- a = int(a+'1')
- else:
- a = int(a)
- if not b:
- b = 0
- elif b == '-' or b == '+':
- b = int(b+'1')
- else:
- b = int(b)
- return (a, b)
-
-
-############################################################
-## Tokenizing
-############################################################
-
-_match_whitespace = re.compile(r'\s+', re.UNICODE).match
-
-_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub
-
-_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match
-
-def tokenize(s):
- pos = 0
- s = _replace_comments('', s)
- while 1:
- match = _match_whitespace(s, pos=pos)
- if match:
- preceding_whitespace_pos = pos
- pos = match.end()
- else:
- preceding_whitespace_pos = 0
- if pos >= len(s):
- return
- match = _match_count_number(s, pos=pos)
- if match and match.group() != 'n':
- sym = s[pos:match.end()]
- yield Symbol(sym, pos)
- pos = match.end()
- continue
- c = s[pos]
- c2 = s[pos:pos+2]
- if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
- yield Token(c2, pos)
- pos += 2
- continue
- if c in '>+~,.*=[]()|:#':
- if c in '.#[' and preceding_whitespace_pos > 0:
- yield Token(' ', preceding_whitespace_pos)
- yield Token(c, pos)
- pos += 1
- continue
- if c == '"' or c == "'":
- # Quoted string
- old_pos = pos
- sym, pos = tokenize_escaped_string(s, pos)
- yield String(sym, old_pos)
- continue
- old_pos = pos
- sym, pos = tokenize_symbol(s, pos)
- yield Symbol(sym, old_pos)
- continue
-
-split_at_string_escapes = re.compile(r'(\\(?:%s))'
- % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
- '[^A-Fa-f0-9]'])).split
-
-def unescape_string_literal(literal):
- substrings = []
- for substring in split_at_string_escapes(literal):
- if not substring:
- continue
- elif '\\' in substring:
- if substring[0] == '\\' and len(substring) > 1:
- substring = substring[1:]
- if substring[0] in '0123456789ABCDEFabcdef':
- # int() correctly ignores the potentially trailing whitespace
- substring = _unichr(int(substring, 16))
- else:
- raise SelectorSyntaxError(
- "Invalid escape sequence %r in string %r"
- % (substring.split('\\')[1], literal))
- substrings.append(substring)
- return ''.join(substrings)
-
-def tokenize_escaped_string(s, pos):
- quote = s[pos]
- assert quote in ('"', "'")
- pos = pos+1
- start = pos
- while 1:
- next = s.find(quote, pos)
- if next == -1:
- raise SelectorSyntaxError(
- "Expected closing %s for string in: %r"
- % (quote, s[start:]))
- result = s[start:next]
- if result.endswith('\\'):
- # next quote character is escaped
- pos = next+1
- continue
- if '\\' in result:
- result = unescape_string_literal(result)
- return result, next+1
-
-_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
-
-def tokenize_symbol(s, pos):
- start = pos
- match = _illegal_symbol.search(s, pos=pos)
- if not match:
- # Goes to end of s
- return s[start:], len(s)
- if match.start() == pos:
- assert 0, (
- "Unexpected symbol: %r at %s" % (s[pos], pos))
- if not match:
- result = s[start:]
- pos = len(s)
- else:
- result = s[start:match.start()]
- pos = match.start()
- try:
- result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
- except UnicodeDecodeError:
- import sys
- e = sys.exc_info()[1]
- raise SelectorSyntaxError(
- "Bad symbol %r: %s" % (result, e))
- return result, pos
-
-class TokenStream(object):
-
- def __init__(self, tokens, source=None):
- self.used = []
- self.tokens = iter(tokens)
- self.source = source
- self.peeked = None
- self._peeking = False
- try:
- self.next_token = self.tokens.next
- except AttributeError:
- # Python 3
- self.next_token = self.tokens.__next__
-
- def next(self):
- if self._peeking:
- self._peeking = False
- self.used.append(self.peeked)
- return self.peeked
- else:
- try:
- next = self.next_token()
- self.used.append(next)
- return next
- except StopIteration:
- return None
-
- def __iter__(self):
- return iter(self.next, None)
-
- def peek(self):
- if not self._peeking:
- try:
- self.peeked = self.next_token()
- except StopIteration:
- return None
- self._peeking = True
- return self.peeked
diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py
index 37641c91f2..68978b9637 100644
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@@ -20,16 +20,17 @@ except ImportError:
from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
cssutils_log, CSSParser, profiles, replaceUrls)
from lxml import etree
-from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError
+from cssselect import HTMLTranslator
+
from calibre import force_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
-from calibre.ebooks.cssselect import css_to_xpath_no_case
cssutils_log.setLevel(logging.WARN)
_html_css_stylesheet = None
+css_to_xpath = HTMLTranslator().css_to_xpath
def html_css_stylesheet():
global _html_css_stylesheet
@@ -96,70 +97,86 @@ DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll',
FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
'x-large', 'xx-large'])
+def xpath_lower_case(arg):
+ 'An ASCII lowercase function for XPath'
+ return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
+ "'abcdefghijklmnopqrstuvwxyz')")%arg
+is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match
+
+class CaseInsensitiveAttributesTranslator(HTMLTranslator):
+ 'Treat class and id CSS selectors case-insensitively'
+
+ def xpath_class(self, class_selector):
+ """Translate a class selector."""
+ x = self.xpath(class_selector.selector)
+ if is_non_whitespace(class_selector.class_name):
+ x.add_condition(
+ "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
+ % ('@class', xpath_lower_case('@class'), self.xpath_literal(
+ ' '+class_selector.class_name.lower()+' ')))
+ else:
+ x.add_condition('0')
+ return x
+
+ def xpath_hash(self, id_selector):
+ """Translate an ID selector."""
+ x = self.xpath(id_selector.selector)
+ return self.xpath_attrib_equals(x, xpath_lower_case('@id'),
+ (id_selector.id.lower()))
+
+ci_css_to_xpath = CaseInsensitiveAttributesTranslator().css_to_xpath
class CSSSelector(object):
- LOCAL_NAME_RE = re.compile(r"(?' % (
- self.__class__.__name__,
- hex(abs(id(self)))[2:],
- self.css)
-
_selector_cache = {}
MIN_SPACE_RE = re.compile(r' *([>~+]) *')
-def get_css_selector(raw_selector):
+def get_css_selector(raw_selector, log):
css = MIN_SPACE_RE.sub(r'\1', raw_selector)
- if isinstance(css, unicode):
- # Workaround for bug in lxml on windows/OS X that causes a massive
- # memory leak with non ASCII selectors
- css = css.encode('ascii', 'ignore').decode('ascii')
ans = _selector_cache.get(css, None)
if ans is None:
- ans = CSSSelector(css)
+ ans = CSSSelector(css, log)
_selector_cache[css] = ans
return ans
@@ -272,7 +289,7 @@ class Stylizer(object):
fl = pseudo_pat.search(text)
if fl is not None:
text = text.replace(fl.group(), '')
- selector = get_css_selector(text)
+ selector = get_css_selector(text, self.oeb.log)
matches = selector(tree, self.logger)
if fl is not None:
fl = fl.group(1)
diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py
index 7ad4bc50d9..e46ddb5fb5 100644
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -13,7 +13,7 @@ import os, math, functools, collections, re, copy
from lxml.etree import XPath as _XPath
from lxml import etree
-from lxml.cssselect import CSSSelector
+from cssselect import HTMLTranslator
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize)
@@ -73,6 +73,7 @@ class Split(object):
def find_page_breaks(self, item):
if self.page_break_selectors is None:
+ css_to_xpath = HTMLTranslator().css_to_xpath
self.page_break_selectors = set([])
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES]
@@ -83,7 +84,7 @@ class Split(object):
'page-break-after'), 'cssText', '').strip().lower()
try:
if before and before not in {'avoid', 'auto', 'inherit'}:
- self.page_break_selectors.add((CSSSelector(rule.selectorText),
+ self.page_break_selectors.add((XPath(css_to_xpath(rule.selectorText)),
True))
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-before')
@@ -91,7 +92,7 @@ class Split(object):
pass
try:
if after and after not in {'avoid', 'auto', 'inherit'}:
- self.page_break_selectors.add((CSSSelector(rule.selectorText),
+ self.page_break_selectors.add((XPath(css_to_xpath(rule.selectorText)),
False))
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-after')
diff --git a/src/calibre/ebooks/readability/htmls.py b/src/calibre/ebooks/readability/htmls.py
index 11f9adebb9..f882f95254 100644
--- a/src/calibre/ebooks/readability/htmls.py
+++ b/src/calibre/ebooks/readability/htmls.py
@@ -64,8 +64,12 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)
- for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
- for e in doc.cssselect(item):
+ from cssselect import HTMLTranslator
+ css_to_xpath = HTMLTranslator().css_to_xpath
+ for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title',
+ '.title', '.head', '.heading', '.contentheading',
+ '.small_header_red'):
+ for e in doc.xpath(css_to_xpath(item)):
if e.text:
add_match(candidates, e.text, orig)
if e.text_content():