mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add some documentation for css_selectors
Also allow the Select class to work with other tree implementations
This commit is contained in:
parent
77726d774a
commit
6150a664c2
@ -6,4 +6,8 @@ from __future__ import (unicode_literals, division, absolute_import,
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
from css_selectors.parse import parse
|
||||||
|
from css_selectors.select import Select
|
||||||
|
from css_selectors.errors import SelectorError, SelectorSyntaxError, ExpressionError
|
||||||
|
|
||||||
|
__all__ = ['parse', 'Select', 'SelectorError', 'SelectorSyntaxError', 'ExpressionError']
|
||||||
|
@ -80,6 +80,28 @@ def normalize_language_tag(tag):
|
|||||||
|
|
||||||
class Select(object):
|
class Select(object):
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
This class implements CSS Level 3 selectors on an lxml tree, with caching
|
||||||
|
for performance. To use:
|
||||||
|
|
||||||
|
>>> select = Select(root)
|
||||||
|
>>> print(tuple(select('p.myclass')))
|
||||||
|
|
||||||
|
Tags are returned in document order. Note that attribute and tag names are
|
||||||
|
matched case-insensitively. Also namespaces are ignored (this is for
|
||||||
|
performance of the common case).
|
||||||
|
|
||||||
|
WARNING: This class uses internal caches. You *must not* make any changes
|
||||||
|
to the lxml tree. If you do make some changes, either create a new Select
|
||||||
|
object or call :meth:`invalidate_caches`.
|
||||||
|
|
||||||
|
This class can be easily sub-classed to work with tree implementations
|
||||||
|
other than lxml. Simply override the methods in the ``Tree Integration``
|
||||||
|
block.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
combinator_mapping = {
|
combinator_mapping = {
|
||||||
' ': 'descendant',
|
' ': 'descendant',
|
||||||
'>': 'child',
|
'>': 'child',
|
||||||
@ -106,6 +128,7 @@ class Select(object):
|
|||||||
self.dispatch_map = {k:trace_wrapper(v) for k, v in self.dispatch_map.iteritems()}
|
self.dispatch_map = {k:trace_wrapper(v) for k, v in self.dispatch_map.iteritems()}
|
||||||
|
|
||||||
def invalidate_caches(self):
|
def invalidate_caches(self):
|
||||||
|
'Invalidate all caches. You must call this before using this object if you have made changes to the HTML tree'
|
||||||
self._element_map = None
|
self._element_map = None
|
||||||
self._id_map = None
|
self._id_map = None
|
||||||
self._class_map = None
|
self._class_map = None
|
||||||
@ -114,6 +137,7 @@ class Select(object):
|
|||||||
self._lang_map = None
|
self._lang_map = None
|
||||||
|
|
||||||
def __call__(self, selector):
|
def __call__(self, selector):
|
||||||
|
'Return an iterator over all matching tags, in document order.'
|
||||||
seen = set()
|
seen = set()
|
||||||
for selector in get_parsed_selector(selector):
|
for selector in get_parsed_selector(selector):
|
||||||
parsed_selector = selector.parsed_tree
|
parsed_selector = selector.parsed_tree
|
||||||
@ -140,7 +164,7 @@ class Select(object):
|
|||||||
def map_tag_name(x):
|
def map_tag_name(x):
|
||||||
return ascii_lower(x.rpartition('}')[2])
|
return ascii_lower(x.rpartition('}')[2])
|
||||||
|
|
||||||
for tag in self.root.iter('*'):
|
for tag in self.itertag():
|
||||||
em[map_tag_name(tag.tag)].add(tag)
|
em[map_tag_name(tag.tag)].add(tag)
|
||||||
return self._element_map
|
return self._element_map
|
||||||
|
|
||||||
@ -149,7 +173,7 @@ class Select(object):
|
|||||||
if self._id_map is None:
|
if self._id_map is None:
|
||||||
self._id_map = im = defaultdict(OrderedSet)
|
self._id_map = im = defaultdict(OrderedSet)
|
||||||
lower = ascii_lower
|
lower = ascii_lower
|
||||||
for elem in get_compiled_xpath('//*[@id]')(self.root):
|
for elem in self.iteridtags():
|
||||||
im[lower(elem.get('id'))].add(elem)
|
im[lower(elem.get('id'))].add(elem)
|
||||||
return self._id_map
|
return self._id_map
|
||||||
|
|
||||||
@ -158,7 +182,7 @@ class Select(object):
|
|||||||
if self._class_map is None:
|
if self._class_map is None:
|
||||||
self._class_map = cm = defaultdict(OrderedSet)
|
self._class_map = cm = defaultdict(OrderedSet)
|
||||||
lower = ascii_lower
|
lower = ascii_lower
|
||||||
for elem in get_compiled_xpath('//*[@class]')(self.root):
|
for elem in self.iterclasstags():
|
||||||
for cls in elem.get('class').split():
|
for cls in elem.get('class').split():
|
||||||
cm[lower(cls)].add(elem)
|
cm[lower(cls)].add(elem)
|
||||||
return self._class_map
|
return self._class_map
|
||||||
@ -171,7 +195,7 @@ class Select(object):
|
|||||||
if '{' in self.root.tag:
|
if '{' in self.root.tag:
|
||||||
def map_attrib_name(x):
|
def map_attrib_name(x):
|
||||||
return ascii_lower(x.rpartition('}')[2])
|
return ascii_lower(x.rpartition('}')[2])
|
||||||
for tag in self.root.iter('*'):
|
for tag in self.itertag():
|
||||||
for attr, val in tag.attrib.iteritems():
|
for attr, val in tag.attrib.iteritems():
|
||||||
am[map_attrib_name(attr)][val].add(tag)
|
am[map_attrib_name(attr)][val].add(tag)
|
||||||
return self._attrib_map
|
return self._attrib_map
|
||||||
@ -184,7 +208,7 @@ class Select(object):
|
|||||||
if '{' in self.root.tag:
|
if '{' in self.root.tag:
|
||||||
def map_attrib_name(x):
|
def map_attrib_name(x):
|
||||||
return ascii_lower(x.rpartition('}')[2])
|
return ascii_lower(x.rpartition('}')[2])
|
||||||
for tag in self.root.iter('*'):
|
for tag in self.itertag():
|
||||||
for attr, val in tag.attrib.iteritems():
|
for attr, val in tag.attrib.iteritems():
|
||||||
for v in val.split():
|
for v in val.split():
|
||||||
am[map_attrib_name(attr)][v].add(tag)
|
am[map_attrib_name(attr)][v].add(tag)
|
||||||
@ -195,20 +219,40 @@ class Select(object):
|
|||||||
if self._lang_map is None:
|
if self._lang_map is None:
|
||||||
self._lang_map = lm = defaultdict(OrderedSet)
|
self._lang_map = lm = defaultdict(OrderedSet)
|
||||||
dl = normalize_language_tag(self.default_lang) if self.default_lang else None
|
dl = normalize_language_tag(self.default_lang) if self.default_lang else None
|
||||||
lmap = {tag:dl for tag in self.root.iter('*')} if dl else {}
|
lmap = {tag:dl for tag in self.itertag()} if dl else {}
|
||||||
for tag in self.root.iter('*'):
|
for tag in self.itertag():
|
||||||
lang = None
|
lang = None
|
||||||
for attr in ('{http://www.w3.org/XML/1998/namespace}lang', 'lang'):
|
for attr in ('{http://www.w3.org/XML/1998/namespace}lang', 'lang'):
|
||||||
lang = tag.get(attr)
|
lang = tag.get(attr)
|
||||||
if lang:
|
if lang:
|
||||||
lang = normalize_language_tag(lang)
|
lang = normalize_language_tag(lang)
|
||||||
for dtag in tag.iter('*'):
|
for dtag in self.itertag(tag):
|
||||||
lmap[dtag] = lang
|
lmap[dtag] = lang
|
||||||
for tag, langs in lmap.iteritems():
|
for tag, langs in lmap.iteritems():
|
||||||
for lang in langs:
|
for lang in langs:
|
||||||
lm[lang].add(tag)
|
lm[lang].add(tag)
|
||||||
return self._lang_map
|
return self._lang_map
|
||||||
|
|
||||||
|
# Tree Integration {{{
|
||||||
|
def itertag(self, tag=None):
|
||||||
|
return (self.root if tag is None else tag).iter('*')
|
||||||
|
|
||||||
|
def iterdescendants(self, tag=None):
|
||||||
|
return (self.root if tag is None else tag).iterdescendants('*')
|
||||||
|
|
||||||
|
def iterchildren(self, tag=None):
|
||||||
|
return (self.root if tag is None else tag).iterchildren('*')
|
||||||
|
|
||||||
|
def itersiblings(self, tag=None, preceding=False):
|
||||||
|
return (self.root if tag is None else tag).itersiblings('*', preceding=preceding)
|
||||||
|
|
||||||
|
def iteridtags(self):
|
||||||
|
return get_compiled_xpath('//*[@id]')(self.root)
|
||||||
|
|
||||||
|
def iterclasstags(self):
|
||||||
|
return get_compiled_xpath('//*[@class]')(self.root)
|
||||||
|
# }}}
|
||||||
|
|
||||||
# Combinators {{{
|
# Combinators {{{
|
||||||
|
|
||||||
def select_combinedselector(cache, combined):
|
def select_combinedselector(cache, combined):
|
||||||
@ -217,39 +261,39 @@ def select_combinedselector(cache, combined):
|
|||||||
# Fast path for when the sub-selector is all elements
|
# Fast path for when the sub-selector is all elements
|
||||||
right = None if isinstance(combined.subselector, Element) and (
|
right = None if isinstance(combined.subselector, Element) and (
|
||||||
combined.subselector.element or '*') == '*' else cache.iterparsedselector(combined.subselector)
|
combined.subselector.element or '*') == '*' else cache.iterparsedselector(combined.subselector)
|
||||||
for item in cache.dispatch_map[combinator](cache.iterparsedselector(combined.selector), right):
|
for item in cache.dispatch_map[combinator](cache, cache.iterparsedselector(combined.selector), right):
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
def select_descendant(left, right):
|
def select_descendant(cache, left, right):
|
||||||
"""right is a child, grand-child or further descendant of left"""
|
"""right is a child, grand-child or further descendant of left"""
|
||||||
right = always_in if right is None else frozenset(right)
|
right = always_in if right is None else frozenset(right)
|
||||||
for ancestor in left:
|
for ancestor in left:
|
||||||
for descendant in ancestor.iterdescendants('*'):
|
for descendant in cache.iterdescendants(ancestor):
|
||||||
if descendant in right:
|
if descendant in right:
|
||||||
yield descendant
|
yield descendant
|
||||||
|
|
||||||
def select_child(left, right):
|
def select_child(cache, left, right):
|
||||||
"""right is an immediate child of left"""
|
"""right is an immediate child of left"""
|
||||||
right = always_in if right is None else frozenset(right)
|
right = always_in if right is None else frozenset(right)
|
||||||
for parent in left:
|
for parent in left:
|
||||||
for child in parent.iterchildren('*'):
|
for child in cache.iterchildren(parent):
|
||||||
if child in right:
|
if child in right:
|
||||||
yield child
|
yield child
|
||||||
|
|
||||||
def select_direct_adjacent(left, right):
|
def select_direct_adjacent(cache, left, right):
|
||||||
"""right is a sibling immediately after left"""
|
"""right is a sibling immediately after left"""
|
||||||
right = always_in if right is None else frozenset(right)
|
right = always_in if right is None else frozenset(right)
|
||||||
for parent in left:
|
for parent in left:
|
||||||
for sibling in parent.itersiblings('*'):
|
for sibling in cache.itersiblings(parent):
|
||||||
if sibling in right:
|
if sibling in right:
|
||||||
yield sibling
|
yield sibling
|
||||||
break
|
break
|
||||||
|
|
||||||
def select_indirect_adjacent(left, right):
|
def select_indirect_adjacent(cache, left, right):
|
||||||
"""right is a sibling after left, immediately or not"""
|
"""right is a sibling after left, immediately or not"""
|
||||||
right = always_in if right is None else frozenset(right)
|
right = always_in if right is None else frozenset(right)
|
||||||
for parent in left:
|
for parent in left:
|
||||||
for sibling in parent.itersiblings('*'):
|
for sibling in cache.itersiblings(parent):
|
||||||
if sibling in right:
|
if sibling in right:
|
||||||
yield sibling
|
yield sibling
|
||||||
# }}}
|
# }}}
|
||||||
@ -258,7 +302,7 @@ def select_element(cache, selector):
|
|||||||
"""A type or universal selector."""
|
"""A type or universal selector."""
|
||||||
element = selector.element
|
element = selector.element
|
||||||
if not element or element == '*':
|
if not element or element == '*':
|
||||||
for elem in cache.root.iter('*'):
|
for elem in cache.itertag():
|
||||||
yield elem
|
yield elem
|
||||||
else:
|
else:
|
||||||
for elem in cache.element_map[ascii_lower(element)]:
|
for elem in cache.element_map[ascii_lower(element)]:
|
||||||
@ -367,4 +411,4 @@ if __name__ == '__main__':
|
|||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
root = etree.fromstring('<body xmlns="xxx" xml:lang="en"><p id="p" class="one two" lang="fr"><a id="a"/></p></body>')
|
root = etree.fromstring('<body xmlns="xxx" xml:lang="en"><p id="p" class="one two" lang="fr"><a id="a"/></p></body>')
|
||||||
select = Select(root, trace=True)
|
select = Select(root, trace=True)
|
||||||
pprint(list(select(':lang(en)')))
|
pprint(list(select('p a')))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user