mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Forgot to exclude ruby tags from searching in the native viewer
Also fix ignore_text not recursing into child nodes, and add some performance improvements to the Python function for extracting searchable text
This commit is contained in:
parent
f3b35c318f
commit
69cf7e684b
@ -224,21 +224,24 @@ class SearchResult:
|
|||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def searchable_text_for_name(name):
|
def searchable_text_for_name(name):
|
||||||
ans = []
|
ans = []
|
||||||
|
add_text = ans.append
|
||||||
serialized_data = json.loads(get_data(name)[0])
|
serialized_data = json.loads(get_data(name)[0])
|
||||||
stack = []
|
stack = []
|
||||||
|
a = stack.append
|
||||||
removed_tails = []
|
removed_tails = []
|
||||||
|
no_visit = frozenset({'script', 'style', 'title', 'head'})
|
||||||
|
ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
|
||||||
for child in serialized_data['tree']['c']:
|
for child in serialized_data['tree']['c']:
|
||||||
if child.get('n') == 'body':
|
if child.get('n') == 'body':
|
||||||
stack.append(child)
|
a((child, False))
|
||||||
# the JS code does not add the tail of body tags to flat text
|
# the JS code does not add the tail of body tags to flat text
|
||||||
removed_tails.append((child.pop('l', None), child))
|
removed_tails.append((child.pop('l', None), child))
|
||||||
ignore_text = {'script', 'style', 'title'}
|
|
||||||
text_pos = 0
|
text_pos = 0
|
||||||
anchor_offset_map = OrderedDict()
|
anchor_offset_map = OrderedDict()
|
||||||
while stack:
|
while stack:
|
||||||
node = stack.pop()
|
node, text_ignored_in_parent = stack.pop()
|
||||||
if isinstance(node, str):
|
if isinstance(node, str):
|
||||||
ans.append(node)
|
add_text(node)
|
||||||
text_pos += len(node)
|
text_pos += len(node)
|
||||||
continue
|
continue
|
||||||
g = node.get
|
g = node.get
|
||||||
@ -253,13 +256,18 @@ def searchable_text_for_name(name):
|
|||||||
aid = x[1]
|
aid = x[1]
|
||||||
if aid not in anchor_offset_map:
|
if aid not in anchor_offset_map:
|
||||||
anchor_offset_map[aid] = text_pos
|
anchor_offset_map[aid] = text_pos
|
||||||
if name and text and name not in ignore_text:
|
if name in no_visit:
|
||||||
ans.append(text)
|
continue
|
||||||
|
ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
|
||||||
|
|
||||||
|
if text and not ignore_text_in_node_and_children:
|
||||||
|
add_text(text)
|
||||||
text_pos += len(text)
|
text_pos += len(text)
|
||||||
if tail:
|
if tail and not text_ignored_in_parent:
|
||||||
stack.append(tail)
|
a((tail, ignore_text_in_node_and_children))
|
||||||
if children:
|
if children:
|
||||||
stack.extend(reversed(children))
|
for child in reversed(children):
|
||||||
|
a((child, ignore_text_in_node_and_children))
|
||||||
for (tail, body) in removed_tails:
|
for (tail, body) in removed_tails:
|
||||||
if tail is not None:
|
if tail is not None:
|
||||||
body['l'] = tail
|
body['l'] = tail
|
||||||
|
@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map):
|
|||||||
anchor_offset_map[aid] = text_pos
|
anchor_offset_map[aid] = text_pos
|
||||||
if no_visit[src.n]:
|
if no_visit[src.n]:
|
||||||
continue
|
continue
|
||||||
ignore_text_in_node_and_children = v'!!ignore_text[src.n]'
|
ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
|
||||||
if not ignore_text_in_node_and_children and src.x:
|
if not ignore_text_in_node_and_children and src.x:
|
||||||
ans.push(src.x)
|
ans.push(src.x)
|
||||||
text_pos += src.x.length
|
text_pos += src.x.length
|
||||||
|
Loading…
x
Reference in New Issue
Block a user