Implement text search for HTML files

This commit is contained in:
Kovid Goyal 2016-06-02 17:36:44 +05:30
parent 3c27f28fdb
commit 298b664669
3 changed files with 92 additions and 4 deletions

View File

@ -10,7 +10,7 @@ import sys, re
from operator import itemgetter
from cssutils import parseStyle
from PyQt5.Qt import QTextEdit, Qt
from PyQt5.Qt import QTextEdit, Qt, QTextCursor
from calibre import prepare_string_for_xml, xml_entity_to_unicode
from calibre.ebooks.oeb.polish.container import OEB_DOCS
@ -672,9 +672,62 @@ class Smarts(NullSmarts):
return 'complete_names', (names_type, doc_name, c.root), query
def find_text(self, pat, cursor):
    '''
    Search for ``pat`` in the document text covered by ``cursor``,
    ignoring everything that lies between tag boundaries.

    The document is walked block by block from position 0. Every block's
    user data supplies a list of boundaries (``block.userData().tags``),
    each with an ``offset`` inside the block and an ``is_start`` flag.
    After a boundary with ``is_start`` set, text is ignored until a
    boundary with ``is_start`` unset is seen. The collected pieces of
    text, clipped to the cursor range, are handed to
    ``find_text_in_chunks()`` together with their document positions.

    :param pat: Compiled pattern, passed unchanged to ``find_text_in_chunks()``
    :param cursor: Cursor whose position and anchor delimit the search range
    :return: ``(found, start, end)``. ``found`` is False when either
             ``start`` or ``end`` is -1
    '''
    from calibre.gui2.tweak_book.text_search import find_text_in_chunks
    chunks = []

    c = QTextCursor(cursor)
    c.setPosition(0)
    in_text = True
    block = c.block()
    cstart = min(cursor.position(), cursor.anchor())
    cend = max(cursor.position(), cursor.anchor())

    def append(text, start):
        # Record text, clipped to the range cstart..cend, along with the
        # document position of its first retained character.
        after = start + len(text)
        if start <= cend and cstart < after:
            extra = after - (cend + 1)
            if extra > 0:
                text = text[:-extra]
            extra = cstart - start
            if extra > 0:
                text = text[extra:]
            chunks.append((text, start + max(extra, 0)))

    # The walk has to begin at the first block so that in_text is correct
    # when the search range is reached. The loop must therefore not stop at
    # blocks that end before cstart, otherwise a range that begins after
    # the first block would never be searched at all.
    while block.isValid() and block.position() <= cend:
        boundaries = sorted(block.userData().tags, key=get_offset)
        if block.position() + block.length() <= cstart:
            # Block lies entirely before the range: nothing from it can be
            # collected, only the in/out of text state has to be carried on.
            if boundaries:
                in_text = not boundaries[-1].is_start
        elif not boundaries:
            # Add the whole line
            if in_text:
                text = block.text()
                if text:
                    append(text, block.position())
        else:
            start = block.position()
            c.setPosition(start)
            for b in boundaries:
                if in_text:
                    # Select from the end of the previous boundary up to
                    # this one
                    c.setPosition(start + b.offset, c.KeepAnchor)
                    if c.hasSelection():
                        append(c.selectedText(), c.anchor())
                in_text = not b.is_start
                c.setPosition(start + b.offset + 1)
            if in_text:
                # Add remaining text in block
                c.setPosition(block.position() + boundaries[-1].offset + 1)
                c.movePosition(c.EndOfBlock, c.KeepAnchor)
                if c.hasSelection():
                    append(c.selectedText(), c.anchor())
        block = block.next()

    s, e = find_text_in_chunks(pat, chunks)
    return s != -1 and e != -1, s, e
if __name__ == '__main__': # {{{
from calibre.gui2.tweak_book.editor.widget import launch_editor
launch_editor('''\
if sys.argv[-1].endswith('.html'):
raw = lopen(sys.argv[-1], 'rb').read().decode('utf-8')
else:
raw = '''\
<!DOCTYPE html>
<html xml:lang="en" lang="en">
<!--
@ -703,5 +756,9 @@ if __name__ == '__main__': # {{{
<p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
</body>
</html>
''', path_is_raw=True, syntax='xml')
'''
def callback(ed):
import regex
ed.find_text(regex.compile('A bold word'))
launch_editor(raw, path_is_raw=True, syntax='html', callback=callback)
# }}}

View File

@ -380,6 +380,7 @@ class TextEdit(PlainTextEdit):
pos = c.End if reverse else c.Start
c.movePosition(pos, c.KeepAnchor)
if hasattr(self.smarts, 'find_text'):
self.highlighter.join()
found, start, end = self.smarts.find_text(pat, c)
if not found:
return False

View File

@ -166,7 +166,7 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
for fname, syntax in files.iteritems():
ed = editors.get(fname, None)
if ed is not None:
if ed.find_text(pat, complete=True, save_match='gui'):
if ed.find_text(pat, complete=True):
show_editor(fname)
return True
else:
@ -182,3 +182,33 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
msg = '<p>' + _('No matches were found for %s') % ('<pre style="font-style:italic">' + prepare_string_for_xml(search['find']) + '</pre>')
return error_dialog(gui_parent, _('Not found'), msg, show=True)
def find_text_in_chunks(pat, chunks):
    '''
    Search for ``pat`` in the concatenation of all chunk texts and map the
    match back to the positions associated with the chunks.

    :param pat: Object with a ``search(text)`` method returning ``None`` or
                a match object that provides ``span()``
    :param chunks: Sequence of ``(text, start_position)`` pairs, where
                   ``start_position`` is the position of the first
                   character of ``text``
    :return: ``(start, end)`` positions of the first match, ``end`` being
             the position just after the last matched character. A match
             that spans several chunks starts in the first and ends in the
             last of them. A zero-width match yields ``start == end``.
             ``(-1, -1)`` is returned when nothing matches.
    '''
    text = ''.join(x[0] for x in chunks)
    m = pat.search(text)
    if m is None:
        return -1, -1
    start, after = m.span()
    # A zero-width match has no last character to locate, it is fully
    # described by the position of its start.
    zero_width = start == after

    offset = 0  # offset of the current chunk inside the joined text
    start_pos = None
    for chunk, chunk_start in chunks:
        chunk_end = offset + len(chunk)
        if start_pos is None and offset <= start < chunk_end:
            start_pos = chunk_start + (start - offset)
            if zero_width:
                return start_pos, start_pos
        if start_pos is not None and offset <= after - 1 < chunk_end:
            # The chunk holding the last matched character fixes the end
            return start_pos, chunk_start + (after - offset)
        offset = chunk_end
        if offset > after:
            break  # the next chunk starts after end

    if zero_width and chunks:
        # Only reachable for a zero-width match at the very end of the
        # joined text, report it just after the last chunk.
        chunk, chunk_start = chunks[-1]
        end_of_text = chunk_start + len(chunk)
        return end_of_text, end_of_text
    return -1, -1