Implement text search for HTML files

This commit is contained in:
Kovid Goyal 2016-06-02 17:36:44 +05:30
parent 3c27f28fdb
commit 298b664669
3 changed files with 92 additions and 4 deletions

View File

@ -10,7 +10,7 @@ import sys, re
from operator import itemgetter from operator import itemgetter
from cssutils import parseStyle from cssutils import parseStyle
from PyQt5.Qt import QTextEdit, Qt from PyQt5.Qt import QTextEdit, Qt, QTextCursor
from calibre import prepare_string_for_xml, xml_entity_to_unicode from calibre import prepare_string_for_xml, xml_entity_to_unicode
from calibre.ebooks.oeb.polish.container import OEB_DOCS from calibre.ebooks.oeb.polish.container import OEB_DOCS
@ -672,9 +672,62 @@ class Smarts(NullSmarts):
return 'complete_names', (names_type, doc_name, c.root), query return 'complete_names', (names_type, doc_name, c.root), query
def find_text(self, pat, cursor):
    """Search for ``pat`` in the document text covered by ``cursor``.

    Only text lying outside the boundaries recorded in each block's
    ``userData().tags`` is searched (presumably these mark the ``<`` and
    ``>`` of HTML tags -- confirm against the highlighter). The text is
    collected as ``(text, document_position)`` chunks, clipped to the
    range spanned by the cursor's position and anchor, and handed to
    ``find_text_in_chunks``.

    :param pat: Compiled pattern, passed through to ``find_text_in_chunks``.
    :param cursor: Cursor whose position/anchor delimit the search range.
    :return: ``(found, start, end)``. ``found`` is True only when both
        ``start`` and ``end`` differ from -1.
    """
    from calibre.gui2.tweak_book.text_search import find_text_in_chunks
    chunks = []

    # Work on a copy so the caller's cursor is left untouched. The scan
    # starts at the top of the document because whether we are inside a
    # tag at the start of the range depends on every boundary before it.
    c = QTextCursor(cursor)
    c.setPosition(0)
    in_text = True
    block = c.block()
    cstart = min(cursor.position(), cursor.anchor())
    cend = max(cursor.position(), cursor.anchor())

    def append(text, start):
        # Keep only the part of ``text`` overlapping the search range.
        # Note the character at ``cend`` itself is retained.
        after = start + len(text)
        if start <= cend and cstart < after:
            extra = after - (cend + 1)
            if extra > 0:
                text = text[:-extra]
            extra = cstart - start
            if extra > 0:
                text = text[extra:]
            chunks.append((text, start + max(extra, 0)))

    # Previously the loop condition also required the block to reach past
    # ``cstart``. Since the scan begins at block 0, that made the loop exit
    # immediately whenever the range started after the first block, so
    # nothing was ever searched. Blocks before the range are now walked
    # (to keep ``in_text`` correct) but contribute no text.
    while block.isValid() and block.position() <= cend:
        boundaries = sorted(block.userData().tags, key=get_offset)
        if block.position() + block.length() <= cstart:
            # Entirely before the search range: only the state left behind
            # by the last boundary matters.
            if boundaries:
                in_text = not boundaries[-1].is_start
        elif not boundaries:
            # Add the whole line
            if in_text:
                text = block.text()
                if text:
                    append(text, block.position())
        else:
            start = block.position()
            c.setPosition(start)
            for b in boundaries:
                if in_text:
                    # Select from the end of the previous boundary up to
                    # (not including) this one.
                    c.setPosition(start + b.offset, c.KeepAnchor)
                    if c.hasSelection():
                        append(c.selectedText(), c.anchor())
                in_text = not b.is_start
                c.setPosition(start + b.offset + 1)
            if in_text:
                # Add remaining text in block
                c.setPosition(block.position() + boundaries[-1].offset + 1)
                c.movePosition(c.EndOfBlock, c.KeepAnchor)
                if c.hasSelection():
                    append(c.selectedText(), c.anchor())
        block = block.next()
    s, e = find_text_in_chunks(pat, chunks)
    return s != -1 and e != -1, s, e
if __name__ == '__main__': # {{{ if __name__ == '__main__': # {{{
from calibre.gui2.tweak_book.editor.widget import launch_editor from calibre.gui2.tweak_book.editor.widget import launch_editor
launch_editor('''\ if sys.argv[-1].endswith('.html'):
raw = lopen(sys.argv[-1], 'rb').read().decode('utf-8')
else:
raw = '''\
<!DOCTYPE html> <!DOCTYPE html>
<html xml:lang="en" lang="en"> <html xml:lang="en" lang="en">
<!-- <!--
@ -703,5 +756,9 @@ if __name__ == '__main__': # {{{
<p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p> <p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
</body> </body>
</html> </html>
''', path_is_raw=True, syntax='xml') '''
def callback(ed):
import regex
ed.find_text(regex.compile('A bold word'))
launch_editor(raw, path_is_raw=True, syntax='html', callback=callback)
# }}} # }}}

View File

@ -380,6 +380,7 @@ class TextEdit(PlainTextEdit):
pos = c.End if reverse else c.Start pos = c.End if reverse else c.Start
c.movePosition(pos, c.KeepAnchor) c.movePosition(pos, c.KeepAnchor)
if hasattr(self.smarts, 'find_text'): if hasattr(self.smarts, 'find_text'):
self.highlighter.join()
found, start, end = self.smarts.find_text(pat, c) found, start, end = self.smarts.find_text(pat, c)
if not found: if not found:
return False return False

View File

@ -166,7 +166,7 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
for fname, syntax in files.iteritems(): for fname, syntax in files.iteritems():
ed = editors.get(fname, None) ed = editors.get(fname, None)
if ed is not None: if ed is not None:
if ed.find_text(pat, complete=True, save_match='gui'): if ed.find_text(pat, complete=True):
show_editor(fname) show_editor(fname)
return True return True
else: else:
@ -182,3 +182,33 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
msg = '<p>' + _('No matches were found for %s') % ('<pre style="font-style:italic">' + prepare_string_for_xml(search['find']) + '</pre>') msg = '<p>' + _('No matches were found for %s') % ('<pre style="font-style:italic">' + prepare_string_for_xml(search['find']) + '</pre>')
return error_dialog(gui_parent, _('Not found'), msg, show=True) return error_dialog(gui_parent, _('Not found'), msg, show=True)
def find_text_in_chunks(pat, chunks):
    """Find the first match of ``pat`` in text that is split into chunks.

    The chunk texts are concatenated and searched as one string; the
    match span is then translated back into the coordinate system of the
    chunks.

    :param pat: Compiled pattern object providing ``search()``.
    :param chunks: Sequence of ``(text, start)`` pairs, where ``start`` is
        the position of ``text[0]`` in the original document. It is
        iterated twice, so it must not be a one-shot iterator.
    :return: ``(start_pos, end_pos)`` in document coordinates, with
        ``end_pos`` exclusive, or ``(-1, -1)`` when there is no match.
        A zero-width match yields ``start_pos == end_pos``.
    """
    text = ''.join(x[0] for x in chunks)
    m = pat.search(text)
    if m is None:
        return -1, -1
    start, after = m.span()

    start_pos = None
    last_end = None  # document position just past the last chunk seen
    offset = 0  # index in the joined text where the current chunk begins
    for chunk, chunk_start in chunks:
        chunk_end = offset + len(chunk)
        if start_pos is None and offset <= start < chunk_end:
            start_pos = chunk_start + (start - offset)
        if start_pos is not None:
            if after == start:
                # Zero-width match: report it where it starts, regardless
                # of whether that is the first character of a chunk.
                return start_pos, start_pos
            if offset < after <= chunk_end:
                # This chunk holds the last matched character.
                return start_pos, chunk_start + (after - offset)
        offset = chunk_end
        last_end = chunk_start + len(chunk)
    if start == after == offset and last_end is not None:
        # Zero-width match at the very end of the text: no chunk contains
        # that index, so place it just past the final chunk.
        return last_end, last_end
    return -1, -1