Correctly handle match positions for non-BMP characters on narrow Python builds

This commit is contained in:
Kovid Goyal 2014-03-09 16:44:50 +05:30
parent b4e2b9e93f
commit bdbc6ccfaa
3 changed files with 24 additions and 15 deletions

View File

@ -17,6 +17,7 @@ from PyQt4.Qt import (
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.gui2 import error_dialog, choose_files, choose_save_file from calibre.gui2 import error_dialog, choose_files, choose_save_file
from calibre.gui2.tweak_book import tprefs from calibre.gui2.tweak_book import tprefs
from calibre.utils.matcher import get_char, Matcher
class Dialog(QDialog): class Dialog(QDialog):
@ -309,7 +310,8 @@ class Results(QWidget):
positions = sorted(set(positions) - {-1}, reverse=True) positions = sorted(set(positions) - {-1}, reverse=True)
text = prepare_string_for_xml(text) text = prepare_string_for_xml(text)
for p in positions: for p in positions:
text = '%s<span style="%s">%s</span>%s' % (text[:p], self.EMPH, text[p], text[p+1:]) ch = get_char(text, p)
text = '%s<span style="%s">%s</span>%s' % (text[:p], self.EMPH, ch, text[p+len(ch):])
text = QStaticText(text) text = QStaticText(text)
text.setTextFormat(Qt.RichText) text.setTextFormat(Qt.RichText)
return text return text
@ -363,7 +365,6 @@ class Results(QWidget):
class QuickOpen(Dialog): class QuickOpen(Dialog):
def __init__(self, items, parent=None): def __init__(self, items, parent=None):
from calibre.utils.matcher import Matcher
self.matcher = Matcher(items) self.matcher = Matcher(items)
self.matches = () self.matches = ()
self.selected_result = None self.selected_result = None

View File

@ -155,6 +155,10 @@ static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, i
} }
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) { static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
// The positions array stores character positions as byte offsets in string, convert them into character offsets // The positions array stores character positions as byte offsets in string, convert them into character offsets
int32_t i, *end; int32_t i, *end;
@ -163,7 +167,11 @@ static void convert_positions(int32_t *positions, int32_t *final_positions, UCha
end = final_positions + char_len; end = final_positions + char_len;
for (i = 0; i < byte_len && final_positions < end; i++) { for (i = 0; i < byte_len && final_positions < end; i++) {
if (positions[i] == -1) continue; if (positions[i] == -1) continue;
#ifdef Py_UNICODE_WIDE
*final_positions = u_countChar32(string, positions[i]); *final_positions = u_countChar32(string, positions[i]);
#else
*final_positions = positions[i];
#endif
final_positions += 1; final_positions += 1;
} }
} }

View File

@ -139,6 +139,7 @@ class FilesystemMatcher(Matcher):
def __init__(self, basedir, *args, **kwargs): def __init__(self, basedir, *args, **kwargs):
Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs) Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
# Python implementation of the scoring algorithm {{{
def calc_score_for_char(ctx, prev, current, distance): def calc_score_for_char(ctx, prev, current, distance):
factor = 1.0 factor = 1.0
ans = ctx.max_score_per_char ans = ctx.max_score_per_char
@ -202,11 +203,11 @@ class PyScorer(object):
self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0 self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0
self.memory = {} self.memory = {}
yield process_item(self, item, needle) yield process_item(self, item, needle)
# }}}
class CScorer(object): class CScorer(object):
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
speedup, err = plugins['matcher'] speedup, err = plugins['matcher']
if speedup is None: if speedup is None:
raise PluginFailed('Failed to load the matcher plugin with error: %s' % err) raise PluginFailed('Failed to load the matcher plugin with error: %s' % err)
@ -217,14 +218,6 @@ class CScorer(object):
for score, pos in izip(scores, positions): for score, pos in izip(scores, positions):
yield score, pos yield score, pos
def test2():
    """Print the positions each scorer reports for the query 'ns'.

    Runs the same small list of file names through a Matcher built with
    PyScorer and then with CScorer, so the two outputs can be compared
    by eye.
    """
    items = ['.driveinfo.calibre', 'Suspense.xls', 'p/parsed/content.opf', 'ns.html']
    for scorer_class in (PyScorer, CScorer):
        print (scorer_class)
        matcher = Matcher(items, scorer=scorer_class)
        results = matcher('ns')
        for item, positions in results.iteritems():
            print ('\tns', item, positions)
def test(): def test():
items = ['mx\U0001f431nxox'] items = ['mx\U0001f431nxox']
for q in (PyScorer, CScorer): for q in (PyScorer, CScorer):
@ -237,7 +230,6 @@ def test():
print (item[p], end=' ') print (item[p], end=' ')
print () print ()
def test_mem(): def test_mem():
from calibre.utils.mem import gc_histogram, diff_hists from calibre.utils.mem import gc_histogram, diff_hists
m = Matcher(['a']) m = Matcher(['a'])
@ -255,6 +247,13 @@ def test_mem():
h2 = gc_histogram() h2 = gc_histogram()
diff_hists(h1, h2) diff_hists(h1, h2)
def _get_char_narrow(string, pos):
    """Return the full character starting at index ``pos`` of ``string``.

    Intended for narrow python builds, where a character outside the BMP
    is stored as a UTF-16 surrogate pair occupying two indices. The
    result has length 2 only when ``string[pos]`` is a high surrogate
    that is immediately followed by a low surrogate; an unpaired high
    surrogate is returned on its own so it cannot swallow the ordinary
    character that follows it.

    Raises IndexError if ``pos`` is out of range, matching plain indexing.
    """
    ch = string[pos]  # plain indexing so a bad pos raises, as on wide builds
    if '\ud800' <= ch <= '\udbff':
        # Slice (not index) so a high surrogate at the very end is safe
        trail = string[pos + 1:pos + 2]
        if trail and '\udc00' <= trail <= '\udfff':
            return ch + trail
    return ch


if sys.maxunicode >= 0x10ffff:
    def get_char(string, pos):
        """Return the character at index ``pos`` of ``string``.

        On wide builds every character occupies exactly one index, so
        this is plain indexing.
        """
        return string[pos]
else:
    # Narrow build: characters outside the BMP span two indices
    get_char = _get_char_narrow
def main(basedir=None, query=None): def main(basedir=None, query=None):
from calibre import prints from calibre import prints
from calibre.utils.terminal import ColoredStream from calibre.utils.terminal import ColoredStream
@ -279,11 +278,12 @@ def main(basedir=None, query=None):
while positions: while positions:
pos = positions.pop(0) pos = positions.pop(0)
if pos == -1: if pos == -1:
break continue
prints(path[p:pos], end='') prints(path[p:pos], end='')
ch = get_char(path, pos)
with emph: with emph:
prints(path[pos], end='') prints(ch, end='')
p = pos + 1 p = pos + len(ch)
prints(path[p:]) prints(path[p:])
query = None query = None