Correctly handle match positions for non-BMP characters on narrow Python builds

2025-07-09 03:04:10 -04:00 · 2014-03-09 16:44:50 +05:30 · 2014-03-09 16:44:50 +05:30 · bdbc6ccfaa
commit bdbc6ccfaa
parent b4e2b9e93f
3 changed files with 24 additions and 15 deletions
--- a/src/calibre/gui2/tweak_book/widgets.py
+++ b/src/calibre/gui2/tweak_book/widgets.py
@ -17,6 +17,7 @@ from PyQt4.Qt import (
 from calibre import prepare_string_for_xml
 from calibre.gui2 import error_dialog, choose_files, choose_save_file
 from calibre.gui2.tweak_book import tprefs
 from calibre.utils.matcher import get_char, Matcher
 class Dialog(QDialog):
@ -309,7 +310,8 @@ class Results(QWidget):
        positions = sorted(set(positions) - {-1}, reverse=True)
        text = prepare_string_for_xml(text)
        for p in positions:
-            text = '%s<span style="%s">%s</span>%s' % (text[:p], self.EMPH, text[p], text[p+1:])
+            ch = get_char(text, p)
            text = '%s<span style="%s">%s</span>%s' % (text[:p], self.EMPH, ch, text[p+len(ch):])
        text = QStaticText(text)
        text.setTextFormat(Qt.RichText)
        return text
@ -363,7 +365,6 @@ class Results(QWidget):
 class QuickOpen(Dialog):
    def __init__(self, items, parent=None):
        from calibre.utils.matcher import Matcher
        self.matcher = Matcher(items)
        self.matches = ()
        self.selected_result = None
--- a/src/calibre/utils/matcher.c
+++ b/src/calibre/utils/matcher.c
@ -155,6 +155,10 @@ static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, i
 }
 static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
 #if PY_VERSION_HEX >= 0x03030000 
 #error Not implemented for python >= 3.3
 #endif
    // The positions array stores character positions as byte offsets in string, convert them into character offsets
    int32_t i, *end;
@ -163,7 +167,11 @@ static void convert_positions(int32_t *positions, int32_t *final_positions, UCha
    end = final_positions + char_len;
    for (i = 0; i < byte_len && final_positions < end; i++) {
        if (positions[i] == -1) continue;
 #ifdef Py_UNICODE_WIDE
        *final_positions = u_countChar32(string, positions[i]);
 #else
        *final_positions = positions[i];
 #endif
        final_positions += 1;
    }
 }
--- a/src/calibre/utils/matcher.py
+++ b/src/calibre/utils/matcher.py
@ -139,6 +139,7 @@ class FilesystemMatcher(Matcher):
    def __init__(self, basedir, *args, **kwargs):
        Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
 # Python implementation of the scoring algorithm {{{
 def calc_score_for_char(ctx, prev, current, distance):
    factor = 1.0
    ans = ctx.max_score_per_char
@ -202,11 +203,11 @@ class PyScorer(object):
            self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0
            self.memory = {}
            yield process_item(self, item, needle)
 # }}}
 class CScorer(object):
    def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
        speedup, err = plugins['matcher']
        if speedup is None:
            raise PluginFailed('Failed to load the matcher plugin with error: %s' % err)
@ -217,14 +218,6 @@ class CScorer(object):
        for score, pos in izip(scores, positions):
            yield score, pos
 def test2():
    items = ['.driveinfo.calibre', 'Suspense.xls', 'p/parsed/content.opf', 'ns.html']
    for q in (PyScorer, CScorer):
        print (q)
        m = Matcher(items, scorer=q)
        for item, positions in m('ns').iteritems():
            print ('\tns', item, positions)
 def test():
    items = ['mx\U0001f431nxox']
    for q in (PyScorer, CScorer):
@ -237,7 +230,6 @@ def test():
                    print (item[p], end=' ')
                print ()
 def test_mem():
    from calibre.utils.mem import gc_histogram, diff_hists
    m = Matcher(['a'])
@ -255,6 +247,13 @@ def test_mem():
    h2 = gc_histogram()
    diff_hists(h1, h2)
 if sys.maxunicode >= 0x10ffff:
    # Wide build: every code point, including non-BMP ones, occupies exactly
    # one index, so plain indexing already yields the full character.
    def get_char(string, pos):
        """Return the character that starts at index ``pos`` of ``string``."""
        return string[pos]
 else:
    def get_char(string, pos):
        """Return the character that starts at index ``pos`` of ``string``.

        On narrow builds a non-BMP character is stored as a UTF-16 surrogate
        pair, so the result is two code units long when ``pos`` points at the
        lead surrogate of a well-formed pair, and one code unit otherwise.
        Callers use ``len()`` of the result to advance past the character.
        """
        ch = string[pos]
        if '\ud800' <= ch <= '\udbff':  # high (lead) surrogate
            pair = string[pos:pos+2]
            # Only consume the next code unit when it really is a low (trail)
            # surrogate; an unpaired lead surrogate must not swallow the
            # ordinary character that follows it.
            if len(pair) == 2 and '\udc00' <= pair[1] <= '\udfff':
                return pair
        return ch
 def main(basedir=None, query=None):
    from calibre import prints
    from calibre.utils.terminal import ColoredStream
@ -279,11 +278,12 @@ def main(basedir=None, query=None):
            while positions:
                pos = positions.pop(0)
                if pos == -1:
-                    break
+                    continue
                prints(path[p:pos], end='')
                ch = get_char(path, pos)
                with emph:
-                    prints(path[pos], end='')
+                    prints(ch, end='')
-                p = pos + 1
+                p = pos + len(ch)
            prints(path[p:])
        query = None