Command line interface to filesystem matcher

Also fix a couple of bugs in the matcher algorithms
This commit is contained in:
Kovid Goyal 2014-03-08 20:55:20 +05:30
parent 09be666ea0
commit bd88666bb0
2 changed files with 129 additions and 45 deletions

View File

@ -158,10 +158,9 @@ static void convert_positions(int32_t *positions, int32_t *final_positions, UCha
// The positions array stores character positions as byte offsets in string, convert them into character offsets // The positions array stores character positions as byte offsets in string, convert them into character offsets
int32_t i, *end; int32_t i, *end;
if (score == 0.0) { if (score == 0.0) { for (i = 0; i < char_len; i++) final_positions[i] = -1; return; }
for (i = 0; i < char_len; i++) final_positions[i] = -1;
return; if (char_len == byte_len) { memcpy(final_positions, positions, sizeof(*positions) * char_len); return; }
}
end = final_positions + char_len; end = final_positions + char_len;
for (i = 0; i < byte_len && final_positions < end; i++) { for (i = 0; i < byte_len && final_positions < end; i++) {
@ -293,16 +292,14 @@ static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UCh
if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;} if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;}
for (i = 0; i < (int32_t)item_count; i++) { for (i = 0; i < (int32_t)item_count; i++) {
for (r = 0; r < needle_len; r++) { for (r = 0; r < needle_len; r++) positions[r] = -1;
positions[r] = -1;
}
stack_clear(&stack); stack_clear(&stack);
clear_memory(memo, needle_len, matches[i].haystack_len); clear_memory(memo, needle_len, matches[i].haystack_len);
free_searches(searches, needle_len); free_searches(searches, needle_len);
if (!create_searches(searches, matches[i].haystack, matches[i].haystack_len, needle, needle_len, collator)) goto end; if (!create_searches(searches, matches[i].haystack, matches[i].haystack_len, needle, needle_len, collator)) goto end;
matches[i].memo = memo; matches[i].memo = memo;
match_results[i].score = process_item(&matches[i], &stack, positions, searches); match_results[i].score = process_item(&matches[i], &stack, positions, searches);
convert_positions(positions, final_positions + i, matches[i].haystack, needle_char_len, needle_len, match_results[i].score); convert_positions(positions, final_positions + i * needle_char_len, matches[i].haystack, needle_char_len, needle_len, match_results[i].score);
} }
ok = TRUE; ok = TRUE;
@ -430,7 +427,7 @@ Matcher_calculate_scores(Matcher *self, PyObject *args) {
score = PyFloat_FromDouble(matches[i].score); score = PyFloat_FromDouble(matches[i].score);
if (score == NULL) { PyErr_NoMemory(); goto end; } if (score == NULL) { PyErr_NoMemory(); goto end; }
PyTuple_SET_ITEM(items, (Py_ssize_t)i, score); PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
p = final_positions + i; p = final_positions + (i * needle_char_len);
for (j = 0; j < needle_char_len; j++) { for (j = 0; j < needle_char_len; j++) {
score = PyInt_FromLong((long)p[j]); score = PyInt_FromLong((long)p[j]);
if (score == NULL) { PyErr_NoMemory(); goto end; } if (score == NULL) { PyErr_NoMemory(); goto end; }

View File

@ -6,17 +6,20 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import atexit import atexit, os, sys
from math import ceil from math import ceil
from unicodedata import normalize from unicodedata import normalize
from threading import Thread, Lock from threading import Thread, Lock
from Queue import Queue from Queue import Queue
from operator import itemgetter
from collections import OrderedDict
from itertools import islice
from itertools import izip from itertools import izip
from future_builtins import map from future_builtins import map
from calibre import detect_ncpus as cpu_count from calibre import detect_ncpus as cpu_count, as_unicode
from calibre.constants import plugins from calibre.constants import plugins, filesystem_encoding
from calibre.utils.icu import primary_sort_key, primary_find, primary_collator from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
DEFAULT_LEVEL1 = '/' DEFAULT_LEVEL1 = '/'
@ -38,35 +41,35 @@ class Worker(Thread):
if x is None: if x is None:
break break
try: try:
self.results.put((True, self.process_query(*x))) i, scorer, query = x
except: self.results.put((True, (i, scorer(query))))
import traceback except Exception as e:
self.results.put((False, traceback.format_exc())) self.results.put((False, as_unicode(e)))
# import traceback
# traceback.print_exc()
wlock = Lock() wlock = Lock()
workers = [] workers = []
def split(tasks, pool_size):
    '''
    Split a sequence into consecutive chunks, producing no more than
    pool_size chunks.

    :param tasks: A sliceable sequence (list or tuple) of work items.
    :param pool_size: The maximum number of chunks to produce. Must be a
        positive integer.
    :return: A list of lists. Each inner list contains 2-tuples of the form
        (i, x) where x is an element of tasks and i is the index of x in
        tasks. An empty tasks sequence yields an empty list.
    :raises ValueError: If pool_size is negative.
    :raises ZeroDivisionError: If pool_size is zero.
    '''
    if pool_size < 0:
        # A negative chunk size can never consume the sequence, fail fast
        # instead of looping forever.
        raise ValueError('pool_size must be a positive integer')
    total = len(tasks)
    # Ceiling division using only integer arithmetic, so the result does not
    # depend on true division being enabled in the enclosing module.
    chunk_size = -(-total // pool_size)
    if chunk_size < 1:
        return []
    # Stepped slices copy every element exactly once; enumerate() is started
    # at the chunk offset so each element keeps its index in the original.
    return [
        list(enumerate(tasks[start:start + chunk_size], start))
        for start in range(0, total, chunk_size)
    ]
class Matcher(object): class Matcher(object):
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3, scorer=None):
with wlock: with wlock:
if not workers: if not workers:
requests, results = Queue(), Queue() requests, results = Queue(), Queue()
@ -75,12 +78,57 @@ class Matcher(object):
workers.extend(w) workers.extend(w)
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
self.items = items = tuple(items) self.items = items = tuple(items)
self.sort_keys = tuple(map(primary_sort_key, items)) tasks = split(items, len(workers))
self.task_maps = [{j:i for j, (i, _) in enumerate(task)} for task in tasks]
scorer = scorer or CScorer
self.scorers = [scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks]
self.sort_keys = None
def __call__(self, query):
    '''
    Score every item against query and return the items ordered from best
    to worst match.

    :param query: The text to match; it is converted to unicode and
        normalized to NFC before scoring.
    :return: An OrderedDict mapping each item to its match positions, in
        order of decreasing score. Items with equal scores keep the
        order they have in self.items, because the sort is stable.
    :raises Exception: If any worker reported a failure while scoring.
    '''
    query = normalize('NFC', unicode(query))
    with wlock:
        # Queue one scoring request per scorer (one scorer per task chunk).
        for i, scorer in enumerate(self.scorers):
            workers[0].requests.put((i, scorer, query))
        # Build the sort keys lazily, while the workers are busy scoring.
        if self.sort_keys is None:
            self.sort_keys = {i:primary_sort_key(x) for i, x in enumerate(self.items)}
        scores, positions = {}, {}
        error = None
        # Always collect exactly one result per request, even after a
        # failure, so that no stale results are left on the queue.
        for _ in range(len(self.task_maps)):
            ok, x = workers[0].results.get()
            if not ok:
                error = x
                continue
            task_num, vals = x
            # Maps an index inside the chunk back to the index in self.items
            task_map = self.task_maps[task_num]
            for i, (score, pos) in enumerate(vals):
                item = task_map[i]
                scores[item] = score
                positions[item] = pos

    if error is not None:
        raise Exception('Failed to score items: %s' % error)
    # Negating the score makes the ascending sort put the best match first
    items = sorted(((-scores[i], item, positions[i]) for i, item in enumerate(self.items)),
                   key=itemgetter(0))
    return OrderedDict(x[1:] for x in items)
def get_items_from_dir(basedir):
    '''
    Yield the path of every file found under basedir. Paths are relative to
    basedir and always use '/' as the separator, whatever the platform.

    :param basedir: The directory to scan. A bytes value is decoded with
        filesystem_encoding first.
    :return: A generator of relative file paths. Directories themselves are
        not yielded.
    '''
    if isinstance(basedir, bytes):
        basedir = basedir.decode(filesystem_encoding)
    convert_sep = os.sep != '/'
    for dirpath, dirnames, filenames in os.walk(basedir):
        if not filenames:
            continue
        # The relative location is the same for every file in a directory,
        # so work it out once per directory rather than once per file.
        rel_dir = os.path.relpath(dirpath, basedir)
        if rel_dir == os.curdir:
            # Files directly inside basedir get no directory prefix
            prefix = ''
        else:
            if convert_sep:
                rel_dir = rel_dir.replace(os.sep, '/')
            prefix = rel_dir + '/'
        for f in filenames:
            yield prefix + f
class FilesystemMatcher(Matcher):

    '''
    A Matcher whose items are the relative paths of all files found under a
    directory, as produced by get_items_from_dir().
    '''

    def __init__(self, basedir, *args, **kwargs):
        '''
        :param basedir: The directory to scan for files.

        Any further positional and keyword arguments are passed through
        unchanged to Matcher.__init__.
        '''
        Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
def calc_score_for_char(ctx, prev, current, distance): def calc_score_for_char(ctx, prev, current, distance):
factor = 1.0 factor = 1.0
@ -112,10 +160,11 @@ def process_item(ctx, haystack, needle):
if (len(haystack) - hidx < len(needle) - i): if (len(haystack) - hidx < len(needle) - i):
score = 0 score = 0
break break
pos = primary_find(n, haystack[hidx:])[0] + hidx pos = primary_find(n, haystack[hidx:])[0]
if pos == -1: if pos == -1:
score = 0 score = 0
break break
pos += hidx
distance = pos - last_idx distance = pos - last_idx
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance) score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance)
@ -137,7 +186,7 @@ class PyScorer(object):
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
self.level1, self.level2, self.level3 = level1, level2, level3 self.level1, self.level2, self.level3 = level1, level2, level3
self.max_score_per_char = 0 self.max_score_per_char = 0
self.items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) self.items = items
def __call__(self, needle): def __call__(self, needle):
for item in self.items: for item in self.items:
@ -148,7 +197,6 @@ class PyScorer(object):
class CScorer(object): class CScorer(object):
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
items = tuple(map(lambda x: normalize('NFC', unicode(x)), filter(None, items)))
speedup, err = plugins['matcher'] speedup, err = plugins['matcher']
if speedup is None: if speedup is None:
@ -156,23 +204,32 @@ class CScorer(object):
self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3)) self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3))
def __call__(self, query): def __call__(self, query):
query = normalize('NFC', unicode(query))
scores, positions = self.m.calculate_scores(query) scores, positions = self.m.calculate_scores(query)
for score, pos in izip(scores, positions): for score, pos in izip(scores, positions):
yield score, pos yield score, pos
def test2():
    '''
    Print the match positions that each scorer implementation produces for
    the query 'ns' against a small set of file names.
    '''
    items = ['.driveinfo.calibre', 'Suspense.xls', 'p/parsed/content.opf', 'ns.html']
    for q in (PyScorer, CScorer):
        print (q)
        m = Matcher(items, scorer=q)
        for item, positions in m('ns').iteritems():
            print ('\tns', item, positions)
def test():
    '''
    Print the match positions that each scorer implementation produces for
    the queries 'MNO' and 'xxx' against a small set of items.
    '''
    items = ['m1mn34o/mno', 'xxx/XXX', 'mxnxox']
    for q in (PyScorer, CScorer):
        print (q)
        m = Matcher(items, scorer=q)
        for query in ('MNO', 'xxx'):
            for item, positions in m(query).iteritems():
                print ('\t' + query, item, positions)
def test_mem(): def test_mem():
from calibre.utils.mem import gc_histogram, diff_hists from calibre.utils.mem import gc_histogram, diff_hists
m = Matcher([]) m = Matcher(['a'])
m('a')
del m del m
def doit(c): def doit(c):
m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',]) m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',])
@ -182,12 +239,42 @@ def test_mem():
h1 = gc_histogram() h1 = gc_histogram()
for i in xrange(100): for i in xrange(100):
doit(str(i)) doit(str(i))
gc.collect()
h2 = gc_histogram() h2 = gc_histogram()
diff_hists(h1, h2) diff_hists(h1, h2)
def main(basedir=None, query=None):
    '''
    Interactive command line interface to the filesystem matcher.

    Scans basedir, then repeatedly runs a query against the files found and
    prints the first ten results, with every matched character printed via
    the emphasis stream.

    :param basedir: The directory to scan. If None, the user is prompted for
        one, with the current working directory as the default.
    :param query: The first query to run. If None, the user is prompted.
        After each query has been answered the user is prompted for the
        next one; an empty answer, EOF or Ctrl-C ends the session.
    '''
    from calibre import prints
    from calibre.utils.terminal import ColoredStream
    # sys.stdin.encoding is None when stdin is not a terminal (for example
    # when input is piped in), and decode(None) raises a TypeError.
    input_encoding = sys.stdin.encoding or 'utf-8'
    if basedir is None:
        try:
            basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()).decode(input_encoding).strip() or os.getcwdu()
        except (EOFError, KeyboardInterrupt):
            return
    m = FilesystemMatcher(basedir)
    emph = ColoredStream(sys.stdout, fg='red', bold=True)
    while True:
        if query is None:
            try:
                query = raw_input('Enter query: ').decode(input_encoding)
            except (EOFError, KeyboardInterrupt):
                break
            if not query:
                break
        for path, positions in islice(m(query).iteritems(), 0, 10):
            p = 0  # index of the first character of path not yet printed
            for pos in positions:
                if pos == -1:
                    # -1 marks a query character that has no match position
                    break
                prints(path[p:pos], end='')
                with emph:
                    prints(path[pos], end='')
                p = pos + 1
            prints(path[p:])
        # Forget the answered query so that the next iteration prompts
        query = None
if __name__ == '__main__':
    # For a non-interactive first query use: main(basedir='/t', query='ns')
    main()