mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Command line interface to filesystem matcher
Also fix a couple of bugs in the matcher algorithms
This commit is contained in:
parent
09be666ea0
commit
bd88666bb0
@ -158,10 +158,9 @@ static void convert_positions(int32_t *positions, int32_t *final_positions, UCha
|
|||||||
// The positions array stores character positions as byte offsets in string, convert them into character offsets
|
// The positions array stores character positions as byte offsets in string, convert them into character offsets
|
||||||
int32_t i, *end;
|
int32_t i, *end;
|
||||||
|
|
||||||
if (score == 0.0) {
|
if (score == 0.0) { for (i = 0; i < char_len; i++) final_positions[i] = -1; return; }
|
||||||
for (i = 0; i < char_len; i++) final_positions[i] = -1;
|
|
||||||
return;
|
if (char_len == byte_len) { memcpy(final_positions, positions, sizeof(*positions) * char_len); return; }
|
||||||
}
|
|
||||||
|
|
||||||
end = final_positions + char_len;
|
end = final_positions + char_len;
|
||||||
for (i = 0; i < byte_len && final_positions < end; i++) {
|
for (i = 0; i < byte_len && final_positions < end; i++) {
|
||||||
@ -293,16 +292,14 @@ static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UCh
|
|||||||
if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;}
|
if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;}
|
||||||
|
|
||||||
for (i = 0; i < (int32_t)item_count; i++) {
|
for (i = 0; i < (int32_t)item_count; i++) {
|
||||||
for (r = 0; r < needle_len; r++) {
|
for (r = 0; r < needle_len; r++) positions[r] = -1;
|
||||||
positions[r] = -1;
|
|
||||||
}
|
|
||||||
stack_clear(&stack);
|
stack_clear(&stack);
|
||||||
clear_memory(memo, needle_len, matches[i].haystack_len);
|
clear_memory(memo, needle_len, matches[i].haystack_len);
|
||||||
free_searches(searches, needle_len);
|
free_searches(searches, needle_len);
|
||||||
if (!create_searches(searches, matches[i].haystack, matches[i].haystack_len, needle, needle_len, collator)) goto end;
|
if (!create_searches(searches, matches[i].haystack, matches[i].haystack_len, needle, needle_len, collator)) goto end;
|
||||||
matches[i].memo = memo;
|
matches[i].memo = memo;
|
||||||
match_results[i].score = process_item(&matches[i], &stack, positions, searches);
|
match_results[i].score = process_item(&matches[i], &stack, positions, searches);
|
||||||
convert_positions(positions, final_positions + i, matches[i].haystack, needle_char_len, needle_len, match_results[i].score);
|
convert_positions(positions, final_positions + i * needle_char_len, matches[i].haystack, needle_char_len, needle_len, match_results[i].score);
|
||||||
}
|
}
|
||||||
|
|
||||||
ok = TRUE;
|
ok = TRUE;
|
||||||
@ -430,7 +427,7 @@ Matcher_calculate_scores(Matcher *self, PyObject *args) {
|
|||||||
score = PyFloat_FromDouble(matches[i].score);
|
score = PyFloat_FromDouble(matches[i].score);
|
||||||
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
|
PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
|
||||||
p = final_positions + i;
|
p = final_positions + (i * needle_char_len);
|
||||||
for (j = 0; j < needle_char_len; j++) {
|
for (j = 0; j < needle_char_len; j++) {
|
||||||
score = PyInt_FromLong((long)p[j]);
|
score = PyInt_FromLong((long)p[j]);
|
||||||
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
|
@ -6,17 +6,20 @@ from __future__ import (unicode_literals, division, absolute_import,
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import atexit
|
import atexit, os, sys
|
||||||
from math import ceil
|
from math import ceil
|
||||||
from unicodedata import normalize
|
from unicodedata import normalize
|
||||||
from threading import Thread, Lock
|
from threading import Thread, Lock
|
||||||
from Queue import Queue
|
from Queue import Queue
|
||||||
|
from operator import itemgetter
|
||||||
|
from collections import OrderedDict
|
||||||
|
from itertools import islice
|
||||||
|
|
||||||
from itertools import izip
|
from itertools import izip
|
||||||
from future_builtins import map
|
from future_builtins import map
|
||||||
|
|
||||||
from calibre import detect_ncpus as cpu_count
|
from calibre import detect_ncpus as cpu_count, as_unicode
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins, filesystem_encoding
|
||||||
from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
|
from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
|
||||||
|
|
||||||
DEFAULT_LEVEL1 = '/'
|
DEFAULT_LEVEL1 = '/'
|
||||||
@ -38,35 +41,35 @@ class Worker(Thread):
|
|||||||
if x is None:
|
if x is None:
|
||||||
break
|
break
|
||||||
try:
|
try:
|
||||||
self.results.put((True, self.process_query(*x)))
|
i, scorer, query = x
|
||||||
except:
|
self.results.put((True, (i, scorer(query))))
|
||||||
import traceback
|
except Exception as e:
|
||||||
self.results.put((False, traceback.format_exc()))
|
self.results.put((False, as_unicode(e)))
|
||||||
|
# import traceback
|
||||||
|
# traceback.print_exc()
|
||||||
wlock = Lock()
|
wlock = Lock()
|
||||||
workers = []
|
workers = []
|
||||||
|
|
||||||
def split(tasks, pool_size):
|
def split(tasks, pool_size):
|
||||||
'''
|
'''
|
||||||
Split a list into a list of sub lists, with the number of sub lists being
|
Split a list into a list of sub lists, with the number of sub lists being
|
||||||
no more than the number of workers this server supports. Each sublist contains
|
no more than pool_size. Each sublist contains
|
||||||
2-tuples of the form (i, x) where x is an element from the original list
|
2-tuples of the form (i, x) where x is an element from the original list
|
||||||
and i is the index of the element x in the original list.
|
and i is the index of the element x in the original list.
|
||||||
'''
|
'''
|
||||||
ans, count, pos = [], 0, 0
|
ans, count = [], 0
|
||||||
delta = int(ceil(len(tasks)/pool_size))
|
delta = int(ceil(len(tasks)/pool_size))
|
||||||
while count < len(tasks):
|
while tasks:
|
||||||
section = []
|
section = [(count+i, task) for i, task in enumerate(tasks[:delta])]
|
||||||
for t in tasks[pos:pos+delta]:
|
tasks = tasks[delta:]
|
||||||
section.append((count, t))
|
count += len(section)
|
||||||
count += 1
|
|
||||||
ans.append(section)
|
ans.append(section)
|
||||||
pos += delta
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
class Matcher(object):
|
class Matcher(object):
|
||||||
|
|
||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3, scorer=None):
|
||||||
with wlock:
|
with wlock:
|
||||||
if not workers:
|
if not workers:
|
||||||
requests, results = Queue(), Queue()
|
requests, results = Queue(), Queue()
|
||||||
@ -75,12 +78,57 @@ class Matcher(object):
|
|||||||
workers.extend(w)
|
workers.extend(w)
|
||||||
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
||||||
self.items = items = tuple(items)
|
self.items = items = tuple(items)
|
||||||
self.sort_keys = tuple(map(primary_sort_key, items))
|
tasks = split(items, len(workers))
|
||||||
|
self.task_maps = [{j:i for j, (i, _) in enumerate(task)} for task in tasks]
|
||||||
|
scorer = scorer or CScorer
|
||||||
|
self.scorers = [scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks]
|
||||||
|
self.sort_keys = None
|
||||||
|
|
||||||
def __call__(self, query):
|
def __call__(self, query):
|
||||||
query = normalize('NFC', unicode(query)).encode('utf-8')
|
query = normalize('NFC', unicode(query))
|
||||||
return map(lambda x:x.decode('utf-8'), self.m.get_matches(query))
|
with wlock:
|
||||||
|
for i, scorer in enumerate(self.scorers):
|
||||||
|
workers[0].requests.put((i, scorer, query))
|
||||||
|
if self.sort_keys is None:
|
||||||
|
self.sort_keys = {i:primary_sort_key(x) for i, x in enumerate(self.items)}
|
||||||
|
num = len(self.task_maps)
|
||||||
|
scores, positions = {}, {}
|
||||||
|
error = None
|
||||||
|
while num > 0:
|
||||||
|
ok, x = workers[0].results.get()
|
||||||
|
num -= 1
|
||||||
|
if ok:
|
||||||
|
task_num, vals = x
|
||||||
|
task_map = self.task_maps[task_num]
|
||||||
|
for i, (score, pos) in enumerate(vals):
|
||||||
|
item = task_map[i]
|
||||||
|
scores[item] = score
|
||||||
|
positions[item] = pos
|
||||||
|
else:
|
||||||
|
error = x
|
||||||
|
|
||||||
|
if error is not None:
|
||||||
|
raise Exception('Failed to score items: %s' % error)
|
||||||
|
items = sorted(((-scores[i], item, positions[i]) for i, item in enumerate(self.items)),
|
||||||
|
key=itemgetter(0))
|
||||||
|
return OrderedDict(x[1:] for x in items)
|
||||||
|
|
||||||
|
def get_items_from_dir(basedir):
|
||||||
|
if isinstance(basedir, bytes):
|
||||||
|
basedir = basedir.decode(filesystem_encoding)
|
||||||
|
relsep = os.sep != '/'
|
||||||
|
for dirpath, dirnames, filenames in os.walk(basedir):
|
||||||
|
for f in filenames:
|
||||||
|
x = os.path.join(dirpath, f)
|
||||||
|
x = os.path.relpath(x, basedir)
|
||||||
|
if relsep:
|
||||||
|
x = x.replace(os.sep, '/')
|
||||||
|
yield x
|
||||||
|
|
||||||
|
class FilesystemMatcher(Matcher):
|
||||||
|
|
||||||
|
def __init__(self, basedir, *args, **kwargs):
|
||||||
|
Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
|
||||||
|
|
||||||
def calc_score_for_char(ctx, prev, current, distance):
|
def calc_score_for_char(ctx, prev, current, distance):
|
||||||
factor = 1.0
|
factor = 1.0
|
||||||
@ -112,10 +160,11 @@ def process_item(ctx, haystack, needle):
|
|||||||
if (len(haystack) - hidx < len(needle) - i):
|
if (len(haystack) - hidx < len(needle) - i):
|
||||||
score = 0
|
score = 0
|
||||||
break
|
break
|
||||||
pos = primary_find(n, haystack[hidx:])[0] + hidx
|
pos = primary_find(n, haystack[hidx:])[0]
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
score = 0
|
score = 0
|
||||||
break
|
break
|
||||||
|
pos += hidx
|
||||||
|
|
||||||
distance = pos - last_idx
|
distance = pos - last_idx
|
||||||
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance)
|
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance)
|
||||||
@ -137,7 +186,7 @@ class PyScorer(object):
|
|||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
||||||
self.level1, self.level2, self.level3 = level1, level2, level3
|
self.level1, self.level2, self.level3 = level1, level2, level3
|
||||||
self.max_score_per_char = 0
|
self.max_score_per_char = 0
|
||||||
self.items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
self.items = items
|
||||||
|
|
||||||
def __call__(self, needle):
|
def __call__(self, needle):
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
@ -148,7 +197,6 @@ class PyScorer(object):
|
|||||||
class CScorer(object):
|
class CScorer(object):
|
||||||
|
|
||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
||||||
items = tuple(map(lambda x: normalize('NFC', unicode(x)), filter(None, items)))
|
|
||||||
|
|
||||||
speedup, err = plugins['matcher']
|
speedup, err = plugins['matcher']
|
||||||
if speedup is None:
|
if speedup is None:
|
||||||
@ -156,23 +204,32 @@ class CScorer(object):
|
|||||||
self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3))
|
self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3))
|
||||||
|
|
||||||
def __call__(self, query):
|
def __call__(self, query):
|
||||||
query = normalize('NFC', unicode(query))
|
|
||||||
scores, positions = self.m.calculate_scores(query)
|
scores, positions = self.m.calculate_scores(query)
|
||||||
for score, pos in izip(scores, positions):
|
for score, pos in izip(scores, positions):
|
||||||
yield score, pos
|
yield score, pos
|
||||||
|
|
||||||
def test():
|
def test2():
|
||||||
items = ['m1mn34o/mno']
|
items = ['.driveinfo.calibre', 'Suspense.xls', 'p/parsed/content.opf', 'ns.html']
|
||||||
s = PyScorer(items)
|
for q in (PyScorer, CScorer):
|
||||||
c = CScorer(items)
|
|
||||||
for q in (s, c):
|
|
||||||
print (q)
|
print (q)
|
||||||
for item, (score, positions) in izip(items, q('MNO')):
|
m = Matcher(items, scorer=q)
|
||||||
print (item, score, positions)
|
for item, positions in m('ns').iteritems():
|
||||||
|
print ('\tns', item, positions)
|
||||||
|
|
||||||
|
def test():
|
||||||
|
items = ['m1mn34o/mno', 'xxx/XXX', 'mxnxox']
|
||||||
|
for q in (PyScorer, CScorer):
|
||||||
|
print (q)
|
||||||
|
m = Matcher(items, scorer=q)
|
||||||
|
for item, positions in m('MNO').iteritems():
|
||||||
|
print ('\tMNO', item, positions)
|
||||||
|
for item, positions in m('xxx').iteritems():
|
||||||
|
print ('\txxx', item, positions)
|
||||||
|
|
||||||
def test_mem():
|
def test_mem():
|
||||||
from calibre.utils.mem import gc_histogram, diff_hists
|
from calibre.utils.mem import gc_histogram, diff_hists
|
||||||
m = Matcher([])
|
m = Matcher(['a'])
|
||||||
|
m('a')
|
||||||
del m
|
del m
|
||||||
def doit(c):
|
def doit(c):
|
||||||
m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',])
|
m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',])
|
||||||
@ -182,12 +239,42 @@ def test_mem():
|
|||||||
h1 = gc_histogram()
|
h1 = gc_histogram()
|
||||||
for i in xrange(100):
|
for i in xrange(100):
|
||||||
doit(str(i))
|
doit(str(i))
|
||||||
|
gc.collect()
|
||||||
h2 = gc_histogram()
|
h2 = gc_histogram()
|
||||||
diff_hists(h1, h2)
|
diff_hists(h1, h2)
|
||||||
|
|
||||||
|
def main(basedir=None, query=None):
|
||||||
|
from calibre import prints
|
||||||
|
from calibre.utils.terminal import ColoredStream
|
||||||
|
if basedir is None:
|
||||||
|
try:
|
||||||
|
basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()).decode(sys.stdin.encoding).strip() or os.getcwdu()
|
||||||
|
except (EOFError, KeyboardInterrupt):
|
||||||
|
return
|
||||||
|
m = FilesystemMatcher(basedir)
|
||||||
|
emph = ColoredStream(sys.stdout, fg='red', bold=True)
|
||||||
|
while True:
|
||||||
|
if query is None:
|
||||||
|
try:
|
||||||
|
query = raw_input('Enter query: ').decode(sys.stdin.encoding)
|
||||||
|
except (EOFError, KeyboardInterrupt):
|
||||||
|
break
|
||||||
|
if not query:
|
||||||
|
break
|
||||||
|
for path, positions in islice(m(query).iteritems(), 0, 10):
|
||||||
|
positions = list(positions)
|
||||||
|
p = 0
|
||||||
|
while positions:
|
||||||
|
pos = positions.pop(0)
|
||||||
|
if pos == -1:
|
||||||
|
break
|
||||||
|
prints(path[p:pos], end='')
|
||||||
|
with emph:
|
||||||
|
prints(path[pos], end='')
|
||||||
|
p = pos + 1
|
||||||
|
prints(path[p:])
|
||||||
|
query = None
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
test()
|
# main(basedir='/t', query='ns')
|
||||||
# m = Matcher(['image/one.png', 'image/two.gif', 'text/one.html'])
|
main()
|
||||||
# for q in ('one', 'ONE', 'ton', 'imo'):
|
|
||||||
# print (q, '->', tuple(m(q)))
|
|
||||||
# test_mem()
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user