mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
pep8 and don't fail matcher memleak test if used memory for 10 rounds is zero
This commit is contained in:
parent
8c6433d843
commit
dab1bd61e9
@ -1,7 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
@ -38,7 +37,7 @@ class Worker(Thread):
|
||||
def __init__(self, requests, results):
|
||||
Thread.__init__(self)
|
||||
self.requests, self.results = requests, results
|
||||
atexit.register(lambda : requests.put(None))
|
||||
atexit.register(lambda: requests.put(None))
|
||||
|
||||
def run(self):
|
||||
while True:
|
||||
@ -52,6 +51,8 @@ class Worker(Thread):
|
||||
self.results.put((False, as_unicode(e)))
|
||||
# import traceback
|
||||
# traceback.print_exc()
|
||||
|
||||
|
||||
wlock = Lock()
|
||||
workers = []
|
||||
|
||||
@ -64,9 +65,9 @@ def split(tasks, pool_size):
|
||||
and i is the index of the element x in the original list.
|
||||
'''
|
||||
ans, count = [], 0
|
||||
delta = int(ceil(len(tasks)/pool_size))
|
||||
delta = int(ceil(len(tasks) / pool_size))
|
||||
while tasks:
|
||||
section = [(count+i, task) for i, task in enumerate(tasks[:delta])]
|
||||
section = [(count + i, task) for i, task in enumerate(tasks[:delta])]
|
||||
tasks = tasks[delta:]
|
||||
count += len(section)
|
||||
ans.append(section)
|
||||
@ -82,7 +83,14 @@ def default_scorer(*args, **kwargs):
|
||||
|
||||
class Matcher(object):
|
||||
|
||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3, scorer=None):
|
||||
def __init__(
|
||||
self,
|
||||
items,
|
||||
level1=DEFAULT_LEVEL1,
|
||||
level2=DEFAULT_LEVEL2,
|
||||
level3=DEFAULT_LEVEL3,
|
||||
scorer=None
|
||||
):
|
||||
with wlock:
|
||||
if not workers:
|
||||
requests, results = Queue(), Queue()
|
||||
@ -92,9 +100,11 @@ class Matcher(object):
|
||||
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
||||
self.items = items = tuple(items)
|
||||
tasks = split(items, len(workers))
|
||||
self.task_maps = [{j:i for j, (i, _) in enumerate(task)} for task in tasks]
|
||||
self.task_maps = [{j: i for j, (i, _) in enumerate(task)} for task in tasks]
|
||||
scorer = scorer or default_scorer
|
||||
self.scorers = [scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks]
|
||||
self.scorers = [
|
||||
scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks
|
||||
]
|
||||
self.sort_keys = None
|
||||
|
||||
def __call__(self, query, limit=None):
|
||||
@ -103,7 +113,10 @@ class Matcher(object):
|
||||
for i, scorer in enumerate(self.scorers):
|
||||
workers[0].requests.put((i, scorer, query))
|
||||
if self.sort_keys is None:
|
||||
self.sort_keys = {i:primary_sort_key(x) for i, x in enumerate(self.items)}
|
||||
self.sort_keys = {
|
||||
i: primary_sort_key(x)
|
||||
for i, x in enumerate(self.items)
|
||||
}
|
||||
num = len(self.task_maps)
|
||||
scores, positions = {}, {}
|
||||
error = None
|
||||
@ -122,7 +135,8 @@ class Matcher(object):
|
||||
|
||||
if error is not None:
|
||||
raise Exception('Failed to score items: %s' % error)
|
||||
items = sorted(((-scores[i], item, positions[i]) for i, item in enumerate(self.items)),
|
||||
items = sorted(((-scores[i], item, positions[i])
|
||||
for i, item in enumerate(self.items)),
|
||||
key=itemgetter(0))
|
||||
if limit is not None:
|
||||
del items[limit:]
|
||||
@ -148,6 +162,7 @@ class FilesystemMatcher(Matcher):
|
||||
def __init__(self, basedir, *args, **kwargs):
|
||||
Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
|
||||
|
||||
|
||||
# Python implementation of the scoring algorithm {{{
|
||||
|
||||
|
||||
@ -157,7 +172,9 @@ def calc_score_for_char(ctx, prev, current, distance):
|
||||
|
||||
if prev in ctx.level1:
|
||||
factor = 0.9
|
||||
elif prev in ctx.level2 or (icu_lower(prev) == prev and icu_upper(current) == current):
|
||||
elif prev in ctx.level2 or (
|
||||
icu_lower(prev) == prev and icu_upper(current) == current
|
||||
):
|
||||
factor = 0.8
|
||||
elif prev in ctx.level3:
|
||||
factor = 0.7
|
||||
@ -169,7 +186,7 @@ def calc_score_for_char(ctx, prev, current, distance):
|
||||
|
||||
def process_item(ctx, haystack, needle):
|
||||
# non-recursive implementation using a stack
|
||||
stack = [(0, 0, 0, 0, [-1]*len(needle))]
|
||||
stack = [(0, 0, 0, 0, [-1] * len(needle))]
|
||||
final_score, final_positions = stack[0][-2:]
|
||||
push, pop = stack.append, stack.pop
|
||||
while stack:
|
||||
@ -189,7 +206,9 @@ def process_item(ctx, haystack, needle):
|
||||
pos += hidx
|
||||
|
||||
distance = pos - last_idx
|
||||
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance)
|
||||
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(
|
||||
ctx, haystack[pos - 1], haystack[pos], distance
|
||||
)
|
||||
hidx = pos + 1
|
||||
push((hidx, i, last_idx, score, list(positions)))
|
||||
last_idx = positions[i] = pos
|
||||
@ -204,9 +223,17 @@ def process_item(ctx, haystack, needle):
|
||||
|
||||
|
||||
class PyScorer(object):
|
||||
__slots__ = ('level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory')
|
||||
__slots__ = (
|
||||
'level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory'
|
||||
)
|
||||
|
||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
||||
def __init__(
|
||||
self,
|
||||
items,
|
||||
level1=DEFAULT_LEVEL1,
|
||||
level2=DEFAULT_LEVEL2,
|
||||
level3=DEFAULT_LEVEL3
|
||||
):
|
||||
self.level1, self.level2, self.level3 = level1, level2, level3
|
||||
self.max_score_per_char = 0
|
||||
self.items = items
|
||||
@ -216,16 +243,30 @@ class PyScorer(object):
|
||||
self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0
|
||||
self.memory = {}
|
||||
yield process_item(self, item, needle)
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
class CScorer(object):
|
||||
|
||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
||||
def __init__(
|
||||
self,
|
||||
items,
|
||||
level1=DEFAULT_LEVEL1,
|
||||
level2=DEFAULT_LEVEL2,
|
||||
level3=DEFAULT_LEVEL3
|
||||
):
|
||||
speedup, err = plugins['matcher']
|
||||
if speedup is None:
|
||||
raise PluginFailed('Failed to load the matcher plugin with error: %s' % err)
|
||||
self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3))
|
||||
raise PluginFailed(
|
||||
'Failed to load the matcher plugin with error: %s' % err
|
||||
)
|
||||
self.m = speedup.Matcher(
|
||||
items,
|
||||
primary_collator().capsule,
|
||||
unicode(level1), unicode(level2), unicode(level3)
|
||||
)
|
||||
|
||||
def __call__(self, query):
|
||||
scores, positions = self.m.calculate_scores(query)
|
||||
@ -245,8 +286,14 @@ def test(return_tests=False):
|
||||
m('a')
|
||||
|
||||
def doit(c):
|
||||
m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',], scorer=CScorer)
|
||||
m = Matcher([
|
||||
c + 'im/one.gif',
|
||||
c + 'im/two.gif',
|
||||
c + 'text/one.html',
|
||||
],
|
||||
scorer=CScorer)
|
||||
m('one')
|
||||
|
||||
start = memory()
|
||||
for i in xrange(10):
|
||||
doit(str(i))
|
||||
@ -257,14 +304,16 @@ def test(return_tests=False):
|
||||
doit(str(i))
|
||||
gc.collect()
|
||||
used100 = memory() - start
|
||||
if used100 > 0 and used10 >= 0:
|
||||
if used100 > 0 and used10 > 0:
|
||||
self.assertLessEqual(used100, 2 * used10)
|
||||
|
||||
def test_non_bmp(self):
|
||||
raw = '_\U0001f431-'
|
||||
m = Matcher([raw], scorer=CScorer)
|
||||
positions = next(m(raw).itervalues())
|
||||
self.assertEqual(positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3)))
|
||||
self.assertEqual(
|
||||
positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))
|
||||
)
|
||||
|
||||
if return_tests:
|
||||
return unittest.TestLoader().loadTestsFromTestCase(Test)
|
||||
@ -277,12 +326,15 @@ def test(return_tests=False):
|
||||
|
||||
TestRunner(verbosity=4)
|
||||
|
||||
|
||||
if sys.maxunicode >= 0x10ffff:
|
||||
get_char = lambda string, pos: string[pos]
|
||||
else:
|
||||
|
||||
def get_char(string, pos):
|
||||
chs = 2 if ('\ud800' <= string[pos] <= '\udbff') else 1 # UTF-16 surrogate pair in python narrow builds
|
||||
return string[pos:pos+chs]
|
||||
chs = 2 if ('\ud800' <= string[pos] <= '\udbff'
|
||||
) else 1 # UTF-16 surrogate pair in python narrow builds
|
||||
return string[pos:pos + chs]
|
||||
|
||||
|
||||
def main(basedir=None, query=None):
|
||||
@ -290,7 +342,8 @@ def main(basedir=None, query=None):
|
||||
from calibre.utils.terminal import ColoredStream
|
||||
if basedir is None:
|
||||
try:
|
||||
basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()).decode(sys.stdin.encoding).strip() or os.getcwdu()
|
||||
basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()
|
||||
).decode(sys.stdin.encoding).strip() or os.getcwdu()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
return
|
||||
m = FilesystemMatcher(basedir)
|
||||
@ -318,6 +371,7 @@ def main(basedir=None, query=None):
|
||||
prints(path[p:])
|
||||
query = None
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# main(basedir='/t', query='ns')
|
||||
# test()
|
||||
|
Loading…
x
Reference in New Issue
Block a user