mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
pep8 and don't fail matcher memleak test if used memory for 10 rounds is zero
This commit is contained in:
parent
8c6433d843
commit
dab1bd61e9
@ -1,7 +1,6 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
from __future__ import (unicode_literals, division, absolute_import,
|
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||||
print_function)
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
@ -38,7 +37,7 @@ class Worker(Thread):
|
|||||||
def __init__(self, requests, results):
|
def __init__(self, requests, results):
|
||||||
Thread.__init__(self)
|
Thread.__init__(self)
|
||||||
self.requests, self.results = requests, results
|
self.requests, self.results = requests, results
|
||||||
atexit.register(lambda : requests.put(None))
|
atexit.register(lambda: requests.put(None))
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
while True:
|
while True:
|
||||||
@ -52,6 +51,8 @@ class Worker(Thread):
|
|||||||
self.results.put((False, as_unicode(e)))
|
self.results.put((False, as_unicode(e)))
|
||||||
# import traceback
|
# import traceback
|
||||||
# traceback.print_exc()
|
# traceback.print_exc()
|
||||||
|
|
||||||
|
|
||||||
wlock = Lock()
|
wlock = Lock()
|
||||||
workers = []
|
workers = []
|
||||||
|
|
||||||
@ -64,9 +65,9 @@ def split(tasks, pool_size):
|
|||||||
and i is the index of the element x in the original list.
|
and i is the index of the element x in the original list.
|
||||||
'''
|
'''
|
||||||
ans, count = [], 0
|
ans, count = [], 0
|
||||||
delta = int(ceil(len(tasks)/pool_size))
|
delta = int(ceil(len(tasks) / pool_size))
|
||||||
while tasks:
|
while tasks:
|
||||||
section = [(count+i, task) for i, task in enumerate(tasks[:delta])]
|
section = [(count + i, task) for i, task in enumerate(tasks[:delta])]
|
||||||
tasks = tasks[delta:]
|
tasks = tasks[delta:]
|
||||||
count += len(section)
|
count += len(section)
|
||||||
ans.append(section)
|
ans.append(section)
|
||||||
@ -82,7 +83,14 @@ def default_scorer(*args, **kwargs):
|
|||||||
|
|
||||||
class Matcher(object):
|
class Matcher(object):
|
||||||
|
|
||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3, scorer=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
items,
|
||||||
|
level1=DEFAULT_LEVEL1,
|
||||||
|
level2=DEFAULT_LEVEL2,
|
||||||
|
level3=DEFAULT_LEVEL3,
|
||||||
|
scorer=None
|
||||||
|
):
|
||||||
with wlock:
|
with wlock:
|
||||||
if not workers:
|
if not workers:
|
||||||
requests, results = Queue(), Queue()
|
requests, results = Queue(), Queue()
|
||||||
@ -92,9 +100,11 @@ class Matcher(object):
|
|||||||
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
||||||
self.items = items = tuple(items)
|
self.items = items = tuple(items)
|
||||||
tasks = split(items, len(workers))
|
tasks = split(items, len(workers))
|
||||||
self.task_maps = [{j:i for j, (i, _) in enumerate(task)} for task in tasks]
|
self.task_maps = [{j: i for j, (i, _) in enumerate(task)} for task in tasks]
|
||||||
scorer = scorer or default_scorer
|
scorer = scorer or default_scorer
|
||||||
self.scorers = [scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks]
|
self.scorers = [
|
||||||
|
scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks
|
||||||
|
]
|
||||||
self.sort_keys = None
|
self.sort_keys = None
|
||||||
|
|
||||||
def __call__(self, query, limit=None):
|
def __call__(self, query, limit=None):
|
||||||
@ -103,7 +113,10 @@ class Matcher(object):
|
|||||||
for i, scorer in enumerate(self.scorers):
|
for i, scorer in enumerate(self.scorers):
|
||||||
workers[0].requests.put((i, scorer, query))
|
workers[0].requests.put((i, scorer, query))
|
||||||
if self.sort_keys is None:
|
if self.sort_keys is None:
|
||||||
self.sort_keys = {i:primary_sort_key(x) for i, x in enumerate(self.items)}
|
self.sort_keys = {
|
||||||
|
i: primary_sort_key(x)
|
||||||
|
for i, x in enumerate(self.items)
|
||||||
|
}
|
||||||
num = len(self.task_maps)
|
num = len(self.task_maps)
|
||||||
scores, positions = {}, {}
|
scores, positions = {}, {}
|
||||||
error = None
|
error = None
|
||||||
@ -122,7 +135,8 @@ class Matcher(object):
|
|||||||
|
|
||||||
if error is not None:
|
if error is not None:
|
||||||
raise Exception('Failed to score items: %s' % error)
|
raise Exception('Failed to score items: %s' % error)
|
||||||
items = sorted(((-scores[i], item, positions[i]) for i, item in enumerate(self.items)),
|
items = sorted(((-scores[i], item, positions[i])
|
||||||
|
for i, item in enumerate(self.items)),
|
||||||
key=itemgetter(0))
|
key=itemgetter(0))
|
||||||
if limit is not None:
|
if limit is not None:
|
||||||
del items[limit:]
|
del items[limit:]
|
||||||
@ -148,6 +162,7 @@ class FilesystemMatcher(Matcher):
|
|||||||
def __init__(self, basedir, *args, **kwargs):
|
def __init__(self, basedir, *args, **kwargs):
|
||||||
Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
|
Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
# Python implementation of the scoring algorithm {{{
|
# Python implementation of the scoring algorithm {{{
|
||||||
|
|
||||||
|
|
||||||
@ -157,7 +172,9 @@ def calc_score_for_char(ctx, prev, current, distance):
|
|||||||
|
|
||||||
if prev in ctx.level1:
|
if prev in ctx.level1:
|
||||||
factor = 0.9
|
factor = 0.9
|
||||||
elif prev in ctx.level2 or (icu_lower(prev) == prev and icu_upper(current) == current):
|
elif prev in ctx.level2 or (
|
||||||
|
icu_lower(prev) == prev and icu_upper(current) == current
|
||||||
|
):
|
||||||
factor = 0.8
|
factor = 0.8
|
||||||
elif prev in ctx.level3:
|
elif prev in ctx.level3:
|
||||||
factor = 0.7
|
factor = 0.7
|
||||||
@ -169,7 +186,7 @@ def calc_score_for_char(ctx, prev, current, distance):
|
|||||||
|
|
||||||
def process_item(ctx, haystack, needle):
|
def process_item(ctx, haystack, needle):
|
||||||
# non-recursive implementation using a stack
|
# non-recursive implementation using a stack
|
||||||
stack = [(0, 0, 0, 0, [-1]*len(needle))]
|
stack = [(0, 0, 0, 0, [-1] * len(needle))]
|
||||||
final_score, final_positions = stack[0][-2:]
|
final_score, final_positions = stack[0][-2:]
|
||||||
push, pop = stack.append, stack.pop
|
push, pop = stack.append, stack.pop
|
||||||
while stack:
|
while stack:
|
||||||
@ -189,7 +206,9 @@ def process_item(ctx, haystack, needle):
|
|||||||
pos += hidx
|
pos += hidx
|
||||||
|
|
||||||
distance = pos - last_idx
|
distance = pos - last_idx
|
||||||
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance)
|
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(
|
||||||
|
ctx, haystack[pos - 1], haystack[pos], distance
|
||||||
|
)
|
||||||
hidx = pos + 1
|
hidx = pos + 1
|
||||||
push((hidx, i, last_idx, score, list(positions)))
|
push((hidx, i, last_idx, score, list(positions)))
|
||||||
last_idx = positions[i] = pos
|
last_idx = positions[i] = pos
|
||||||
@ -204,9 +223,17 @@ def process_item(ctx, haystack, needle):
|
|||||||
|
|
||||||
|
|
||||||
class PyScorer(object):
|
class PyScorer(object):
|
||||||
__slots__ = ('level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory')
|
__slots__ = (
|
||||||
|
'level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory'
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
def __init__(
|
||||||
|
self,
|
||||||
|
items,
|
||||||
|
level1=DEFAULT_LEVEL1,
|
||||||
|
level2=DEFAULT_LEVEL2,
|
||||||
|
level3=DEFAULT_LEVEL3
|
||||||
|
):
|
||||||
self.level1, self.level2, self.level3 = level1, level2, level3
|
self.level1, self.level2, self.level3 = level1, level2, level3
|
||||||
self.max_score_per_char = 0
|
self.max_score_per_char = 0
|
||||||
self.items = items
|
self.items = items
|
||||||
@ -216,16 +243,30 @@ class PyScorer(object):
|
|||||||
self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0
|
self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0
|
||||||
self.memory = {}
|
self.memory = {}
|
||||||
yield process_item(self, item, needle)
|
yield process_item(self, item, needle)
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
class CScorer(object):
|
class CScorer(object):
|
||||||
|
|
||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
def __init__(
|
||||||
|
self,
|
||||||
|
items,
|
||||||
|
level1=DEFAULT_LEVEL1,
|
||||||
|
level2=DEFAULT_LEVEL2,
|
||||||
|
level3=DEFAULT_LEVEL3
|
||||||
|
):
|
||||||
speedup, err = plugins['matcher']
|
speedup, err = plugins['matcher']
|
||||||
if speedup is None:
|
if speedup is None:
|
||||||
raise PluginFailed('Failed to load the matcher plugin with error: %s' % err)
|
raise PluginFailed(
|
||||||
self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3))
|
'Failed to load the matcher plugin with error: %s' % err
|
||||||
|
)
|
||||||
|
self.m = speedup.Matcher(
|
||||||
|
items,
|
||||||
|
primary_collator().capsule,
|
||||||
|
unicode(level1), unicode(level2), unicode(level3)
|
||||||
|
)
|
||||||
|
|
||||||
def __call__(self, query):
|
def __call__(self, query):
|
||||||
scores, positions = self.m.calculate_scores(query)
|
scores, positions = self.m.calculate_scores(query)
|
||||||
@ -245,8 +286,14 @@ def test(return_tests=False):
|
|||||||
m('a')
|
m('a')
|
||||||
|
|
||||||
def doit(c):
|
def doit(c):
|
||||||
m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',], scorer=CScorer)
|
m = Matcher([
|
||||||
|
c + 'im/one.gif',
|
||||||
|
c + 'im/two.gif',
|
||||||
|
c + 'text/one.html',
|
||||||
|
],
|
||||||
|
scorer=CScorer)
|
||||||
m('one')
|
m('one')
|
||||||
|
|
||||||
start = memory()
|
start = memory()
|
||||||
for i in xrange(10):
|
for i in xrange(10):
|
||||||
doit(str(i))
|
doit(str(i))
|
||||||
@ -257,14 +304,16 @@ def test(return_tests=False):
|
|||||||
doit(str(i))
|
doit(str(i))
|
||||||
gc.collect()
|
gc.collect()
|
||||||
used100 = memory() - start
|
used100 = memory() - start
|
||||||
if used100 > 0 and used10 >= 0:
|
if used100 > 0 and used10 > 0:
|
||||||
self.assertLessEqual(used100, 2 * used10)
|
self.assertLessEqual(used100, 2 * used10)
|
||||||
|
|
||||||
def test_non_bmp(self):
|
def test_non_bmp(self):
|
||||||
raw = '_\U0001f431-'
|
raw = '_\U0001f431-'
|
||||||
m = Matcher([raw], scorer=CScorer)
|
m = Matcher([raw], scorer=CScorer)
|
||||||
positions = next(m(raw).itervalues())
|
positions = next(m(raw).itervalues())
|
||||||
self.assertEqual(positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3)))
|
self.assertEqual(
|
||||||
|
positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))
|
||||||
|
)
|
||||||
|
|
||||||
if return_tests:
|
if return_tests:
|
||||||
return unittest.TestLoader().loadTestsFromTestCase(Test)
|
return unittest.TestLoader().loadTestsFromTestCase(Test)
|
||||||
@ -277,12 +326,15 @@ def test(return_tests=False):
|
|||||||
|
|
||||||
TestRunner(verbosity=4)
|
TestRunner(verbosity=4)
|
||||||
|
|
||||||
|
|
||||||
if sys.maxunicode >= 0x10ffff:
|
if sys.maxunicode >= 0x10ffff:
|
||||||
get_char = lambda string, pos: string[pos]
|
get_char = lambda string, pos: string[pos]
|
||||||
else:
|
else:
|
||||||
|
|
||||||
def get_char(string, pos):
|
def get_char(string, pos):
|
||||||
chs = 2 if ('\ud800' <= string[pos] <= '\udbff') else 1 # UTF-16 surrogate pair in python narrow builds
|
chs = 2 if ('\ud800' <= string[pos] <= '\udbff'
|
||||||
return string[pos:pos+chs]
|
) else 1 # UTF-16 surrogate pair in python narrow builds
|
||||||
|
return string[pos:pos + chs]
|
||||||
|
|
||||||
|
|
||||||
def main(basedir=None, query=None):
|
def main(basedir=None, query=None):
|
||||||
@ -290,7 +342,8 @@ def main(basedir=None, query=None):
|
|||||||
from calibre.utils.terminal import ColoredStream
|
from calibre.utils.terminal import ColoredStream
|
||||||
if basedir is None:
|
if basedir is None:
|
||||||
try:
|
try:
|
||||||
basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()).decode(sys.stdin.encoding).strip() or os.getcwdu()
|
basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()
|
||||||
|
).decode(sys.stdin.encoding).strip() or os.getcwdu()
|
||||||
except (EOFError, KeyboardInterrupt):
|
except (EOFError, KeyboardInterrupt):
|
||||||
return
|
return
|
||||||
m = FilesystemMatcher(basedir)
|
m = FilesystemMatcher(basedir)
|
||||||
@ -318,6 +371,7 @@ def main(basedir=None, query=None):
|
|||||||
prints(path[p:])
|
prints(path[p:])
|
||||||
query = None
|
query = None
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# main(basedir='/t', query='ns')
|
# main(basedir='/t', query='ns')
|
||||||
# test()
|
# test()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user