pep8, and don't fail the matcher memleak test if used memory for 10 rounds is zero

This commit is contained in:
Kovid Goyal 2017-02-02 20:30:32 +05:30
parent 8c6433d843
commit dab1bd61e9

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import, from __future__ import (unicode_literals, division, absolute_import, print_function)
print_function)
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
@ -38,7 +37,7 @@ class Worker(Thread):
def __init__(self, requests, results): def __init__(self, requests, results):
Thread.__init__(self) Thread.__init__(self)
self.requests, self.results = requests, results self.requests, self.results = requests, results
atexit.register(lambda : requests.put(None)) atexit.register(lambda: requests.put(None))
def run(self): def run(self):
while True: while True:
@ -52,6 +51,8 @@ class Worker(Thread):
self.results.put((False, as_unicode(e))) self.results.put((False, as_unicode(e)))
# import traceback # import traceback
# traceback.print_exc() # traceback.print_exc()
wlock = Lock() wlock = Lock()
workers = [] workers = []
@ -64,9 +65,9 @@ def split(tasks, pool_size):
and i is the index of the element x in the original list. and i is the index of the element x in the original list.
''' '''
ans, count = [], 0 ans, count = [], 0
delta = int(ceil(len(tasks)/pool_size)) delta = int(ceil(len(tasks) / pool_size))
while tasks: while tasks:
section = [(count+i, task) for i, task in enumerate(tasks[:delta])] section = [(count + i, task) for i, task in enumerate(tasks[:delta])]
tasks = tasks[delta:] tasks = tasks[delta:]
count += len(section) count += len(section)
ans.append(section) ans.append(section)
@ -82,7 +83,14 @@ def default_scorer(*args, **kwargs):
class Matcher(object): class Matcher(object):
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3, scorer=None): def __init__(
self,
items,
level1=DEFAULT_LEVEL1,
level2=DEFAULT_LEVEL2,
level3=DEFAULT_LEVEL3,
scorer=None
):
with wlock: with wlock:
if not workers: if not workers:
requests, results = Queue(), Queue() requests, results = Queue(), Queue()
@ -92,9 +100,11 @@ class Matcher(object):
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
self.items = items = tuple(items) self.items = items = tuple(items)
tasks = split(items, len(workers)) tasks = split(items, len(workers))
self.task_maps = [{j:i for j, (i, _) in enumerate(task)} for task in tasks] self.task_maps = [{j: i for j, (i, _) in enumerate(task)} for task in tasks]
scorer = scorer or default_scorer scorer = scorer or default_scorer
self.scorers = [scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks] self.scorers = [
scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks
]
self.sort_keys = None self.sort_keys = None
def __call__(self, query, limit=None): def __call__(self, query, limit=None):
@ -103,7 +113,10 @@ class Matcher(object):
for i, scorer in enumerate(self.scorers): for i, scorer in enumerate(self.scorers):
workers[0].requests.put((i, scorer, query)) workers[0].requests.put((i, scorer, query))
if self.sort_keys is None: if self.sort_keys is None:
self.sort_keys = {i:primary_sort_key(x) for i, x in enumerate(self.items)} self.sort_keys = {
i: primary_sort_key(x)
for i, x in enumerate(self.items)
}
num = len(self.task_maps) num = len(self.task_maps)
scores, positions = {}, {} scores, positions = {}, {}
error = None error = None
@ -122,7 +135,8 @@ class Matcher(object):
if error is not None: if error is not None:
raise Exception('Failed to score items: %s' % error) raise Exception('Failed to score items: %s' % error)
items = sorted(((-scores[i], item, positions[i]) for i, item in enumerate(self.items)), items = sorted(((-scores[i], item, positions[i])
for i, item in enumerate(self.items)),
key=itemgetter(0)) key=itemgetter(0))
if limit is not None: if limit is not None:
del items[limit:] del items[limit:]
@ -148,6 +162,7 @@ class FilesystemMatcher(Matcher):
def __init__(self, basedir, *args, **kwargs): def __init__(self, basedir, *args, **kwargs):
Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs) Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)
# Python implementation of the scoring algorithm {{{ # Python implementation of the scoring algorithm {{{
@ -157,7 +172,9 @@ def calc_score_for_char(ctx, prev, current, distance):
if prev in ctx.level1: if prev in ctx.level1:
factor = 0.9 factor = 0.9
elif prev in ctx.level2 or (icu_lower(prev) == prev and icu_upper(current) == current): elif prev in ctx.level2 or (
icu_lower(prev) == prev and icu_upper(current) == current
):
factor = 0.8 factor = 0.8
elif prev in ctx.level3: elif prev in ctx.level3:
factor = 0.7 factor = 0.7
@ -169,7 +186,7 @@ def calc_score_for_char(ctx, prev, current, distance):
def process_item(ctx, haystack, needle): def process_item(ctx, haystack, needle):
# non-recursive implementation using a stack # non-recursive implementation using a stack
stack = [(0, 0, 0, 0, [-1]*len(needle))] stack = [(0, 0, 0, 0, [-1] * len(needle))]
final_score, final_positions = stack[0][-2:] final_score, final_positions = stack[0][-2:]
push, pop = stack.append, stack.pop push, pop = stack.append, stack.pop
while stack: while stack:
@ -189,7 +206,9 @@ def process_item(ctx, haystack, needle):
pos += hidx pos += hidx
distance = pos - last_idx distance = pos - last_idx
score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance) score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(
ctx, haystack[pos - 1], haystack[pos], distance
)
hidx = pos + 1 hidx = pos + 1
push((hidx, i, last_idx, score, list(positions))) push((hidx, i, last_idx, score, list(positions)))
last_idx = positions[i] = pos last_idx = positions[i] = pos
@ -204,9 +223,17 @@ def process_item(ctx, haystack, needle):
class PyScorer(object): class PyScorer(object):
__slots__ = ('level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory') __slots__ = (
'level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory'
)
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): def __init__(
self,
items,
level1=DEFAULT_LEVEL1,
level2=DEFAULT_LEVEL2,
level3=DEFAULT_LEVEL3
):
self.level1, self.level2, self.level3 = level1, level2, level3 self.level1, self.level2, self.level3 = level1, level2, level3
self.max_score_per_char = 0 self.max_score_per_char = 0
self.items = items self.items = items
@ -216,16 +243,30 @@ class PyScorer(object):
self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0 self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0
self.memory = {} self.memory = {}
yield process_item(self, item, needle) yield process_item(self, item, needle)
# }}} # }}}
class CScorer(object): class CScorer(object):
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): def __init__(
self,
items,
level1=DEFAULT_LEVEL1,
level2=DEFAULT_LEVEL2,
level3=DEFAULT_LEVEL3
):
speedup, err = plugins['matcher'] speedup, err = plugins['matcher']
if speedup is None: if speedup is None:
raise PluginFailed('Failed to load the matcher plugin with error: %s' % err) raise PluginFailed(
self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3)) 'Failed to load the matcher plugin with error: %s' % err
)
self.m = speedup.Matcher(
items,
primary_collator().capsule,
unicode(level1), unicode(level2), unicode(level3)
)
def __call__(self, query): def __call__(self, query):
scores, positions = self.m.calculate_scores(query) scores, positions = self.m.calculate_scores(query)
@ -245,8 +286,14 @@ def test(return_tests=False):
m('a') m('a')
def doit(c): def doit(c):
m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',], scorer=CScorer) m = Matcher([
c + 'im/one.gif',
c + 'im/two.gif',
c + 'text/one.html',
],
scorer=CScorer)
m('one') m('one')
start = memory() start = memory()
for i in xrange(10): for i in xrange(10):
doit(str(i)) doit(str(i))
@ -257,14 +304,16 @@ def test(return_tests=False):
doit(str(i)) doit(str(i))
gc.collect() gc.collect()
used100 = memory() - start used100 = memory() - start
if used100 > 0 and used10 >= 0: if used100 > 0 and used10 > 0:
self.assertLessEqual(used100, 2 * used10) self.assertLessEqual(used100, 2 * used10)
def test_non_bmp(self): def test_non_bmp(self):
raw = '_\U0001f431-' raw = '_\U0001f431-'
m = Matcher([raw], scorer=CScorer) m = Matcher([raw], scorer=CScorer)
positions = next(m(raw).itervalues()) positions = next(m(raw).itervalues())
self.assertEqual(positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))) self.assertEqual(
positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))
)
if return_tests: if return_tests:
return unittest.TestLoader().loadTestsFromTestCase(Test) return unittest.TestLoader().loadTestsFromTestCase(Test)
@ -277,12 +326,15 @@ def test(return_tests=False):
TestRunner(verbosity=4) TestRunner(verbosity=4)
if sys.maxunicode >= 0x10ffff: if sys.maxunicode >= 0x10ffff:
get_char = lambda string, pos: string[pos] get_char = lambda string, pos: string[pos]
else: else:
def get_char(string, pos): def get_char(string, pos):
chs = 2 if ('\ud800' <= string[pos] <= '\udbff') else 1 # UTF-16 surrogate pair in python narrow builds chs = 2 if ('\ud800' <= string[pos] <= '\udbff'
return string[pos:pos+chs] ) else 1 # UTF-16 surrogate pair in python narrow builds
return string[pos:pos + chs]
def main(basedir=None, query=None): def main(basedir=None, query=None):
@ -290,7 +342,8 @@ def main(basedir=None, query=None):
from calibre.utils.terminal import ColoredStream from calibre.utils.terminal import ColoredStream
if basedir is None: if basedir is None:
try: try:
basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()).decode(sys.stdin.encoding).strip() or os.getcwdu() basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()
).decode(sys.stdin.encoding).strip() or os.getcwdu()
except (EOFError, KeyboardInterrupt): except (EOFError, KeyboardInterrupt):
return return
m = FilesystemMatcher(basedir) m = FilesystemMatcher(basedir)
@ -318,6 +371,7 @@ def main(basedir=None, query=None):
prints(path[p:]) prints(path[p:])
query = None query = None
if __name__ == '__main__': if __name__ == '__main__':
# main(basedir='/t', query='ns') # main(basedir='/t', query='ns')
# test() # test()