From dab1bd61e94c2fbfb97bda096115a61bd5660b8a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 2 Feb 2017 20:30:32 +0530 Subject: [PATCH] pep8 and dont fail matcher memleak test is used memory for 10 rounds is zero --- src/calibre/utils/matcher.py | 102 ++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 24 deletions(-) diff --git a/src/calibre/utils/matcher.py b/src/calibre/utils/matcher.py index 8defe9049d..6d4164c125 100644 --- a/src/calibre/utils/matcher.py +++ b/src/calibre/utils/matcher.py @@ -1,7 +1,6 @@ #!/usr/bin/env python2 # vim:fileencoding=utf-8 -from __future__ import (unicode_literals, division, absolute_import, - print_function) +from __future__ import (unicode_literals, division, absolute_import, print_function) __license__ = 'GPL v3' __copyright__ = '2014, Kovid Goyal ' @@ -38,7 +37,7 @@ class Worker(Thread): def __init__(self, requests, results): Thread.__init__(self) self.requests, self.results = requests, results - atexit.register(lambda : requests.put(None)) + atexit.register(lambda: requests.put(None)) def run(self): while True: @@ -52,6 +51,8 @@ class Worker(Thread): self.results.put((False, as_unicode(e))) # import traceback # traceback.print_exc() + + wlock = Lock() workers = [] @@ -64,9 +65,9 @@ def split(tasks, pool_size): and i is the index of the element x in the original list. ''' ans, count = [], 0 - delta = int(ceil(len(tasks)/pool_size)) + delta = int(ceil(len(tasks) / pool_size)) while tasks: - section = [(count+i, task) for i, task in enumerate(tasks[:delta])] + section = [(count + i, task) for i, task in enumerate(tasks[:delta])] tasks = tasks[delta:] count += len(section) ans.append(section) @@ -82,7 +83,14 @@ def default_scorer(*args, **kwargs): class Matcher(object): - def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3, scorer=None): + def __init__( + self, + items, + level1=DEFAULT_LEVEL1, + level2=DEFAULT_LEVEL2, + level3=DEFAULT_LEVEL3, + scorer=None + ): with wlock: if not workers: requests, results = Queue(), Queue() @@ -92,9 +100,11 @@ class Matcher(object): items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) self.items = items = tuple(items) tasks = split(items, len(workers)) - self.task_maps = [{j:i for j, (i, _) in enumerate(task)} for task in tasks] + self.task_maps = [{j: i for j, (i, _) in enumerate(task)} for task in tasks] scorer = scorer or default_scorer - self.scorers = [scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks] + self.scorers = [ + scorer(tuple(map(itemgetter(1), task_items))) for task_items in tasks + ] self.sort_keys = None def __call__(self, query, limit=None): @@ -103,7 +113,10 @@ class Matcher(object): for i, scorer in enumerate(self.scorers): workers[0].requests.put((i, scorer, query)) if self.sort_keys is None: - self.sort_keys = {i:primary_sort_key(x) for i, x in enumerate(self.items)} + self.sort_keys = { + i: primary_sort_key(x) + for i, x in enumerate(self.items) + } num = len(self.task_maps) scores, positions = {}, {} error = None @@ -122,7 +135,8 @@ class Matcher(object): if error is not None: raise Exception('Failed to score items: %s' % error) - items = sorted(((-scores[i], item, positions[i]) for i, item in enumerate(self.items)), + items = sorted(((-scores[i], item, positions[i]) + for i, item in enumerate(self.items)), key=itemgetter(0)) if limit is not None: del items[limit:] @@ -148,6 +162,7 @@ class FilesystemMatcher(Matcher): def __init__(self, basedir, *args, **kwargs): Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs) + # Python implementation of the scoring algorithm {{{ @@ -157,7 +172,9 @@ def calc_score_for_char(ctx, prev, current, distance): if prev in ctx.level1: factor = 0.9 - elif prev in ctx.level2 or (icu_lower(prev) == prev and icu_upper(current) == current): + elif prev in ctx.level2 or ( + icu_lower(prev) == prev and icu_upper(current) == current + ): factor = 0.8 elif prev in ctx.level3: factor = 0.7 @@ -169,7 +186,7 @@ def calc_score_for_char(ctx, prev, current, distance): def process_item(ctx, haystack, needle): # non-recursive implementation using a stack - stack = [(0, 0, 0, 0, [-1]*len(needle))] + stack = [(0, 0, 0, 0, [-1] * len(needle))] final_score, final_positions = stack[0][-2:] push, pop = stack.append, stack.pop while stack: @@ -189,7 +206,9 @@ def process_item(ctx, haystack, needle): pos += hidx distance = pos - last_idx - score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance) + score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char( + ctx, haystack[pos - 1], haystack[pos], distance + ) hidx = pos + 1 push((hidx, i, last_idx, score, list(positions))) last_idx = positions[i] = pos @@ -204,9 +223,17 @@ def process_item(ctx, haystack, needle): class PyScorer(object): - __slots__ = ('level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory') + __slots__ = ( + 'level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory' + ) - def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): + def __init__( + self, + items, + level1=DEFAULT_LEVEL1, + level2=DEFAULT_LEVEL2, + level3=DEFAULT_LEVEL3 + ): self.level1, self.level2, self.level3 = level1, level2, level3 self.max_score_per_char = 0 self.items = items @@ -216,16 +243,30 @@ class PyScorer(object): self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0 self.memory = {} yield process_item(self, item, needle) + + # }}} class CScorer(object): - def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): + def __init__( + self, + items, + level1=DEFAULT_LEVEL1, + level2=DEFAULT_LEVEL2, + level3=DEFAULT_LEVEL3 + ): speedup, err = plugins['matcher'] if speedup is None: - raise PluginFailed('Failed to load the matcher plugin with error: %s' % err) - self.m = speedup.Matcher(items, primary_collator().capsule, unicode(level1), unicode(level2), unicode(level3)) + raise PluginFailed( + 'Failed to load the matcher plugin with error: %s' % err + ) + self.m = speedup.Matcher( + items, + primary_collator().capsule, + unicode(level1), unicode(level2), unicode(level3) + ) def __call__(self, query): scores, positions = self.m.calculate_scores(query) @@ -245,8 +286,14 @@ def test(return_tests=False): m('a') def doit(c): - m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',], scorer=CScorer) + m = Matcher([ + c + 'im/one.gif', + c + 'im/two.gif', + c + 'text/one.html', + ], + scorer=CScorer) m('one') + start = memory() for i in xrange(10): doit(str(i)) @@ -257,14 +304,16 @@ def test(return_tests=False): doit(str(i)) gc.collect() used100 = memory() - start - if used100 > 0 and used10 >= 0: + if used100 > 0 and used10 > 0: self.assertLessEqual(used100, 2 * used10) def test_non_bmp(self): raw = '_\U0001f431-' m = Matcher([raw], scorer=CScorer) positions = next(m(raw).itervalues()) - self.assertEqual(positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))) + self.assertEqual( + positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3)) + ) if return_tests: return unittest.TestLoader().loadTestsFromTestCase(Test) @@ -277,12 +326,15 @@ def test(return_tests=False): TestRunner(verbosity=4) + if sys.maxunicode >= 0x10ffff: get_char = lambda string, pos: string[pos] else: + def get_char(string, pos): - chs = 2 if ('\ud800' <= string[pos] <= '\udbff') else 1 # UTF-16 surrogate pair in python narrow builds - return string[pos:pos+chs] + chs = 2 if ('\ud800' <= string[pos] <= '\udbff' + ) else 1 # UTF-16 surrogate pair in python narrow builds + return string[pos:pos + chs] def main(basedir=None, query=None): @@ -290,7 +342,8 @@ def main(basedir=None, query=None): from calibre.utils.terminal import ColoredStream if basedir is None: try: - basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu()).decode(sys.stdin.encoding).strip() or os.getcwdu() + basedir = raw_input('Enter directory to scan [%s]: ' % os.getcwdu() + ).decode(sys.stdin.encoding).strip() or os.getcwdu() except (EOFError, KeyboardInterrupt): return m = FilesystemMatcher(basedir) @@ -318,6 +371,7 @@ def main(basedir=None, query=None): prints(path[p:]) query = None + if __name__ == '__main__': # main(basedir='/t', query='ns') # test()