@read_api
def data_for_find_identical_books(self):
    '''
    Collect the lookup tables needed to run :meth:`find_identical_books`
    somewhere that has no access to the db (for example a worker process).
    See db.utils for a function that consumes this data.

    The result is a 3-tuple:

      * a mapping from ``icu_lower(author name)`` to the set of author ids
        whose name lower-cases to that key,
      * a copy of the authors table's ``col_book_map``,
      * a copy of the title table's ``book_col_map``.

    NOTE(review): the two ``.copy()`` calls look like shallow copies, so the
    contained values are presumably still shared with the live tables --
    confirm before mutating them.
    '''
    authors_table = self.fields['authors'].table
    # Several author ids can share one lower-cased name, hence a set per key.
    ids_by_lowered_name = defaultdict(set)
    for author_id, author_name in authors_table.id_map.iteritems():
        ids_by_lowered_name[icu_lower(author_name)].add(author_id)
    books_by_author_id = authors_table.col_book_map.copy()
    title_by_book_id = self.fields['title'].table.book_col_map.copy()
    return (ids_by_lowered_name, books_by_author_id, title_by_book_id)
def find_identical_books(mi, data):
    '''
    Find the books that carry every author listed in ``mi`` and whose title
    fuzzy-matches ``mi.title``, using only pre-extracted lookup tables.

    :param mi: An object with ``authors`` (iterable of author names, may be
        empty or None) and ``title`` attributes.
    :param data: The 3-tuple ``(author_map, aid_map, title_map)`` as returned
        by ``Cache.data_for_find_identical_books()``:

          * ``author_map``: ``icu_lower(author name)`` -> author ids
          * ``aid_map``: author id -> book ids
          * ``title_map``: book id -> title

    :return: A set of matching book ids. It is empty when ``mi`` has no
        authors, or when any one of its authors is unknown.
    '''
    author_map, aid_map, title_map = data

    # A book only qualifies if its authors are a superset of mi.authors, so
    # the per-author candidate sets are intersected, not unioned.
    candidates = None
    for author in (mi.authors or ()):
        books_for_author = set()
        # Several author ids may share the same lower-cased name; a book
        # matches this author if it is linked to any of them.
        for aid in author_map.get(icu_lower(author), ()):
            books_for_author.update(aid_map.get(aid, ()))
        if candidates is None:
            candidates = books_for_author
        else:
            candidates &= books_for_author
        if not candidates:
            # One required author has no (remaining) books: nothing can match.
            return set()

    if not candidates:
        # mi had no authors at all.
        return set()

    titleq = fuzzy_title(mi.title)
    return {
        book_id for book_id in candidates
        if fuzzy_title(title_map.get(book_id, '')) == titleq
    }