mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Method to find identical books in worker processes without access to the db
This commit is contained in:
parent
466a399c34
commit
d5879944cf
@ -1774,10 +1774,21 @@ class Cache(object):
|
||||
return f.get_books_for_val(item_id_or_composite_value, self._get_proxy_metadata, self._all_book_ids())
|
||||
return self._books_for_field(f.name, int(item_id_or_composite_value))
|
||||
|
||||
@read_api
def data_for_find_identical_books(self):
    '''
    Collect the lookup tables needed to implement
    :meth:`find_identical_books` in a worker process that has no access to
    the db. See db.utils for an implementation.

    Returns a 3-tuple ``(author_map, col_book_map, book_col_map)``:

    * ``author_map``: ``icu_lower(author name)`` -> set of author ids that
      have that lower-cased name
    * ``col_book_map``: a copy of the authors table's ``col_book_map``
    * ``book_col_map``: a copy of the title table's ``book_col_map``
    '''
    authors_table = self.fields['authors'].table
    # Several author ids can share the same lower-cased name, so each name
    # maps to a set of ids rather than to a single id.
    ids_by_lowered_name = defaultdict(set)
    for author_id, author_name in authors_table.id_map.iteritems():
        ids_by_lowered_name[icu_lower(author_name)].add(author_id)
    books_by_author = authors_table.col_book_map.copy()
    titles_by_book = self.fields['title'].table.book_col_map.copy()
    return (ids_by_lowered_name, books_by_author, titles_by_book)
|
||||
|
||||
@read_api
|
||||
def find_identical_books(self, mi, search_restriction='', book_ids=None):
|
||||
''' Finds books that have a superset of the authors in mi and the same
|
||||
title (title is fuzzy matched) '''
|
||||
title (title is fuzzy matched). See also :meth:`data_for_find_identical_books`. '''
|
||||
from calibre.db.utils import fuzzy_title
|
||||
identical_book_ids = set()
|
||||
if mi.authors:
|
||||
|
@ -628,3 +628,18 @@ class ReadingTest(BaseTest):
|
||||
self.assertEqual('FMT2', cache.field_for('#ccf', 1))
|
||||
# }}}
|
||||
|
||||
def test_find_identical_books(self):  # {{{
    ' Check find_identical_books, both on the cache and via the db-less helper '
    from calibre.db.utils import find_identical_books as find_without_db
    from calibre.ebooks.metadata.book.base import Metadata
    cache = self.init_cache(self.library_path)
    # Snapshot of the lookup tables consumed by the db-less implementation
    worker_data = cache.data_for_find_identical_books()
    expectations = (
        (Metadata('title one', ['author one']), {2}),
        (Metadata(_('Unknown')), {3}),
        (Metadata('title two', ['author one']), {1}),
    )
    for mi, expected_ids in expectations:
        # Both implementations must report the same set of book ids
        self.assertEqual(expected_ids, cache.find_identical_books(mi))
        self.assertEqual(expected_ids, find_without_db(mi, worker_data))
# }}}
|
||||
|
@ -53,6 +53,23 @@ def fuzzy_title(title):
|
||||
title = pat.sub(repl, title)
|
||||
return title
|
||||
|
||||
def find_identical_books(mi, data):
    '''
    Find books that have a superset of the authors in mi and the same title
    (title is fuzzy matched), without needing access to the db.

    :param mi: Metadata-like object; its ``authors`` and ``title``
        attributes are read.
    :param data: The 3-tuple ``(author_map, aid_map, title_map)`` as returned
        by ``Cache.data_for_find_identical_books()``: lower-cased author name
        -> set of author ids, author id -> set of book ids, and book id ->
        title.
    :return: A new set of matching book ids. Empty if mi has no authors, if
        any author is unknown, or if no title matches.
    '''
    author_map, aid_map, title_map = data
    candidates = None
    for author in (mi.authors or ()):
        author_ids = author_map.get(icu_lower(author))
        if not author_ids:
            # An author with no entry cannot be on any book, so no book can
            # carry every author in mi.
            return set()
        # Always build a fresh set so the sets stored in ``data`` are never
        # mutated.
        books_by_author = set()
        for aid in author_ids:
            books_by_author.update(aid_map.get(aid, ()))
        # A match must be credited to *every* author in mi, so intersect the
        # per-author book sets instead of taking their union.
        if candidates is None:
            candidates = books_by_author
        else:
            candidates &= books_by_author
        if not candidates:
            return set()
    if not candidates:
        return set()
    titleq = fuzzy_title(mi.title)
    ans = set()
    for book_id in candidates:
        if fuzzy_title(title_map.get(book_id, '')) == titleq:
            ans.add(book_id)
    return ans
|
||||
|
||||
|
||||
# Record type with four fields: path, size, timestamp and thumbnail_size
Entry = namedtuple('Entry', ['path', 'size', 'timestamp', 'thumbnail_size'])


class CacheError(Exception):
    ''' Plain Exception subclass; it adds no behaviour of its own '''
|
||||
|
Loading…
x
Reference in New Issue
Block a user