mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Method to find identical books in worker processes without access to the db
This commit is contained in:
parent
466a399c34
commit
d5879944cf
@ -1774,10 +1774,21 @@ class Cache(object):
|
|||||||
return f.get_books_for_val(item_id_or_composite_value, self._get_proxy_metadata, self._all_book_ids())
|
return f.get_books_for_val(item_id_or_composite_value, self._get_proxy_metadata, self._all_book_ids())
|
||||||
return self._books_for_field(f.name, int(item_id_or_composite_value))
|
return self._books_for_field(f.name, int(item_id_or_composite_value))
|
||||||
|
|
||||||
|
@read_api
def data_for_find_identical_books(self):
    ''' Collect the lookup tables needed to implement
    :meth:`find_identical_books` in a worker process without access to the
    db. See db.utils for an implementation.

    Returns a 3-tuple: a mapping from the icu_lower()-ed author name to the
    set of author ids carrying that name, a copy of the authors table's
    ``col_book_map`` and a copy of the title table's ``book_col_map``. '''
    authors_table = self.fields['authors'].table
    ids_by_lowered_name = defaultdict(set)
    for author_id, name in authors_table.id_map.iteritems():
        # Several author ids can lower-case to the same name, so each
        # key accumulates a set of ids rather than a single id.
        ids_by_lowered_name[icu_lower(name)].add(author_id)
    # Copies are taken with .copy() so the caller gets its own top-level
    # containers instead of the tables' own maps.
    books_by_author_id = authors_table.col_book_map.copy()
    title_by_book_id = self.fields['title'].table.book_col_map.copy()
    return (ids_by_lowered_name, books_by_author_id, title_by_book_id)
|
||||||
|
|
||||||
@read_api
|
@read_api
|
||||||
def find_identical_books(self, mi, search_restriction='', book_ids=None):
|
def find_identical_books(self, mi, search_restriction='', book_ids=None):
|
||||||
''' Finds books that have a superset of the authors in mi and the same
|
''' Finds books that have a superset of the authors in mi and the same
|
||||||
title (title is fuzzy matched) '''
|
title (title is fuzzy matched). See also :meth:`data_for_find_identical_books`. '''
|
||||||
from calibre.db.utils import fuzzy_title
|
from calibre.db.utils import fuzzy_title
|
||||||
identical_book_ids = set()
|
identical_book_ids = set()
|
||||||
if mi.authors:
|
if mi.authors:
|
||||||
|
@ -628,3 +628,18 @@ class ReadingTest(BaseTest):
|
|||||||
self.assertEqual('FMT2', cache.field_for('#ccf', 1))
|
self.assertEqual('FMT2', cache.field_for('#ccf', 1))
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
def test_find_identical_books(self):  # {{{
    ' Test find_identical_books '
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.db.utils import find_identical_books
    # 'find_identical_books': [(,), (Metadata('unknown'),), (Metadata('xxxx'),)],
    cache = self.init_cache(self.library_path)
    data = cache.data_for_find_identical_books()
    # Each case pairs a query Metadata object with the set of book ids
    # that both implementations are expected to report for it.
    cases = (
        (Metadata('title one', ['author one']), {2}),
        (Metadata(_('Unknown')), {3}),
        (Metadata('title two', ['author one']), {1}),
    )
    for query, expected in cases:
        # The db-backed method and the standalone helper fed with the
        # exported data must agree on the same answer.
        self.assertEqual(expected, cache.find_identical_books(query))
        self.assertEqual(expected, find_identical_books(query, data))
# }}}
|
||||||
|
@ -53,6 +53,23 @@ def fuzzy_title(title):
|
|||||||
title = pat.sub(repl, title)
|
title = pat.sub(repl, title)
|
||||||
return title
|
return title
|
||||||
|
|
||||||
|
def find_identical_books(mi, data):
    ''' Find books matching ``mi`` using only pre-exported lookup tables,
    so that it can run without access to the db.

    :param mi: Object with an ``authors`` iterable of names (may be empty
        or None) and a ``title``.
    :param data: 3-tuple ``(author_map, aid_map, title_map)``, where
        ``author_map`` maps an icu_lower()-ed author name to a set of
        author ids, ``aid_map`` maps an author id to a set of book ids and
        ``title_map`` maps a book id to its title.
    :return: The set of book ids that are linked to at least one author of
        ``mi`` (names compared after icu_lower()) and whose title equals
        the title of ``mi`` after both have passed through fuzzy_title().
    '''
    author_map, aid_map, title_map = data

    author_ids = set()
    # Tolerate a missing author list: the db-backed implementation guards
    # with `if mi.authors:`, so None must not raise here either.
    for a in mi.authors or ():
        author_ids |= author_map.get(icu_lower(a), set())

    book_ids = set()
    for aid in author_ids:
        book_ids |= aid_map.get(aid, set())
    if not book_ids:
        # No candidate shares an author, so there is nothing to compare
        # titles against; skip normalising the title.
        return set()

    titleq = fuzzy_title(mi.title)
    return {
        book_id for book_id in book_ids
        if fuzzy_title(title_map.get(book_id, '')) == titleq
    }
|
||||||
|
|
||||||
|
|
||||||
# Record type with four positional fields: path, size, timestamp, thumbnail_size
Entry = namedtuple('Entry', ('path', 'size', 'timestamp', 'thumbnail_size'))
|
||||||
class CacheError(Exception):
    ''' Exception subclass that adds no state or behavior of its own. '''
|
||||||
|
Loading…
x
Reference in New Issue
Block a user