mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
find_identical_books()
This commit is contained in:
parent
669efdd6f6
commit
34704c9735
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, traceback, random, shutil
|
import os, traceback, random, shutil, re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from functools import wraps, partial
|
from functools import wraps, partial
|
||||||
@ -25,7 +25,7 @@ from calibre.db.tables import VirtualTable
|
|||||||
from calibre.db.write import get_series_values
|
from calibre.db.write import get_series_values
|
||||||
from calibre.db.lazy import FormatMetadata, FormatsList
|
from calibre.db.lazy import FormatMetadata, FormatsList
|
||||||
from calibre.ebooks import check_ebook_format
|
from calibre.ebooks import check_ebook_format
|
||||||
from calibre.ebooks.metadata import string_to_authors, author_to_author_sort
|
from calibre.ebooks.metadata import string_to_authors, author_to_author_sort, get_title_sort_pat
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.ebooks.metadata.opf2 import metadata_to_opf
|
from calibre.ebooks.metadata.opf2 import metadata_to_opf
|
||||||
from calibre.ptempfile import (base_dir, PersistentTemporaryFile,
|
from calibre.ptempfile import (base_dir, PersistentTemporaryFile,
|
||||||
@ -767,9 +767,8 @@ class Cache(object):
|
|||||||
return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
|
return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
|
||||||
|
|
||||||
@read_api
|
@read_api
|
||||||
def search(self, query, restriction, virtual_fields=None):
|
def search(self, query, restriction='', virtual_fields=None, book_ids=None):
|
||||||
return self._search_api(self, query, restriction,
|
return self._search_api(self, query, restriction, virtual_fields=virtual_fields, book_ids=book_ids)
|
||||||
virtual_fields=virtual_fields)
|
|
||||||
|
|
||||||
@read_api
|
@read_api
|
||||||
def get_categories(self, sort='name', book_ids=None, icon_map=None):
|
def get_categories(self, sort='name', book_ids=None, icon_map=None):
|
||||||
@ -1452,6 +1451,59 @@ class Cache(object):
|
|||||||
return f.get_books_for_val(item_id_or_composite_value, self._get_metadata, self._all_book_ids())
|
return f.get_books_for_val(item_id_or_composite_value, self._get_metadata, self._all_book_ids())
|
||||||
return self._books_for_field(f.name, item_id_or_composite_value)
|
return self._books_for_field(f.name, item_id_or_composite_value)
|
||||||
|
|
||||||
|
@read_api
def find_identical_books(self, mi, search_restriction='', book_ids=None):
    ''' Finds books that have a superset of the authors in mi and the same
    title (title is fuzzy matched)

    :param mi: Metadata object; only ``mi.authors`` and ``mi.title`` are read.
    :param search_restriction: Forwarded as the ``restriction`` of the
        author search.
    :param book_ids: Forwarded unchanged to the author search as its
        ``book_ids`` argument.
    :return: A set of matching book ids. The set is empty when ``mi`` has
        no authors, when the author query cannot be built, or when the
        search itself fails (the traceback is printed in that case).
    '''
    # Applied in order: strip punctuation, strip whatever
    # get_title_sort_pat() matches, turn separators into spaces and
    # finally collapse runs of whitespace.
    fuzzy_title_patterns = [
        (re.compile(pat, re.IGNORECASE) if isinstance(pat, basestring) else pat, repl)
        for pat, repl in [
            (r'[\[\](){}<>\'";,:#]', ''),
            (get_title_sort_pat(), ''),
            (r'[-._]', ' '),
            (r'\s+', ' ')
        ]
    ]

    def fuzzy_title(title):
        # Normalize a title so that cosmetic differences do not prevent a match
        title = icu_lower(title.strip())
        for pat, repl in fuzzy_title_patterns:
            title = pat.sub(repl, title)
        return title

    identical_book_ids = set()
    if not mi.authors:
        return identical_book_ids

    try:
        # Too many authors causes parsing of the search expression to
        # fail, so only the first 20 go into the query; the rest are
        # checked by hand below.
        searched_authors = mi.authors[:20]
        query = ' and '.join(
            'authors:"=%s"' % (a.replace('"', '')) for a in searched_authors)
        remaining_authors = mi.authors[20:]
    except ValueError:
        # NOTE(review): presumably guards against unicode errors (which are
        # ValueError subclasses) while building the query -- confirm.
        return identical_book_ids

    try:
        book_ids = self._search(query, restriction=search_restriction, book_ids=book_ids)
    except Exception:
        # Best effort: report the failure and treat it as "no matches",
        # without swallowing KeyboardInterrupt/SystemExit.
        traceback.print_exc()
        return identical_book_ids

    if not book_ids:
        return identical_book_ids

    if remaining_authors:
        # Enforce the authors that did not fit into the search query
        remaining_authors = {icu_lower(x) for x in remaining_authors}
        matches = set()
        for book_id in book_ids:
            aut = self._field_for('authors', book_id)
            if aut and {icu_lower(x) for x in aut}.issuperset(remaining_authors):
                matches.add(book_id)
        book_ids = matches

    if book_ids:
        # The title being looked for does not change per candidate, so
        # normalize it once, and only when there is something to compare.
        wanted_title = fuzzy_title(mi.title)
        for book_id in book_ids:
            if fuzzy_title(self._field_for('title', book_id)) == wanted_title:
                identical_book_ids.add(book_id)
    return identical_book_ids
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
class SortKey(object): # {{{
|
class SortKey(object): # {{{
|
||||||
|
@ -746,6 +746,7 @@ LibraryDatabase.isbn = MT(
|
|||||||
LibraryDatabase.get_books_for_category = MT(
|
LibraryDatabase.get_books_for_category = MT(
|
||||||
lambda self, category, id_:self.new_api.get_books_for_category(category, id_))
|
lambda self, category, id_:self.new_api.get_books_for_category(category, id_))
|
||||||
LibraryDatabase.get_data_as_dict = MT(get_data_as_dict)
|
LibraryDatabase.get_data_as_dict = MT(get_data_as_dict)
|
||||||
|
LibraryDatabase.find_identical_books = MT(lambda self, mi:self.new_api.find_identical_books(mi))
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# Legacy setter API {{{
|
# Legacy setter API {{{
|
||||||
|
@ -661,7 +661,7 @@ class Search(object):
|
|||||||
def change_locations(self, newlocs):
|
def change_locations(self, newlocs):
|
||||||
self.all_search_locations = newlocs
|
self.all_search_locations = newlocs
|
||||||
|
|
||||||
def __call__(self, dbcache, query, search_restriction, virtual_fields=None):
|
def __call__(self, dbcache, query, search_restriction, virtual_fields=None, book_ids=None):
|
||||||
'''
|
'''
|
||||||
Return the set of ids of all records that match the specified
|
Return the set of ids of all records that match the specified
|
||||||
query and restriction
|
query and restriction
|
||||||
@ -674,17 +674,15 @@ class Search(object):
|
|||||||
if search_restriction:
|
if search_restriction:
|
||||||
q = u'(%s) and (%s)' % (search_restriction, query)
|
q = u'(%s) and (%s)' % (search_restriction, query)
|
||||||
|
|
||||||
all_book_ids = dbcache._all_book_ids(type=set)
|
all_book_ids = dbcache._all_book_ids(type=set) if book_ids is None else set(book_ids)
|
||||||
if not q:
|
if not q:
|
||||||
return all_book_ids
|
return all_book_ids
|
||||||
|
|
||||||
if not isinstance(q, type(u'')):
|
if not isinstance(q, type(u'')):
|
||||||
q = q.decode('utf-8')
|
q = q.decode('utf-8')
|
||||||
|
|
||||||
# We construct a new parser instance per search as pyparsing is not
|
# We construct a new parser instance per search as the parser is not
|
||||||
# thread safe. On my desktop, constructing a SearchQueryParser instance
|
# thread safe.
|
||||||
# takes 0.000975 seconds and restoring it from a pickle takes
|
|
||||||
# 0.000974 seconds.
|
|
||||||
sqp = Parser(
|
sqp = Parser(
|
||||||
dbcache, all_book_ids, dbcache._pref('grouped_search_terms'),
|
dbcache, all_book_ids, dbcache._pref('grouped_search_terms'),
|
||||||
self.date_search, self.num_search, self.bool_search,
|
self.date_search, self.num_search, self.bool_search,
|
||||||
|
@ -166,6 +166,7 @@ class LegacyTest(BaseTest):
|
|||||||
self.assertEqual(dict(db.prefs), dict(ndb.prefs))
|
self.assertEqual(dict(db.prefs), dict(ndb.prefs))
|
||||||
|
|
||||||
for meth, args in {
|
for meth, args in {
|
||||||
|
'find_identical_books': [(Metadata('title one', ['author one']),), (Metadata('unknown'),), (Metadata('xxxx'),)],
|
||||||
'get_books_for_category': [('tags', newstag), ('#formats', 'FMT1')],
|
'get_books_for_category': [('tags', newstag), ('#formats', 'FMT1')],
|
||||||
'get_next_series_num_for': [('A Series One',)],
|
'get_next_series_num_for': [('A Series One',)],
|
||||||
'get_id_from_uuid':[('ddddd',), (db.uuid(1, True),)],
|
'get_id_from_uuid':[('ddddd',), (db.uuid(1, True),)],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user