find_identical_books()

This commit is contained in:
Kovid Goyal 2013-07-19 11:37:46 +05:30
parent 669efdd6f6
commit 34704c9735
4 changed files with 63 additions and 11 deletions

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, traceback, random, shutil import os, traceback, random, shutil, re
from io import BytesIO from io import BytesIO
from collections import defaultdict from collections import defaultdict
from functools import wraps, partial from functools import wraps, partial
@ -25,7 +25,7 @@ from calibre.db.tables import VirtualTable
from calibre.db.write import get_series_values from calibre.db.write import get_series_values
from calibre.db.lazy import FormatMetadata, FormatsList from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.ebooks import check_ebook_format from calibre.ebooks import check_ebook_format
from calibre.ebooks.metadata import string_to_authors, author_to_author_sort from calibre.ebooks.metadata import string_to_authors, author_to_author_sort, get_title_sort_pat
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.ptempfile import (base_dir, PersistentTemporaryFile, from calibre.ptempfile import (base_dir, PersistentTemporaryFile,
@ -767,9 +767,8 @@ class Cache(object):
return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys)) return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
@read_api @read_api
def search(self, query, restriction, virtual_fields=None): def search(self, query, restriction='', virtual_fields=None, book_ids=None):
return self._search_api(self, query, restriction, return self._search_api(self, query, restriction, virtual_fields=virtual_fields, book_ids=book_ids)
virtual_fields=virtual_fields)
@read_api @read_api
def get_categories(self, sort='name', book_ids=None, icon_map=None): def get_categories(self, sort='name', book_ids=None, icon_map=None):
@ -1452,6 +1451,59 @@ class Cache(object):
return f.get_books_for_val(item_id_or_composite_value, self._get_metadata, self._all_book_ids()) return f.get_books_for_val(item_id_or_composite_value, self._get_metadata, self._all_book_ids())
return self._books_for_field(f.name, item_id_or_composite_value) return self._books_for_field(f.name, item_id_or_composite_value)
@read_api
def find_identical_books(self, mi, search_restriction='', book_ids=None):
    ''' Finds books that have a superset of the authors in mi and the same
    title (title is fuzzy matched). Returns a set of matching book ids.

    :param mi: a Metadata object whose title/authors are matched against
    :param search_restriction: optional search expression used to restrict
        the set of candidate books
    :param book_ids: optional iterable of book ids to search within; None
        means consider every book in the library
    '''
    # Strings are compiled into case-insensitive regexes here; anything
    # else (e.g. the already-compiled pattern from get_title_sort_pat())
    # is used as-is.
    fuzzy_title_patterns = [(re.compile(pat, re.IGNORECASE) if
        isinstance(pat, basestring) else pat, repl) for pat, repl in
            [
                # Punctuation/brackets that commonly differ between editions
                (r'[\[\](){}<>\'";,:#]', ''),
                # Leading articles (A/An/The, per the title sort pattern)
                (get_title_sort_pat(), ''),
                (r'[-._]', ' '),
                # Collapse runs of whitespace to a single space
                (r'\s+', ' ')
            ]
    ]

    def fuzzy_title(title):
        # Normalize a title for comparison: lowercase, then apply the
        # cleanup patterns above in order
        title = icu_lower(title.strip())
        for pat, repl in fuzzy_title_patterns:
            title = pat.sub(repl, title)
        return title

    identical_book_ids = set()
    if mi.authors:
        try:
            quathors = mi.authors[:20]  # Too many authors causes parsing of
                                        # the search expression to fail
            query = ' and '.join('authors:"=%s"'%(a.replace('"', '')) for a in quathors)
            qauthors = mi.authors[20:]
        except ValueError:
            return identical_book_ids
        try:
            book_ids = self._search(query, restriction=search_restriction, book_ids=book_ids)
        except Exception:
            # Odd characters in author names can make the search expression
            # unparseable; treat that as "no matches" rather than crashing.
            # (Was a bare except:, which also swallowed KeyboardInterrupt.)
            traceback.print_exc()
            return identical_book_ids
        if qauthors and book_ids:
            # The search only covered the first 20 authors; keep only the
            # candidates whose author set is a superset of the rest too
            matches = set()
            qauthors = {icu_lower(x) for x in qauthors}
            for book_id in book_ids:
                aut = self._field_for('authors', book_id)
                if aut:
                    aut = {icu_lower(x) for x in aut}
                    if aut.issuperset(qauthors):
                        matches.add(book_id)
            book_ids = matches
    elif book_ids is None:
        # Bug fix: with no authors and no explicit candidate set, the
        # original iterated over None (TypeError); fall back to all books
        book_ids = self._all_book_ids()

    # Hoisted out of the loop below: mi.title is loop-invariant
    mbook_title = fuzzy_title(mi.title)
    for book_id in book_ids:
        fbook_title = self._field_for('title', book_id)
        if fuzzy_title(fbook_title) == mbook_title:
            identical_book_ids.add(book_id)
    return identical_book_ids
# }}} # }}}
class SortKey(object): # {{{ class SortKey(object): # {{{

View File

@ -746,6 +746,7 @@ LibraryDatabase.isbn = MT(
LibraryDatabase.get_books_for_category = MT( LibraryDatabase.get_books_for_category = MT(
lambda self, category, id_:self.new_api.get_books_for_category(category, id_)) lambda self, category, id_:self.new_api.get_books_for_category(category, id_))
LibraryDatabase.get_data_as_dict = MT(get_data_as_dict) LibraryDatabase.get_data_as_dict = MT(get_data_as_dict)
LibraryDatabase.find_identical_books = MT(lambda self, mi:self.new_api.find_identical_books(mi))
# }}} # }}}
# Legacy setter API {{{ # Legacy setter API {{{

View File

@ -661,7 +661,7 @@ class Search(object):
def change_locations(self, newlocs): def change_locations(self, newlocs):
self.all_search_locations = newlocs self.all_search_locations = newlocs
def __call__(self, dbcache, query, search_restriction, virtual_fields=None): def __call__(self, dbcache, query, search_restriction, virtual_fields=None, book_ids=None):
''' '''
Return the set of ids of all records that match the specified Return the set of ids of all records that match the specified
query and restriction query and restriction
@ -674,17 +674,15 @@ class Search(object):
if search_restriction: if search_restriction:
q = u'(%s) and (%s)' % (search_restriction, query) q = u'(%s) and (%s)' % (search_restriction, query)
all_book_ids = dbcache._all_book_ids(type=set) all_book_ids = dbcache._all_book_ids(type=set) if book_ids is None else set(book_ids)
if not q: if not q:
return all_book_ids return all_book_ids
if not isinstance(q, type(u'')): if not isinstance(q, type(u'')):
q = q.decode('utf-8') q = q.decode('utf-8')
# We construct a new parser instance per search as pyparsing is not # We construct a new parser instance per search as the parse is not
# thread safe. On my desktop, constructing a SearchQueryParser instance # thread safe.
# takes 0.000975 seconds and restoring it from a pickle takes
# 0.000974 seconds.
sqp = Parser( sqp = Parser(
dbcache, all_book_ids, dbcache._pref('grouped_search_terms'), dbcache, all_book_ids, dbcache._pref('grouped_search_terms'),
self.date_search, self.num_search, self.bool_search, self.date_search, self.num_search, self.bool_search,

View File

@ -166,6 +166,7 @@ class LegacyTest(BaseTest):
self.assertEqual(dict(db.prefs), dict(ndb.prefs)) self.assertEqual(dict(db.prefs), dict(ndb.prefs))
for meth, args in { for meth, args in {
'find_identical_books': [(Metadata('title one', ['author one']),), (Metadata('unknown'),), (Metadata('xxxx'),)],
'get_books_for_category': [('tags', newstag), ('#formats', 'FMT1')], 'get_books_for_category': [('tags', newstag), ('#formats', 'FMT1')],
'get_next_series_num_for': [('A Series One',)], 'get_next_series_num_for': [('A Series One',)],
'get_id_from_uuid':[('ddddd',), (db.uuid(1, True),)], 'get_id_from_uuid':[('ddddd',), (db.uuid(1, True),)],