calibredb add: New option --automerge to automatically merge duplicates

Have the option accept different merge algorithms. Also implement it for
recursive adding.
This commit is contained in:
Kovid Goyal 2020-11-23 14:07:55 +05:30
parent fd97af9b1d
commit 54dbba6f06
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 111 additions and 42 deletions

View File

@ -9,19 +9,18 @@ from contextlib import contextmanager
from optparse import OptionGroup, OptionValueError from optparse import OptionGroup, OptionValueError
from calibre import prints from calibre import prints
from calibre.db.utils import find_identical_books
from calibre.db.copy_to_library import automerge_book
from calibre.db.adding import ( from calibre.db.adding import (
cdb_find_in_dir, cdb_recursive_find, compile_rule, create_format_map, cdb_find_in_dir, cdb_recursive_find, compile_rule, create_format_map,
run_import_plugins, run_import_plugins_before_metadata run_import_plugins, run_import_plugins_before_metadata
) )
from calibre.db.utils import find_identical_books
from calibre.ebooks.metadata import MetaInformation, string_to_authors from calibre.ebooks.metadata import MetaInformation, string_to_authors
from calibre.ebooks.metadata.book.serialize import read_cover, serialize_cover from calibre.ebooks.metadata.book.serialize import read_cover, serialize_cover
from calibre.ebooks.metadata.meta import get_metadata, metadata_from_formats from calibre.ebooks.metadata.meta import get_metadata, metadata_from_formats
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.srv.changes import books_added from calibre.srv.changes import books_added, formats_added
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
from calibre.utils.config import tweaks from calibre.utils.short_uuid import uuid4
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
readonly = False readonly = False
@ -37,8 +36,83 @@ def empty(db, notify_changes, is_remote, args):
return ids, bool(duplicates) return ids, bool(duplicates)
def cached_identical_book_data(db, request_id):
    """Return the data used for identical-book lookups, cached per request.

    The result of ``db.data_for_find_identical_books()`` is memoised on this
    function object under the key ``(db.library_id, request_id)``. Only the
    most recent key is retained, so a call with a different library or
    request id reloads the data.

    :param db: object providing ``library_id`` and
        ``data_for_find_identical_books()``.
    :param request_id: identifier shared by all calls that belong to the same
        add request.
    :return: whatever ``db.data_for_find_identical_books()`` returned for the
        current key.
    """
    key = db.library_id, request_id
    if getattr(cached_identical_book_data, 'key', None) != key:
        # Load first and publish the key last: if loading raises, the cache
        # must not be left claiming that stale data belongs to the new key.
        ans = db.data_for_find_identical_books()
        cached_identical_book_data.ans = ans
        cached_identical_book_data.key = key
    return cached_identical_book_data.ans
def do_adding(db, request_id, notify_changes, is_remote, mi, format_map, add_duplicates, oautomerge):
    """Add a single book to ``db``, applying the requested automerge mode.

    Matching existing books are looked up only when automerge is enabled or
    when duplicates are not allowed. The outcome then depends on the mode:

    * no matching book: a new record is created;
    * match, automerge ``'disabled'``: nothing is written and the book is
      reported as a duplicate;
    * match, automerge enabled: formats the existing record lacks are added
      to it. Formats it already has are replaced (``'overwrite'``),
      reported as duplicates (``'ignore'``) or cause one additional new
      record to be created (``'new_record'``).

    :param db: database object; ``add_books``, ``add_format``, ``formats``,
        ``update_data_for_find_identical_books`` and ``dump_metadata`` are
        called on it.
    :param request_id: passed to ``cached_identical_book_data`` so that the
        lookup data is shared between calls of one request.
    :param notify_changes: callable receiving change notifications; only
        invoked when ``is_remote`` is true.
    :param is_remote: whether change notifications should be sent.
    :param mi: metadata of the incoming book.
    :param format_map: mapping of format name to the value handed to
        ``db.add_format`` / ``db.add_books`` for that format.
    :param add_duplicates: when false (and automerge is disabled) a book that
        matches an existing one is not added.
    :param oautomerge: one of ``'disabled'``, ``'ignore'``, ``'overwrite'``,
        ``'new_record'``.
    :return: ``(added_ids, updated_ids, duplicates)`` - two sets of book ids
        and a list of ``(mi, format_map)`` style tuples.
    """
    matching_ids, added_ids, updated_ids = set(), set(), set()
    duplicates = []
    lookup_data = None
    automerging = oautomerge != 'disabled'

    def merge_format(target_id, fmt):
        # Store the incoming file on an existing record, replacing any file
        # of the same format, and remember that the record was touched.
        db.add_format(target_id, fmt, format_map[fmt], replace=True, run_hooks=False)
        updated_ids.add(target_id)

    def create_record():
        # Duplicate detection has already been done by this function, so the
        # database is always told to accept the book.
        new_ids, new_dups = db.add_books(
            [(mi, format_map)], add_duplicates=True, run_hooks=False)
        added_ids.update(new_ids)
        duplicates.extend(new_dups)

    if automerging or not add_duplicates:
        lookup_data = cached_identical_book_data(db, request_id)
        matching_ids = find_identical_books(mi, lookup_data)

    if not matching_ids:
        create_record()
    elif not automerging:
        duplicates.append((mi, format_map))
    else:
        create_copy = False
        skipped = set()
        # Upper-cased format name -> name as spelled in format_map.
        by_upper = {name.upper(): name for name in format_map}
        for target_id in matching_ids:
            present = {name.upper() for name in db.formats(target_id)}
            overlap = present & set(by_upper)
            if not overlap:
                # Nothing clashes: every incoming format goes onto the record.
                for key in by_upper:
                    merge_format(target_id, by_upper[key])
                continue
            for key in set(by_upper) - present:
                merge_format(target_id, by_upper[key])
            if oautomerge == 'overwrite':
                for key in overlap:
                    merge_format(target_id, by_upper[key])
            elif oautomerge == 'ignore':
                for key in overlap:
                    skipped.add(by_upper[key])
            elif oautomerge == 'new_record':
                create_copy = True
        if create_copy:
            create_record()
        if skipped:
            duplicates.append((mi, {name: format_map[name] for name in skipped}))

    # Keep the cached lookup data in step with the records just created so
    # later books in the same request are matched against them too.
    if lookup_data is not None:
        for new_id in added_ids:
            db.update_data_for_find_identical_books(new_id, lookup_data)

    if is_remote:
        notify_changes(books_added(added_ids))
        if updated_ids:
            notify_changes(formats_added({touched_id: tuple(format_map) for touched_id in updated_ids}))
    db.dump_metadata()
    return added_ids, updated_ids, duplicates
def book(db, notify_changes, is_remote, args): def book(db, notify_changes, is_remote, args):
data, fname, fmt, add_duplicates, otitle, oauthors, oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages, oautomerge = args data, fname, fmt, add_duplicates, otitle, oauthors, oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages, oautomerge, request_id = args
with add_ctx(), TemporaryDirectory('add-single') as tdir, run_import_plugins_before_metadata(tdir): with add_ctx(), TemporaryDirectory('add-single') as tdir, run_import_plugins_before_metadata(tdir):
if is_remote: if is_remote:
with lopen(os.path.join(tdir, fname), 'wb') as f: with lopen(os.path.join(tdir, fname), 'wb') as f:
@ -69,30 +143,19 @@ def book(db, notify_changes, is_remote, args):
mi.cover = None mi.cover = None
mi.cover_data = ocover mi.cover_data = ocover
identical_book_list,added_ids,updated_ids=set(),set(),set() identical_book_list, added_ids, updated_ids = set(), set(), set()
if oautomerge: duplicates = []
identical_books_data = identical_books_data = db.data_for_find_identical_books() identical_books_data = None
identical_book_list = find_identical_books(mi, identical_books_data) added_ids, updated_ids, duplicates = do_adding(
add_duplicates=True db, request_id, notify_changes, is_remote, mi, {fmt: path}, add_duplicates, oautomerge)
if len(identical_book_list) > 0:
for book_id in identical_book_list:
db.add_format(book_id, fmt, path, replace='overwrite', run_hooks=False)
updated_ids=identical_book_list
duplicates=False
else:
added_ids, duplicates = db.add_books(
[(mi, {fmt: path})], add_duplicates=add_duplicates, run_hooks=False)
if is_remote: return added_ids, updated_ids, bool(duplicates), mi.title
notify_changes(books_added(added_ids))
notify_changes(books_added(updated_ids))
db.dump_metadata()
return added_ids,updated_ids, bool(duplicates), mi.title
def format_group(db, notify_changes, is_remote, args): def format_group(db, notify_changes, is_remote, args):
formats, add_duplicates, cover_data = args formats, add_duplicates, oautomerge, request_id, cover_data = args
with add_ctx(), TemporaryDirectory('add-multiple') as tdir, run_import_plugins_before_metadata(tdir): with add_ctx(), TemporaryDirectory('add-multiple') as tdir, run_import_plugins_before_metadata(tdir):
updated_ids = {}
if is_remote: if is_remote:
paths = [] paths = []
for name, data in formats: for name, data in formats:
@ -104,14 +167,13 @@ def format_group(db, notify_changes, is_remote, args):
paths = run_import_plugins(paths) paths = run_import_plugins(paths)
mi = metadata_from_formats(paths) mi = metadata_from_formats(paths)
if mi.title is None: if mi.title is None:
return None, set(), False return None, set(), set(), False
if cover_data and not mi.cover_data or not mi.cover_data[1]: if cover_data and not mi.cover_data or not mi.cover_data[1]:
mi.cover_data = 'jpeg', cover_data mi.cover_data = 'jpeg', cover_data
ids, dups = db.add_books([(mi, create_format_map(paths))], add_duplicates=add_duplicates, run_hooks=False) format_map = create_format_map(paths)
if is_remote: added_ids, updated_ids, duplicates = do_adding(
notify_changes(books_added(ids)) db, request_id, notify_changes, is_remote, mi, format_map, add_duplicates, oautomerge)
db.dump_metadata() return mi.title, set(added_ids), set(updated_ids), bool(duplicates)
return mi.title, ids, bool(dups)
def implementation(db, notify_changes, action, *args): def implementation(db, notify_changes, action, *args):
@ -157,6 +219,7 @@ def do_add(
oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages, oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages,
compiled_rules, oautomerge compiled_rules, oautomerge
): ):
request_id = uuid4()
with add_ctx(): with add_ctx():
files, dirs = [], [] files, dirs = [], []
for path in paths: for path in paths:
@ -178,7 +241,7 @@ def do_add(
aids, mids, dups, book_title = dbctx.run( aids, mids, dups, book_title = dbctx.run(
'add', 'book', dbctx.path(book), os.path.basename(book), fmt, add_duplicates, 'add', 'book', dbctx.path(book), os.path.basename(book), fmt, add_duplicates,
otitle, oauthors, oisbn, otags, oseries, oseries_index, serialize_cover(ocover) if ocover else None, otitle, oauthors, oisbn, otags, oseries, oseries_index, serialize_cover(ocover) if ocover else None,
oidentifiers, olanguages, oautomerge oidentifiers, olanguages, oautomerge, request_id
) )
added_ids |= set(aids) added_ids |= set(aids)
merged_ids |= set(mids) merged_ids |= set(mids)
@ -204,10 +267,11 @@ def do_add(
except EnvironmentError: except EnvironmentError:
pass pass
book_title, ids, dups = dbctx.run( book_title, ids, mids, dups = dbctx.run(
'add', 'format_group', tuple(map(dbctx.path, formats)), add_duplicates, cover_data) 'add', 'format_group', tuple(map(dbctx.path, formats)), add_duplicates, oautomerge, request_id, cover_data)
if book_title is not None: if book_title is not None:
added_ids |= set(ids) added_ids |= set(ids)
merged_ids |= set(mids)
if dups: if dups:
dir_dups.append((book_title, formats)) dir_dups.append((book_title, formats))
@ -234,7 +298,7 @@ def do_add(
if added_ids: if added_ids:
prints(_('Added book ids: %s') % (', '.join(map(unicode_type, added_ids)))) prints(_('Added book ids: %s') % (', '.join(map(unicode_type, added_ids))))
if merged_ids: if merged_ids:
prints(_('Updated book ids: %s') % (', '.join(map(unicode_type, merged_ids)))) prints(_('Merged book ids: %s') % (', '.join(map(unicode_type, merged_ids))))
def option_parser(get_parser, args): def option_parser(get_parser, args):
@ -254,16 +318,21 @@ the directory related options below.
action='store_true', action='store_true',
default=False, default=False,
help=_( help=_(
'Add books to database even if they already exist. Comparison is done based on book titles.' 'Add books to database even if they already exist. Comparison is done based on book titles and authors.'
) ' Note that the {} option takes precedence.'
).format('--automerge')
) )
parser.add_option( parser.add_option(
'-m', '-m',
'--automerge', '--automerge',
action='store_true', type='choice',
default=False, choices=('disabled', 'ignore', 'overwrite', 'new_record'),
default='disabled',
help=_( help=_(
'Add or upgrade existing book(s) to database. Comparison is done based on book titles, autor and language.\nSearch the library for the specified book and decide: \n * To update its format and language on the library if the book is newer than the existing one in the library.\n * To add to the library if the format and language does not exist.\n * To discard action if none of the above.' 'If books with similar titles and authors are found, merge the incoming formats (files) automatically into'
' existing book records. A value of "ignore" means duplicate formats are discarded. A value of'
' "overwrite" means duplicate formats in the library are overwritten with the newly added files.'
' A value of "new_record" means duplicate formats are placed into a new book record.'
) )
) )
parser.add_option( parser.add_option(

View File

@ -149,7 +149,7 @@
<item row="8" column="0"> <item row="8" column="0">
<widget class="QCheckBox" name="opt_add_formats_to_existing"> <widget class="QCheckBox" name="opt_add_formats_to_existing">
<property name="toolTip"> <property name="toolTip">
<string>Auto-merge: If books with similar titles and authors found, merge the incoming formats automatically into <string>Auto-merge: If books with similar titles and authors are found, merge the incoming formats automatically into
existing book records. The box to the right controls what happens when an existing record already has existing book records. The box to the right controls what happens when an existing record already has
the incoming format. Note that this option also affects the Copy to library action. the incoming format. Note that this option also affects the Copy to library action.
@ -163,7 +163,7 @@ Title match ignores leading indefinite articles (&quot;the&quot;, &quot;a&quot;,
<item row="8" column="1"> <item row="8" column="1">
<widget class="QComboBox" name="opt_automerge"> <widget class="QComboBox" name="opt_automerge">
<property name="toolTip"> <property name="toolTip">
<string>Auto-merge: If books with similar titles and authors found, merge the incoming formats (files) automatically into <string>Auto-merge: If books with similar titles and authors are found, merge the incoming formats (files) automatically into
existing book records. This box controls what happens when an existing record already has existing book records. This box controls what happens when an existing record already has
the incoming format: the incoming format: