From 54dbba6f0613020d6f413abc692b0f63f5d02841 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 23 Nov 2020 14:07:55 +0530 Subject: [PATCH] calibredb add: New option --automerge to automatically merge duplicates Have the option accept different merge algorithms. Also implement it for recursive adding. --- src/calibre/db/cli/cmd_add.py | 149 ++++++++++++++++++------- src/calibre/gui2/preferences/adding.ui | 4 +- 2 files changed, 111 insertions(+), 42 deletions(-) diff --git a/src/calibre/db/cli/cmd_add.py b/src/calibre/db/cli/cmd_add.py index aff63bae0f..33faf61bea 100644 --- a/src/calibre/db/cli/cmd_add.py +++ b/src/calibre/db/cli/cmd_add.py @@ -9,19 +9,18 @@ from contextlib import contextmanager from optparse import OptionGroup, OptionValueError from calibre import prints -from calibre.db.utils import find_identical_books -from calibre.db.copy_to_library import automerge_book from calibre.db.adding import ( cdb_find_in_dir, cdb_recursive_find, compile_rule, create_format_map, run_import_plugins, run_import_plugins_before_metadata ) +from calibre.db.utils import find_identical_books from calibre.ebooks.metadata import MetaInformation, string_to_authors from calibre.ebooks.metadata.book.serialize import read_cover, serialize_cover from calibre.ebooks.metadata.meta import get_metadata, metadata_from_formats from calibre.ptempfile import TemporaryDirectory -from calibre.srv.changes import books_added +from calibre.srv.changes import books_added, formats_added from calibre.utils.localization import canonicalize_lang -from calibre.utils.config import tweaks +from calibre.utils.short_uuid import uuid4 from polyglot.builtins import unicode_type readonly = False @@ -37,8 +36,83 @@ def empty(db, notify_changes, is_remote, args): return ids, bool(duplicates) +def cached_identical_book_data(db, request_id): + key = db.library_id, request_id + if getattr(cached_identical_book_data, 'key', None) != key: + cached_identical_book_data.key = key + cached_identical_book_data.ans = db.data_for_find_identical_books() + return cached_identical_book_data.ans + + +def do_adding(db, request_id, notify_changes, is_remote, mi, format_map, add_duplicates, oautomerge): + identical_book_list, added_ids, updated_ids = set(), set(), set() + duplicates = [] + identical_books_data = None + + def add_format(book_id, fmt): + db.add_format(book_id, fmt, format_map[fmt], replace=True, run_hooks=False) + updated_ids.add(book_id) + + def add_book(): + nonlocal added_ids + added_ids_, duplicates_ = db.add_books( + [(mi, format_map)], add_duplicates=True, run_hooks=False) + added_ids |= set(added_ids_) + duplicates.extend(duplicates_) + + if oautomerge != 'disabled' or not add_duplicates: + identical_books_data = cached_identical_book_data(db, request_id) + identical_book_list = find_identical_books(mi, identical_books_data) + + if oautomerge != 'disabled': + if identical_book_list: + needs_add = False + duplicated_formats = set() + for book_id in identical_book_list: + book_formats = {q.upper() for q in db.formats(book_id)} + input_formats = {q.upper():q for q in format_map} + common_formats = book_formats & set(input_formats) + if not common_formats: + for x in input_formats: + add_format(book_id, input_formats[x]) + else: + new_formats = set(input_formats) - book_formats + if new_formats: + for x in new_formats: + add_format(book_id, input_formats[x]) + if oautomerge == 'overwrite': + for x in common_formats: + add_format(book_id, input_formats[x]) + elif oautomerge == 'ignore': + for x in common_formats: + duplicated_formats.add(input_formats[x]) + elif oautomerge == 'new_record': + needs_add = True + if needs_add: + add_book() + if duplicated_formats: + duplicates.append((mi, {x: format_map[x] for x in duplicated_formats})) + else: + add_book() + else: + if identical_book_list: + duplicates.append((mi, format_map)) + else: + add_book() + if added_ids and identical_books_data is not None: + for book_id in added_ids: + db.update_data_for_find_identical_books(book_id, identical_books_data) + + if is_remote: + notify_changes(books_added(added_ids)) + if updated_ids: + notify_changes(formats_added({book_id: tuple(format_map) for book_id in updated_ids})) + db.dump_metadata() + return added_ids, updated_ids, duplicates + + def book(db, notify_changes, is_remote, args): - data, fname, fmt, add_duplicates, otitle, oauthors, oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages, oautomerge = args + data, fname, fmt, add_duplicates, otitle, oauthors, oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages, oautomerge, request_id = args with add_ctx(), TemporaryDirectory('add-single') as tdir, run_import_plugins_before_metadata(tdir): if is_remote: with lopen(os.path.join(tdir, fname), 'wb') as f: @@ -68,31 +142,20 @@ def book(db, notify_changes, is_remote, args): if ocover: mi.cover = None mi.cover_data = ocover - - identical_book_list,added_ids,updated_ids=set(),set(),set() - if oautomerge: - identical_books_data = identical_books_data = db.data_for_find_identical_books() - identical_book_list = find_identical_books(mi, identical_books_data) - add_duplicates=True - if len(identical_book_list) > 0: - for book_id in identical_book_list: - db.add_format(book_id, fmt, path, replace='overwrite', run_hooks=False) - updated_ids=identical_book_list - duplicates=False - else: - added_ids, duplicates = db.add_books( - [(mi, {fmt: path})], add_duplicates=add_duplicates, run_hooks=False) - if is_remote: - notify_changes(books_added(added_ids)) - notify_changes(books_added(updated_ids)) - db.dump_metadata() - return added_ids,updated_ids, bool(duplicates), mi.title + identical_book_list, added_ids, updated_ids = set(), set(), set() + duplicates = [] + identical_books_data = None + added_ids, updated_ids, duplicates = do_adding( + db, request_id, notify_changes, is_remote, mi, {fmt: path}, add_duplicates, oautomerge) + + return added_ids, updated_ids, bool(duplicates), mi.title def format_group(db, notify_changes, is_remote, args): - formats, add_duplicates, cover_data = args + formats, add_duplicates, oautomerge, request_id, cover_data = args with add_ctx(), TemporaryDirectory('add-multiple') as tdir, run_import_plugins_before_metadata(tdir): + updated_ids = {} if is_remote: paths = [] for name, data in formats: @@ -104,14 +167,13 @@ def format_group(db, notify_changes, is_remote, args): paths = run_import_plugins(paths) mi = metadata_from_formats(paths) if mi.title is None: - return None, set(), False + return None, set(), set(), False if cover_data and not mi.cover_data or not mi.cover_data[1]: mi.cover_data = 'jpeg', cover_data - ids, dups = db.add_books([(mi, create_format_map(paths))], add_duplicates=add_duplicates, run_hooks=False) - if is_remote: - notify_changes(books_added(ids)) - db.dump_metadata() - return mi.title, ids, bool(dups) + format_map = create_format_map(paths) + added_ids, updated_ids, duplicates = do_adding( + db, request_id, notify_changes, is_remote, mi, format_map, add_duplicates, oautomerge) + return mi.title, set(added_ids), set(updated_ids), bool(duplicates) def implementation(db, notify_changes, action, *args): @@ -157,6 +219,7 @@ def do_add( oisbn, otags, oseries, oseries_index, ocover, oidentifiers, olanguages, compiled_rules, oautomerge ): + request_id = uuid4() with add_ctx(): files, dirs = [], [] for path in paths: @@ -178,7 +241,7 @@ def do_add( aids, mids, dups, book_title = dbctx.run( 'add', 'book', dbctx.path(book), os.path.basename(book), fmt, add_duplicates, otitle, oauthors, oisbn, otags, oseries, oseries_index, serialize_cover(ocover) if ocover else None, - oidentifiers, olanguages, oautomerge + oidentifiers, olanguages, oautomerge, request_id ) added_ids |= set(aids) merged_ids |= set(mids) @@ -204,10 +267,11 @@ def do_add( except EnvironmentError: pass - book_title, ids, dups = dbctx.run( - 'add', 'format_group', tuple(map(dbctx.path, formats)), add_duplicates, cover_data) + book_title, ids, mids, dups = dbctx.run( + 'add', 'format_group', tuple(map(dbctx.path, formats)), add_duplicates, oautomerge, request_id, cover_data) if book_title is not None: added_ids |= set(ids) + merged_ids |= set(mids) if dups: dir_dups.append((book_title, formats)) @@ -234,7 +298,7 @@ def do_add( if added_ids: prints(_('Added book ids: %s') % (', '.join(map(unicode_type, added_ids)))) if merged_ids: - prints(_('Updated book ids: %s') % (', '.join(map(unicode_type, merged_ids)))) + prints(_('Merged book ids: %s') % (', '.join(map(unicode_type, merged_ids)))) def option_parser(get_parser, args): @@ -254,16 +318,21 @@ the directory related options below. action='store_true', default=False, help=_( - 'Add books to database even if they already exist. Comparison is done based on book titles.' - ) + 'Add books to database even if they already exist. Comparison is done based on book titles and authors.' + ' Note that the {} option takes precedence.' + ).format('--automerge') ) parser.add_option( '-m', '--automerge', - action='store_true', - default=False, + type='choice', + choices=('disabled', 'ignore', 'overwrite', 'new_record'), + default='disabled', help=_( - 'Add or upgrade existing book(s) to database. Comparison is done based on book titles, autor and language.\nSearch the library for the specified book and decide: \n * To update its format and language on the library if the book is newer than the existing one in the library.\n * To add to the library if the format and language does not exist.\n * To discard action if none of the above.' + 'If books with similar titles and authors are found, merge the incoming formats (files) automatically into' + ' existing book records. A value of "ignore" means duplicate formats are discarded. A value of' + ' "overwrite" means duplicate formats in the library are overwritten with the newly added files.' + ' A value of "new_record" means duplicate formats are placed into a new book record.' ) ) parser.add_option( diff --git a/src/calibre/gui2/preferences/adding.ui b/src/calibre/gui2/preferences/adding.ui index 00828f7b5b..8063ac32c5 100644 --- a/src/calibre/gui2/preferences/adding.ui +++ b/src/calibre/gui2/preferences/adding.ui @@ -149,7 +149,7 @@ - Auto-merge: If books with similar titles and authors found, merge the incoming formats automatically into + Auto-merge: If books with similar titles and authors are found, merge the incoming formats automatically into existing book records. The box to the right controls what happens when an existing record already has the incoming format. Note that this option also affects the Copy to library action. @@ -163,7 +163,7 @@ Title match ignores leading indefinite articles ("the", "a", - Auto-merge: If books with similar titles and authors found, merge the incoming formats (files) automatically into + Auto-merge: If books with similar titles and authors are found, merge the incoming formats (files) automatically into existing book records. This box controls what happens when an existing record already has the incoming format: