diff --git a/src/calibre/db/tables.py b/src/calibre/db/tables.py
index d8b623b352..219f12a613 100644
--- a/src/calibre/db/tables.py
+++ b/src/calibre/db/tables.py
@@ -67,6 +67,13 @@ class Table(object):
     def fix_link_table(self, db):
         pass
 
+    def fix_case_duplicates(self, db):
+        ''' If this table contains entries that differ only by case, then merge
+        those entries. This can happen in databases created with old versions
+        of calibre and non-ascii values, since sqlite's NOCASE only works with
+        ascii text. '''
+        pass
+
 class VirtualTable(Table):
 
     '''
@@ -215,6 +222,26 @@ class ManyToOneTable(Table):
             db.conn.executemany('DELETE FROM {0} WHERE {1}=?'.format(
                 self.link_table, self.metadata['link_column']), tuple((x,) for x in extra_item_ids))
 
+    def fix_case_duplicates(self, db):
+        case_map = defaultdict(set)
+        for item_id, val in self.id_map.iteritems():
+            case_map[icu_lower(val)].add(item_id)
+
+        for v in case_map.itervalues():
+            if len(v) > 1:
+                main_id = min(v)
+                v.discard(main_id)
+                for item_id in v:
+                    self.id_map.pop(item_id, None)
+                    books = self.col_book_map.pop(item_id, set())
+                    for book_id in books:
+                        self.book_col_map[book_id] = main_id
+                db.conn.executemany('UPDATE {0} SET {1}=? WHERE {1}=?'.format(
+                    self.link_table, self.metadata['link_column']),
+                    tuple((main_id, x) for x in v))
+                db.conn.executemany('DELETE FROM {0} WHERE id=?'.format(self.metadata['table']),
+                    tuple((x,) for x in v))
+
     def remove_books(self, book_ids, db):
         clean = set()
         for book_id in book_ids:
@@ -386,6 +413,43 @@ class ManyToManyTable(ManyToOneTable):
                     self.link_table, lcol, table), (existing_item, item_id, item_id))
         return affected_books, new_id
 
+    def fix_case_duplicates(self, db):
+        from calibre.db.write import uniq
+        case_map = defaultdict(set)
+        for item_id, val in self.id_map.iteritems():
+            case_map[icu_lower(val)].add(item_id)
+
+        for v in case_map.itervalues():
+            if len(v) > 1:
+                done_books = set()
+                main_id = min(v)
+                v.discard(main_id)
+                for item_id in v:
+                    self.id_map.pop(item_id, None)
+                    books = self.col_book_map.pop(item_id, set())
+                    for book_id in books:
+                        if book_id in done_books:
+                            continue
+                        done_books.add(book_id)
+                        orig = self.book_col_map.get(book_id, ())
+                        if not orig:
+                            continue
+                        vals = uniq(tuple(main_id if x in v else x for x in orig))
+                        self.book_col_map[book_id] = vals
+                        if len(orig) == len(vals):
+                            # We have a simple replacement
+                            db.conn.executemany(
+                                'UPDATE {0} SET {1}=? WHERE {1}=? AND book=?'.format(
+                                    self.link_table, self.metadata['link_column']),
+                                tuple((main_id, x, book_id) for x in v))
+                        else:
+                            # duplicates
+                            db.conn.execute('DELETE FROM {0} WHERE book=?'.format(self.link_table), (book_id,))
+                            db.conn.executemany(
+                                'INSERT INTO {0} (book,{1}) VALUES (?,?)'.format(self.link_table, self.metadata['link_column']),
+                                tuple((book_id, x) for x in vals))
+                db.conn.executemany('DELETE FROM {0} WHERE id=?'.format(self.metadata['table']),
+                    tuple((x,) for x in v))
 
 
 class AuthorsTable(ManyToManyTable):
@@ -445,6 +509,9 @@ class FormatsTable(ManyToManyTable):
     def read_id_maps(self, db):
         pass
 
+    def fix_case_duplicates(self, db):
+        pass
+
     def read_maps(self, db):
         self.fname_map = fnm = defaultdict(dict)
         self.size_map = sm = defaultdict(dict)
@@ -524,6 +591,9 @@ class IdentifiersTable(ManyToManyTable):
     def read_id_maps(self, db):
         pass
 
+    def fix_case_duplicates(self, db):
+        pass
+
     def read_maps(self, db):
         self.book_col_map = defaultdict(dict)
         self.col_book_map = defaultdict(set)
diff --git a/src/calibre/db/tests/writing.py b/src/calibre/db/tests/writing.py
index a42d47e67f..572453dfad 100644
--- a/src/calibre/db/tests/writing.py
+++ b/src/calibre/db/tests/writing.py
@@ -632,3 +632,63 @@ class WritingTest(BaseTest):
         self.assertEqual({3}, cache.set_sort_for_authors(sdata),
                          'Setting the author sort to the same value as before, incorrectly marked some books as dirty')
     # }}}
+
+    def test_fix_case_duplicates(self):  # {{{
+        ' Test fixing of databases that have items in is_many fields that differ only by case '
+        ae = self.assertEqual
+        cache = self.init_cache()
+        conn = cache.backend.conn
+        conn.execute('INSERT INTO publishers (name) VALUES ("mūs")')
+        lid = conn.last_insert_rowid()
+        conn.execute('INSERT INTO publishers (name) VALUES ("MŪS")')
+        uid = conn.last_insert_rowid()
+        conn.execute('DELETE FROM books_publishers_link')
+        conn.execute('INSERT INTO books_publishers_link (book,publisher) VALUES (1, %d)' % lid)
+        conn.execute('INSERT INTO books_publishers_link (book,publisher) VALUES (2, %d)' % uid)
+        conn.execute('INSERT INTO books_publishers_link (book,publisher) VALUES (3, %d)' % uid)
+        cache.reload_from_db()
+        t = cache.fields['publisher'].table
+        for x in (lid, uid):
+            self.assertIn(x, t.id_map)
+            self.assertIn(x, t.col_book_map)
+        ae(t.book_col_map[1], lid)
+        ae(t.book_col_map[2], uid)
+        t.fix_case_duplicates(cache.backend)
+        for c in (cache, self.init_cache()):
+            t = c.fields['publisher'].table
+            self.assertNotIn(uid, t.id_map)
+            self.assertNotIn(uid, t.col_book_map)
+            for bid in (1, 2, 3):
+                ae(c.field_for('publisher', bid), "mūs")
+            c.close()
+
+        cache = self.init_cache()
+        conn = cache.backend.conn
+        conn.execute('INSERT INTO tags (name) VALUES ("mūūs")')
+        lid = conn.last_insert_rowid()
+        conn.execute('INSERT INTO tags (name) VALUES ("MŪŪS")')
+        uid = conn.last_insert_rowid()
+        conn.execute('INSERT INTO tags (name) VALUES ("mūŪS")')
+        mid = conn.last_insert_rowid()
+        conn.execute('INSERT INTO tags (name) VALUES ("t")')
+        norm = conn.last_insert_rowid()
+        conn.execute('DELETE FROM books_tags_link')
+        for book_id, vals in {1:(lid, uid), 2:(uid, mid), 3:(lid, norm)}.iteritems():
+            conn.executemany('INSERT INTO books_tags_link (book,tag) VALUES (?,?)',
+                             tuple((book_id, x) for x in vals))
+        cache.reload_from_db()
+        t = cache.fields['tags'].table
+        for x in (lid, uid, mid):
+            self.assertIn(x, t.id_map)
+            self.assertIn(x, t.col_book_map)
+        t.fix_case_duplicates(cache.backend)
+        for c in (cache, self.init_cache()):
+            t = c.fields['tags'].table
+            for x in (uid, mid):
+                self.assertNotIn(x, t.id_map)
+                self.assertNotIn(x, t.col_book_map)
+            ae(c.field_for('tags', 1), (t.id_map[lid],))
+            ae(c.field_for('tags', 2), (t.id_map[lid],), 'failed for book 2')
+            ae(c.field_for('tags', 3), (t.id_map[lid], t.id_map[norm]))
+    # }}}
+
diff --git a/src/calibre/db/write.py b/src/calibre/db/write.py
index ee81659984..6025bbad9b 100644
--- a/src/calibre/db/write.py
+++ b/src/calibre/db/write.py
@@ -278,6 +278,10 @@ def many_one(book_id_val_map, db, field, allow_case_change, *args):
     # Map values to db ids, including any new values
     kmap = safe_lower if dt in {'text', 'series'} else lambda x:x
     rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
+    if len(rid_map) != len(table.id_map):
+        # table has some entries that differ only in case, fix it
+        table.fix_case_duplicates(db)
+        rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
     val_map = {None:None}
     case_changes = {}
     for val in book_id_val_map.itervalues():
@@ -339,7 +343,7 @@ def many_one(book_id_val_map, db, field, allow_case_change, *args):
 
 # Many-Many fields {{{
 
-def uniq(vals, kmap):
+def uniq(vals, kmap=lambda x:x):
     ''' Remove all duplicates from vals, while preserving order. kmap must be a
     callable that returns a hashable value for every item in vals '''
     vals = vals or ()
@@ -358,6 +362,10 @@ def many_many(book_id_val_map, db, field, allow_case_change, *args):
     # Map values to db ids, including any new values
     kmap = safe_lower if dt == 'text' else lambda x:x
     rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
+    if len(rid_map) != len(table.id_map):
+        # table has some entries that differ only in case, fix it
+        table.fix_case_duplicates(db)
+        rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
     val_map = {}
     case_changes = {}
     book_id_val_map = {k:uniq(vals, kmap) for k, vals in book_id_val_map.iteritems()}
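
For context on the docstring's claim above: SQLite's built-in NOCASE collation folds only the 26 ASCII letters, so case-variant non-ASCII values remain distinct even in a NOCASE column. A minimal standalone sketch with the stdlib sqlite3 module (the table and column here are illustrative, not calibre's actual schema or its connection wrapper):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE publishers (id INTEGER PRIMARY KEY, name TEXT COLLATE NOCASE)')
conn.executemany('INSERT INTO publishers (name) VALUES (?)',
                 [('mus',), ('MUS',), ('mūs',), ('MŪS',)])

# ASCII case variants compare equal under NOCASE...
print(conn.execute("SELECT COUNT(*) FROM publishers WHERE name = 'mus'").fetchone()[0])   # 2
# ...but U+016A/U+016B are left unfolded, so the non-ASCII pair stays distinct.
# This is exactly the kind of duplication fix_case_duplicates() has to repair.
print(conn.execute("SELECT COUNT(*) FROM publishers WHERE name = 'mūs'").fetchone()[0])   # 1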
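The many-to-one fix boils down to: bucket item ids by case-folded value, keep the smallest id in each bucket as canonical, drop the rest, and repoint affected books. A self-contained sketch of that idea, using str.casefold as a stand-in for icu_lower and plain dicts in place of the table's id_map/book_col_map (merge_case_duplicates is a hypothetical name, not calibre API):

from collections import defaultdict

def merge_case_duplicates(id_map, book_col_map):
    # Bucket item ids by case-folded value; buckets with more than one id
    # hold the case duplicates.
    case_map = defaultdict(set)
    for item_id, val in id_map.items():
        case_map[val.casefold()].add(item_id)
    for ids in case_map.values():
        if len(ids) > 1:
            main_id = min(ids)  # smallest id becomes canonical, as in the patch
            ids.discard(main_id)
            for item_id in ids:
                del id_map[item_id]
            # Repoint every book that referenced a dropped id.
            for book_id, item_id in book_col_map.items():
                if item_id in ids:
                    book_col_map[book_id] = main_id

id_map = {1: 'mūs', 2: 'MŪS'}
book_col_map = {10: 1, 11: 2, 12: 2}
merge_case_duplicates(id_map, book_col_map)
print(id_map)        # {1: 'mūs'}
print(book_col_map)  # {10: 1, 11: 1, 12: 1}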
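The many-to-many variant has one extra wrinkle, which is why uniq grows an identity default: a book tagged with both "mūs" and "MŪS" ends up referencing the canonical id twice after substitution, so the remapped tuple must be deduplicated, and when its length shrinks the book's link rows are deleted and re-inserted rather than updated in place, since a plain UPDATE would produce duplicate (or constraint-violating) link rows. A standalone dedup equivalent in spirit to write.py's uniq, mirroring its documented contract (the real implementation lives in src/calibre/db/write.py):

def uniq(vals, kmap=lambda x: x):
    # Remove duplicates from vals while preserving order; kmap maps each
    # item to the hashable key used for comparison.
    seen = set()
    out = []
    for x in vals or ():
        k = kmap(x)
        if k not in seen:
            seen.add(k)
            out.append(x)
    return tuple(out)

# With the new identity default, fix_case_duplicates can collapse item ids directly:
print(uniq((7, 3, 7, 5)))  # (7, 3, 5)
# Existing callers still pass a key, e.g. case-insensitive value dedup:
print(uniq(('Tag', 'TAG', 'other'), kmap=str.lower))  # ('Tag', 'other')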