newdb: Handle databases with case duplicates

Databases created with pre-1.0 versions of calibre could have multiple
tags/series/publishers/etc. that differ only in case. This is
because sqlite's case-insensitive handling only works for ascii characters.
This could cause problems when setting metadata.
Automatically detect and fix these databases. Fixes #1221545 [edit metadata individually>Download Metadata>press ok>unhandled exception](https://bugs.launchpad.net/calibre/+bug/1221545)
This commit is contained in:
Kovid Goyal 2013-09-07 13:21:45 +05:30
parent 18e3f9464b
commit f0d61014ab
3 changed files with 139 additions and 1 deletions

View File

@ -67,6 +67,13 @@ class Table(object):
def fix_link_table(self, db):
    # No-op in the base class. Tables that have a separate link table
    # (e.g. ManyToOneTable) override this to repair stale link rows.
    pass
def fix_case_duplicates(self, db):
    ''' If this table contains entries that differ only by case, then merge
    those entries. This can happen in databases created with old versions
    of calibre and non-ascii values, since sqlite's NOCASE only works with
    ascii text. '''
    # No-op here; the merging logic lives in the ManyToOneTable and
    # ManyToManyTable overrides.
    pass
class VirtualTable(Table):
'''
@ -215,6 +222,26 @@ class ManyToOneTable(Table):
db.conn.executemany('DELETE FROM {0} WHERE {1}=?'.format(
self.link_table, self.metadata['link_column']), tuple((x,) for x in extra_item_ids))
def fix_case_duplicates(self, db):
    ''' Merge entries in this table that differ only by case. Such
    duplicates can exist in databases created by old versions of calibre
    with non-ascii values, since sqlite's NOCASE collation only folds case
    for ascii text. The entry with the lowest id in each case-insensitive
    group survives; all links are re-pointed at it and the losers are
    deleted from both the in-memory maps and the db. '''
    # Group item ids by their case-folded value
    case_map = defaultdict(set)
    for item_id, val in self.id_map.iteritems():
        case_map[icu_lower(val)].add(item_id)

    for v in case_map.itervalues():
        if len(v) > 1:
            # Keep the entry with the smallest id, merge the rest into it
            main_id = min(v)
            v.discard(main_id)
            for item_id in v:
                self.id_map.pop(item_id, None)
                books = self.col_book_map.pop(item_id, set())
                # BUG FIX: also record the merged books against the
                # surviving item, otherwise the in-memory col_book_map is
                # inconsistent with book_col_map (and the db) until the
                # next reload_from_db()
                if books:
                    self.col_book_map[main_id] = self.col_book_map.get(main_id, set()) | books
                for book_id in books:
                    self.book_col_map[book_id] = main_id

            # Re-point the link rows at the surviving item, then delete
            # the duplicate items themselves
            db.conn.executemany('UPDATE {0} SET {1}=? WHERE {1}=?'.format(
                self.link_table, self.metadata['link_column']),
                tuple((main_id, x) for x in v))
            db.conn.executemany('DELETE FROM {0} WHERE id=?'.format(self.metadata['table']),
                tuple((x,) for x in v))
def remove_books(self, book_ids, db):
clean = set()
for book_id in book_ids:
@ -386,6 +413,43 @@ class ManyToManyTable(ManyToOneTable):
self.link_table, lcol, table), (existing_item, item_id, item_id))
return affected_books, new_id
def fix_case_duplicates(self, db):
    ''' Merge entries in this many-many table that differ only by case.
    Such duplicates can exist in databases created by old versions of
    calibre with non-ascii values, since sqlite's NOCASE collation only
    folds case for ascii text. The entry with the lowest id in each
    case-insensitive group survives; every affected book's value tuple is
    rewritten with the duplicates replaced by the survivor. '''
    from calibre.db.write import uniq
    # Group item ids by their case-folded value
    case_map = defaultdict(set)
    for item_id, val in self.id_map.iteritems():
        case_map[icu_lower(val)].add(item_id)

    for v in case_map.itervalues():
        if len(v) > 1:
            done_books = set()
            main_id = min(v)
            v.discard(main_id)
            for item_id in v:
                self.id_map.pop(item_id, None)
                books = self.col_book_map.pop(item_id, set())
                # BUG FIX: also record the merged books against the
                # surviving item, otherwise the in-memory col_book_map is
                # inconsistent with book_col_map (and the db) until the
                # next reload_from_db()
                if books:
                    self.col_book_map[main_id] = self.col_book_map.get(main_id, set()) | books
                for book_id in books:
                    if book_id in done_books:
                        # This book was already rewritten while processing
                        # another duplicate id it was linked to
                        continue
                    done_books.add(book_id)
                    orig = self.book_col_map.get(book_id, ())
                    if not orig:
                        continue
                    # Replace every duplicate id with main_id, preserving
                    # order and collapsing any repeats that creates
                    vals = uniq(tuple(main_id if x in v else x for x in orig))
                    self.book_col_map[book_id] = vals
                    if len(orig) == len(vals):
                        # We have a simple replacement
                        db.conn.executemany(
                            'UPDATE {0} SET {1}=? WHERE {1}=? AND book=?'.format(
                                self.link_table, self.metadata['link_column']),
                            tuple((main_id, x, book_id) for x in v))
                    else:
                        # The merge produced duplicate links for this book,
                        # so rewrite its link rows from scratch
                        db.conn.execute('DELETE FROM {0} WHERE book=?'.format(self.link_table), (book_id,))
                        db.conn.executemany(
                            'INSERT INTO {0} (book,{1}) VALUES (?,?)'.format(self.link_table, self.metadata['link_column']),
                            tuple((book_id, x) for x in vals))
            # Finally remove the duplicate items themselves
            db.conn.executemany('DELETE FROM {0} WHERE id=?'.format(self.metadata['table']),
                tuple((x,) for x in v))
class AuthorsTable(ManyToManyTable):
@ -445,6 +509,9 @@ class FormatsTable(ManyToManyTable):
def read_id_maps(self, db):
    # No-op: this table has no id->value item map to load; its data is
    # read in read_maps() instead.
    pass
def fix_case_duplicates(self, db):
    # No-op: case-duplicate merging does not apply to this table
    # (deliberately overrides the ManyToManyTable implementation).
    pass
def read_maps(self, db):
self.fname_map = fnm = defaultdict(dict)
self.size_map = sm = defaultdict(dict)
@ -524,6 +591,9 @@ class IdentifiersTable(ManyToManyTable):
def read_id_maps(self, db):
    # No-op: this table has no id->value item map to load; its data is
    # read in read_maps() instead.
    pass
def fix_case_duplicates(self, db):
    # No-op: case-duplicate merging does not apply to this table
    # (deliberately overrides the ManyToManyTable implementation).
    pass
def read_maps(self, db):
self.book_col_map = defaultdict(dict)
self.col_book_map = defaultdict(set)

View File

@ -632,3 +632,63 @@ class WritingTest(BaseTest):
self.assertEqual({3}, cache.set_sort_for_authors(sdata),
'Setting the author sort to the same value as before, incorrectly marked some books as dirty')
# }}}
def test_fix_case_duplicates(self):  # {{{
    ' Test fixing of databases that have items in is_many fields that differ only by case '
    ae = self.assertEqual
    cache = self.init_cache()
    conn = cache.backend.conn
    # Many-one case (publisher): create two publishers whose names differ
    # only by case. The values are non-ascii, so sqlite's NOCASE collation
    # treats them as distinct and the duplicate insert succeeds.
    conn.execute('INSERT INTO publishers (name) VALUES ("mūs")')
    lid = conn.last_insert_rowid()
    conn.execute('INSERT INTO publishers (name) VALUES ("MŪS")')
    uid = conn.last_insert_rowid()
    # Link books 1 -> lower-case variant, books 2 and 3 -> upper-case variant
    conn.execute('DELETE FROM books_publishers_link')
    conn.execute('INSERT INTO books_publishers_link (book,publisher) VALUES (1, %d)' % lid)
    conn.execute('INSERT INTO books_publishers_link (book,publisher) VALUES (2, %d)' % uid)
    conn.execute('INSERT INTO books_publishers_link (book,publisher) VALUES (3, %d)' % uid)
    cache.reload_from_db()
    t = cache.fields['publisher'].table
    # Sanity check: both variants are present before the fix
    for x in (lid, uid):
        self.assertIn(x, t.id_map)
        self.assertIn(x, t.col_book_map)
    ae(t.book_col_map[1], lid)
    ae(t.book_col_map[2], uid)
    t.fix_case_duplicates(cache.backend)
    # Verify both the in-memory table and a fresh load from the db:
    # the higher id (uid) must be gone and all books must resolve to the
    # surviving lower id's value
    for c in (cache, self.init_cache()):
        t = c.fields['publisher'].table
        self.assertNotIn(uid, t.id_map)
        self.assertNotIn(uid, t.col_book_map)
        for bid in (1, 2, 3):
            ae(c.field_for('publisher', bid), "mūs")
        c.close()

    # Many-many case (tags): three case variants of one tag plus an
    # unrelated normal tag, spread across several books so that both the
    # simple-replacement and the duplicate-collapsing code paths run
    cache = self.init_cache()
    conn = cache.backend.conn
    conn.execute('INSERT INTO tags (name) VALUES ("mūūs")')
    lid = conn.last_insert_rowid()
    conn.execute('INSERT INTO tags (name) VALUES ("MŪŪS")')
    uid = conn.last_insert_rowid()
    conn.execute('INSERT INTO tags (name) VALUES ("mūŪS")')
    mid = conn.last_insert_rowid()
    conn.execute('INSERT INTO tags (name) VALUES ("t")')
    norm = conn.last_insert_rowid()
    conn.execute('DELETE FROM books_tags_link')
    # Book 2 links to two duplicate variants, so merging creates a
    # duplicate link that must be collapsed to a single tag
    for book_id, vals in {1:(lid, uid), 2:(uid, mid), 3:(lid, norm)}.iteritems():
        conn.executemany('INSERT INTO books_tags_link (book,tag) VALUES (?,?)',
            tuple((book_id, x) for x in vals))
    cache.reload_from_db()
    t = cache.fields['tags'].table
    for x in (lid, uid, mid):
        self.assertIn(x, t.id_map)
        self.assertIn(x, t.col_book_map)
    t.fix_case_duplicates(cache.backend)
    for c in (cache, self.init_cache()):
        t = c.fields['tags'].table
        # Only the lowest id (lid) and the unrelated tag survive
        for x in (uid, mid):
            self.assertNotIn(x, t.id_map)
            self.assertNotIn(x, t.col_book_map)
        ae(c.field_for('tags', 1), (t.id_map[lid],))
        ae(c.field_for('tags', 2), (t.id_map[lid],), 'failed for book 2')
        ae(c.field_for('tags', 3), (t.id_map[lid], t.id_map[norm]))
    # }}}

View File

@ -278,6 +278,10 @@ def many_one(book_id_val_map, db, field, allow_case_change, *args):
# Map values to db ids, including any new values
kmap = safe_lower if dt in {'text', 'series'} else lambda x:x
rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
if len(rid_map) != len(table.id_map):
# table has some entries that differ only in case, fix it
table.fix_case_duplicates(db)
rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
val_map = {None:None}
case_changes = {}
for val in book_id_val_map.itervalues():
@ -339,7 +343,7 @@ def many_one(book_id_val_map, db, field, allow_case_change, *args):
# Many-Many fields {{{
def uniq(vals, kmap):
def uniq(vals, kmap=lambda x:x):
''' Remove all duplicates from vals, while preserving order. kmap must be a
callable that returns a hashable value for every item in vals '''
vals = vals or ()
@ -358,6 +362,10 @@ def many_many(book_id_val_map, db, field, allow_case_change, *args):
# Map values to db ids, including any new values
kmap = safe_lower if dt == 'text' else lambda x:x
rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
if len(rid_map) != len(table.id_map):
# table has some entries that differ only in case, fix it
table.fix_case_duplicates(db)
rid_map = {kmap(item):item_id for item_id, item in table.id_map.iteritems()}
val_map = {}
case_changes = {}
book_id_val_map = {k:uniq(vals, kmap) for k, vals in book_id_val_map.iteritems()}