Implement #5016 (Add formats to existing ebook records)

This commit is contained in:
Kovid Goyal 2010-03-11 23:26:10 -07:00
commit a0f2163403
8 changed files with 101 additions and 20 deletions

View File

@ -20,7 +20,7 @@ def string_to_authors(raw):
raw = raw.replace('&&', u'\uffff') raw = raw.replace('&&', u'\uffff')
raw = _author_pat.sub('&', raw) raw = _author_pat.sub('&', raw)
authors = [a.strip().replace(u'\uffff', '&') for a in raw.split('&')] authors = [a.strip().replace(u'\uffff', '&') for a in raw.split('&')]
return authors return [a for a in authors if a]
def authors_to_string(authors): def authors_to_string(authors):
if authors is not None: if authors is not None:

View File

@ -4,10 +4,9 @@ __copyright__ = '2010, Greg Riker <griker@hotmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
''' Read/write metadata from Amazon's topaz format ''' ''' Read/write metadata from Amazon's topaz format '''
import copy, StringIO, sys import StringIO, sys
from struct import pack, unpack from struct import pack
from calibre import prints
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
class StreamSlicer(object): class StreamSlicer(object):
@ -200,7 +199,6 @@ class MetadataUpdater(object):
# Build a dict of topaz_header records # Build a dict of topaz_header records
topaz_headers = {} topaz_headers = {}
for x in range(self.header_records): for x in range(self.header_records):
c_marker = self.data[offset]
offset += 1 offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
@ -259,7 +257,6 @@ class MetadataUpdater(object):
self.metadata = {} self.metadata = {}
for x in range(self.md_header['num_recs']): for x in range(self.md_header['num_recs']):
md_record = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
tag = self.data[offset:offset+taglen] tag = self.data[offset:offset+taglen]
@ -380,7 +377,6 @@ def set_metadata(stream, mi):
return return
if __name__ == '__main__': if __name__ == '__main__':
import cStringIO, sys
#print get_metadata(open(sys.argv[1], 'rb')) #print get_metadata(open(sys.argv[1], 'rb'))
mi = MetaInformation(title="My New Title", authors=['Smith, John']) mi = MetaInformation(title="My New Title", authors=['Smith, John'])
set_metadata(open(sys.argv[1], 'rb'), mi) set_metadata(open(sys.argv[1], 'rb'), mi)

View File

@ -1,7 +1,7 @@
''' '''
UI for adding books to the database and saving books to disk UI for adding books to the database and saving books to disk
''' '''
import os, shutil, time import os, shutil, time, re
from Queue import Queue, Empty from Queue import Queue, Empty
from threading import Thread from threading import Thread
@ -13,9 +13,10 @@ from calibre.gui2 import question_dialog, error_dialog, info_dialog
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.constants import preferred_encoding, filesystem_encoding from calibre.constants import preferred_encoding, filesystem_encoding
from calibre.utils.config import prefs
class DuplicatesAdder(QThread): class DuplicatesAdder(QThread):
# Add duplicate books
def __init__(self, parent, db, duplicates, db_adder): def __init__(self, parent, db, duplicates, db_adder):
QThread.__init__(self, parent) QThread.__init__(self, parent)
self.db, self.db_adder = db, db_adder self.db, self.db_adder = db, db_adder
@ -27,6 +28,7 @@ class DuplicatesAdder(QThread):
formats = [f for f in formats if not f.lower().endswith('.opf')] formats = [f for f in formats if not f.lower().endswith('.opf')]
id = self.db.create_book_entry(mi, cover=cover, id = self.db.create_book_entry(mi, cover=cover,
add_duplicates=True) add_duplicates=True)
# here we add all the formats for dupe book record created above
self.db_adder.add_formats(id, formats) self.db_adder.add_formats(id, formats)
self.db_adder.number_of_books_added += 1 self.db_adder.number_of_books_added += 1
self.emit(SIGNAL('added(PyQt_PyObject)'), count) self.emit(SIGNAL('added(PyQt_PyObject)'), count)
@ -90,6 +92,15 @@ class DBAdder(Thread):
self.daemon = True self.daemon = True
self.input_queue = Queue() self.input_queue = Queue()
self.output_queue = Queue() self.output_queue = Queue()
self.fuzzy_title_patterns = [(re.compile(pat), repl) for pat, repl in
[
(r'[\[\](){}<>\'";,:#]', ''),
(r'^(the|a|an) ', ''),
(r'[-._]', ' '),
(r'\s+', ' ')
]
]
self.merged_books = set([])
def run(self): def run(self):
while not self.end: while not self.end:
@ -125,6 +136,34 @@ class DBAdder(Thread):
fmts[-1] = fmt fmts[-1] = fmt
return fmts return fmts
def fuzzy_title(self, title):
    '''
    Return a normalised form of ``title`` suitable for loose comparison.

    The title is stripped and lower-cased, then every
    ``(compiled_pattern, replacement)`` pair in
    ``self.fuzzy_title_patterns`` is applied in order.

    :param title: the title string to normalise
    :return: the normalised title, without leading or trailing whitespace
    '''
    title = title.strip().lower()
    for pat, repl in self.fuzzy_title_patterns:
        title = pat.sub(repl, title)
    # Substitutions that turn separators into blanks can leave a blank at
    # either end (e.g. "Book." -> "book "); strip again so such titles
    # still compare equal to "Book".
    return title.strip()
def find_identical_books(self, mi):
    '''
    Find books already in the database that look identical to ``mi``.

    A book is a candidate when it is returned by ``self.db.data.parse``
    for a query requiring an exact match on every author in
    ``mi.authors``. A candidate counts as identical when its title
    equals ``mi.title`` after both are passed through
    :meth:`fuzzy_title`.

    :param mi: metadata object providing ``authors`` and ``title``
    :return: a set of matching book ids; empty when ``mi`` has no
             authors, the query cannot be built, or the search fails
    '''
    identical_book_ids = set()
    if not mi.authors:
        # Without authors there is nothing to search on.
        return identical_book_ids

    try:
        # Double quotes are removed from author names because they
        # delimit the value inside the search expression.
        query = u' and '.join(
            [u'author:"=%s"' % a.replace('"', '') for a in mi.authors])
    except ValueError:
        # NOTE(review): presumably guards against UnicodeDecodeError (a
        # ValueError subclass) from mixed byte/unicode names - confirm.
        return identical_book_ids

    try:
        book_ids = self.db.data.parse(query)
    except Exception:
        # Best effort: a failed search must not abort adding the book.
        # Exception (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        import traceback
        traceback.print_exc()
        return identical_book_ids

    if not book_ids:
        return identical_book_ids

    # The new book's normalised title is the same for every candidate,
    # so compute it once instead of once per candidate.
    mbook_title = self.fuzzy_title(mi.title)
    for book_id in book_ids:
        fbook_title = self.fuzzy_title(
            self.db.title(book_id, index_is_id=True))
        if fbook_title == mbook_title:
            identical_book_ids.add(book_id)
    return identical_book_ids
def add(self, id, opf, cover, name): def add(self, id, opf, cover, name):
formats = self.ids.pop(id) formats = self.ids.pop(id)
if opf.endswith('.error'): if opf.endswith('.error'):
@ -145,12 +184,25 @@ class DBAdder(Thread):
if self.db is not None: if self.db is not None:
if cover: if cover:
cover = open(cover, 'rb').read() cover = open(cover, 'rb').read()
orig_formats = formats
formats = [f for f in formats if not f.lower().endswith('.opf')]
if prefs['add_formats_to_existing']:
identical_book_list = self.find_identical_books(mi)
if identical_book_list: # books with same author and nearly same title exist in db
self.merged_books.add(mi.title)
for identical_book in identical_book_list:
self.add_formats(identical_book, formats, replace=False)
else:
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id, formats)
else:
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=False) id = self.db.create_book_entry(mi, cover=cover, add_duplicates=False)
self.number_of_books_added += 1 self.number_of_books_added += 1
if id is None: if id is None:
self.duplicates.append((mi, cover, formats)) self.duplicates.append((mi, cover, orig_formats))
else: else:
formats = [f for f in formats if not f.lower().endswith('.opf')]
self.add_formats(id, formats) self.add_formats(id, formats)
else: else:
self.names.append(name) self.names.append(name)
@ -158,12 +210,12 @@ class DBAdder(Thread):
self.infos.append(mi) self.infos.append(mi)
return mi.title return mi.title
def add_formats(self, id, formats): def add_formats(self, id, formats, replace=True):
for path in formats: for path in formats:
fmt = os.path.splitext(path)[-1].replace('.', '').upper() fmt = os.path.splitext(path)[-1].replace('.', '').upper()
with open(path, 'rb') as f: with open(path, 'rb') as f:
self.db.add_format(id, fmt, f, index_is_id=True, self.db.add_format(id, fmt, f, index_is_id=True,
notify=False) notify=False, replace=replace)
class Adder(QObject): class Adder(QObject):
@ -330,6 +382,11 @@ class Adder(QObject):
return getattr(getattr(self, 'db_adder', None), 'number_of_books_added', return getattr(getattr(self, 'db_adder', None), 'number_of_books_added',
0) 0)
@property
def merged_books(self):
    '''
    The ``merged_books`` attribute of ``self.db_adder``, or an empty set
    when ``db_adder`` is absent or does not carry that attribute.
    '''
    adder = getattr(self, 'db_adder', None)
    return getattr(adder, 'merged_books', set([]))
@property @property
def critical(self): def critical(self):
return getattr(getattr(self, 'db_adder', None), 'critical', return getattr(getattr(self, 'db_adder', None), 'critical',

View File

@ -44,6 +44,7 @@ class AddSave(QTabWidget, Ui_TabWidget):
self.filename_pattern = FilenamePattern(self) self.filename_pattern = FilenamePattern(self)
self.metadata_box.layout().insertWidget(0, self.filename_pattern) self.metadata_box.layout().insertWidget(0, self.filename_pattern)
self.opt_swap_author_names.setChecked(prefs['swap_author_names']) self.opt_swap_author_names.setChecked(prefs['swap_author_names'])
self.opt_add_formats_to_existing.setChecked(prefs['add_formats_to_existing'])
help = '\n'.join(textwrap.wrap(c.get_option('template').help, 75)) help = '\n'.join(textwrap.wrap(c.get_option('template').help, 75))
self.save_template.initialize('save_to_disk', opts.template, help) self.save_template.initialize('save_to_disk', opts.template, help)
self.send_template.initialize('send_to_device', opts.send_template, help) self.send_template.initialize('send_to_device', opts.send_template, help)
@ -69,6 +70,7 @@ class AddSave(QTabWidget, Ui_TabWidget):
pattern = self.filename_pattern.commit() pattern = self.filename_pattern.commit()
prefs['filename_pattern'] = pattern prefs['filename_pattern'] = pattern
prefs['swap_author_names'] = bool(self.opt_swap_author_names.isChecked()) prefs['swap_author_names'] = bool(self.opt_swap_author_names.isChecked())
prefs['add_formats_to_existing'] = bool(self.opt_add_formats_to_existing.isChecked())
return True return True

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>645</width> <width>588</width>
<height>516</height> <height>516</height>
</rect> </rect>
</property> </property>
@ -49,6 +49,19 @@
</widget> </widget>
</item> </item>
<item row="2" column="0" colspan="2"> <item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_add_formats_to_existing">
<property name="toolTip">
<string>If an existing book with a similar title and author is found that does not have the format being added, the format is added
to the existing book, instead of creating a new entry. If the existing book already has the format, then it is silently ignored.
Title match ignores leading articles (&quot;the&quot;, &quot;a&quot;, &quot;an&quot;), punctuation, case, etc. Author match is exact.</string>
</property>
<property name="text">
<string>If books with similar titles and authors are found, &amp;merge the new files automatically</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<widget class="QGroupBox" name="metadata_box"> <widget class="QGroupBox" name="metadata_box">
<property name="title"> <property name="title">
<string>&amp;Configure metadata from file name</string> <string>&amp;Configure metadata from file name</string>

View File

@ -24,7 +24,7 @@ from PyQt4.QtSvg import QSvgRenderer
from calibre import prints, patheq, strftime from calibre import prints, patheq, strftime
from calibre.constants import __version__, __appname__, isfrozen, islinux, \ from calibre.constants import __version__, __appname__, isfrozen, islinux, \
iswindows, isosx, filesystem_encoding iswindows, isosx, filesystem_encoding, preferred_encoding
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.config import prefs, dynamic from calibre.utils.config import prefs, dynamic
@ -1244,6 +1244,13 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
self.library_view.model().books_added(self._adder.number_of_books_added) self.library_view.model().books_added(self._adder.number_of_books_added)
if hasattr(self, 'db_images'): if hasattr(self, 'db_images'):
self.db_images.reset() self.db_images.reset()
if getattr(self._adder, 'merged_books', False):
books = u'\n'.join([x if isinstance(x, unicode) else
x.decode(preferred_encoding, 'replace') for x in
self._adder.merged_books])
info_dialog(self, _('Merged some books'),
_('Some duplicates were found and merged into the '
'following existing books:'), det_msg=books, show=True)
if getattr(self._adder, 'critical', None): if getattr(self._adder, 'critical', None):
det_msg = [] det_msg = []
for name, log in self._adder.critical.items(): for name, log in self._adder.critical.items():

View File

@ -998,12 +998,15 @@ class LibraryDatabase2(LibraryDatabase):
return self.add_format(index, format, stream, return self.add_format(index, format, stream,
index_is_id=index_is_id, path=path, notify=notify) index_is_id=index_is_id, path=path, notify=notify)
def add_format(self, index, format, stream, index_is_id=False, path=None, notify=True): def add_format(self, index, format, stream, index_is_id=False, path=None,
notify=True, replace=True):
id = index if index_is_id else self.id(index) id = index if index_is_id else self.id(index)
if path is None: if path is None:
path = os.path.join(self.library_path, self.path(id, index_is_id=True)) path = os.path.join(self.library_path, self.path(id, index_is_id=True))
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
if name: if name:
if not replace:
return False
self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format)) self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format))
name = self.construct_file_name(id) name = self.construct_file_name(id)
ext = ('.' + format.lower()) if format else '' ext = ('.' + format.lower()) if format else ''
@ -1021,6 +1024,7 @@ class LibraryDatabase2(LibraryDatabase):
self.refresh_ids([id]) self.refresh_ids([id])
if notify: if notify:
self.notify('metadata', [id]) self.notify('metadata', [id])
return True
def delete_book(self, id, notify=True): def delete_book(self, id, notify=True):
''' '''

View File

@ -670,6 +670,8 @@ def _prefs():
help=_('The priority of worker processes')) help=_('The priority of worker processes'))
c.add_opt('swap_author_names', default=False, c.add_opt('swap_author_names', default=False,
help=_('Swap author first and last names when reading metadata')) help=_('Swap author first and last names when reading metadata'))
c.add_opt('add_formats_to_existing', default=False,
help=_('Add new formats to existing book records'))
c.add_opt('migrated', default=False, help='For Internal use. Don\'t modify.') c.add_opt('migrated', default=False, help='For Internal use. Don\'t modify.')
return c return c