Implement #5016 (Add formats to existing ebook records)

This commit is contained in:
Kovid Goyal 2010-03-11 23:26:10 -07:00
commit a0f2163403
8 changed files with 101 additions and 20 deletions

View File

@ -20,7 +20,7 @@ def string_to_authors(raw):
raw = raw.replace('&&', u'\uffff')
raw = _author_pat.sub('&', raw)
authors = [a.strip().replace(u'\uffff', '&') for a in raw.split('&')]
return authors
return [a for a in authors if a]
def authors_to_string(authors):
if authors is not None:

View File

@ -4,10 +4,9 @@ __copyright__ = '2010, Greg Riker <griker@hotmail.com>'
__docformat__ = 'restructuredtext en'
''' Read/write metadata from Amazon's topaz format '''
import copy, StringIO, sys
from struct import pack, unpack
import StringIO, sys
from struct import pack
from calibre import prints
from calibre.ebooks.metadata import MetaInformation
class StreamSlicer(object):
@ -200,7 +199,6 @@ class MetadataUpdater(object):
# Build a dict of topaz_header records
topaz_headers = {}
for x in range(self.header_records):
c_marker = self.data[offset]
offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
@ -259,7 +257,6 @@ class MetadataUpdater(object):
self.metadata = {}
for x in range(self.md_header['num_recs']):
md_record = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
@ -380,7 +377,6 @@ def set_metadata(stream, mi):
return
if __name__ == '__main__':
import cStringIO, sys
#print get_metadata(open(sys.argv[1], 'rb'))
mi = MetaInformation(title="My New Title", authors=['Smith, John'])
set_metadata(open(sys.argv[1], 'rb'), mi)

View File

@ -1,7 +1,7 @@
'''
UI for adding books to the database and saving books to disk
'''
import os, shutil, time
import os, shutil, time, re
from Queue import Queue, Empty
from threading import Thread
@ -13,9 +13,10 @@ from calibre.gui2 import question_dialog, error_dialog, info_dialog
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata import MetaInformation
from calibre.constants import preferred_encoding, filesystem_encoding
from calibre.utils.config import prefs
class DuplicatesAdder(QThread):
# Add duplicate books
def __init__(self, parent, db, duplicates, db_adder):
QThread.__init__(self, parent)
self.db, self.db_adder = db, db_adder
@ -27,6 +28,7 @@ class DuplicatesAdder(QThread):
formats = [f for f in formats if not f.lower().endswith('.opf')]
id = self.db.create_book_entry(mi, cover=cover,
add_duplicates=True)
# here we add all the formats for dupe book record created above
self.db_adder.add_formats(id, formats)
self.db_adder.number_of_books_added += 1
self.emit(SIGNAL('added(PyQt_PyObject)'), count)
@ -90,6 +92,15 @@ class DBAdder(Thread):
self.daemon = True
self.input_queue = Queue()
self.output_queue = Queue()
self.fuzzy_title_patterns = [(re.compile(pat), repl) for pat, repl in
[
(r'[\[\](){}<>\'";,:#]', ''),
(r'^(the|a|an) ', ''),
(r'[-._]', ' '),
(r'\s+', ' ')
]
]
self.merged_books = set([])
def run(self):
while not self.end:
@ -125,6 +136,34 @@ class DBAdder(Thread):
fmts[-1] = fmt
return fmts
def fuzzy_title(self, title):
    '''
    Return a normalized form of ``title`` for approximate comparison.

    The title is stripped and lower-cased, then every
    ``(compiled pattern, replacement)`` pair in
    ``self.fuzzy_title_patterns`` is applied in order.

    :param title: The title string to normalize.
    :return: The normalized title, with no leading or trailing whitespace.
    '''
    title = title.strip().lower()
    for pat, repl in self.fuzzy_title_patterns:
        title = pat.sub(repl, title)
    # Substitutions that turn separators into spaces can leave whitespace
    # at either end (e.g. "book." -> "book "), which would make otherwise
    # equal titles compare unequal. Strip again after all substitutions.
    return title.strip()
def find_identical_books(self, mi):
    '''
    Find ids of books in ``self.db`` that match ``mi``.

    A book matches when the author search built from every entry in
    ``mi.authors`` finds it and its title equals ``mi.title`` after both
    have been passed through :meth:`fuzzy_title`.

    :param mi: Metadata object; ``mi.authors`` and ``mi.title`` are read.
    :return: A set of matching book ids. Empty when ``mi`` has no authors,
             when the query cannot be built, or when the search fails.
    '''
    identical_book_ids = set()
    if not mi.authors:
        # Without authors there is nothing to search on.
        return identical_book_ids
    try:
        # Double quotes are removed from author names so they cannot
        # terminate the quoted search term early.
        query = u' and '.join([u'author:"=%s"' % (a.replace('"', ''))
                               for a in mi.authors])
    except ValueError:
        return identical_book_ids
    try:
        book_ids = self.db.data.parse(query)
    except Exception:
        # Best effort: report the failed search and treat it as "no match".
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        import traceback
        traceback.print_exc()
        return identical_book_ids
    # The normalized title of the incoming book does not change per
    # candidate, so compute it once.
    mbook_title = self.fuzzy_title(mi.title)
    for book_id in book_ids:
        fbook_title = self.fuzzy_title(
            self.db.title(book_id, index_is_id=True))
        if fbook_title == mbook_title:
            identical_book_ids.add(book_id)
    return identical_book_ids
def add(self, id, opf, cover, name):
formats = self.ids.pop(id)
if opf.endswith('.error'):
@ -145,12 +184,25 @@ class DBAdder(Thread):
if self.db is not None:
if cover:
cover = open(cover, 'rb').read()
orig_formats = formats
formats = [f for f in formats if not f.lower().endswith('.opf')]
if prefs['add_formats_to_existing']:
identical_book_list = self.find_identical_books(mi)
if identical_book_list: # books with same author and nearly same title exist in db
self.merged_books.add(mi.title)
for identical_book in identical_book_list:
self.add_formats(identical_book, formats, replace=False)
else:
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id, formats)
else:
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=False)
self.number_of_books_added += 1
if id is None:
self.duplicates.append((mi, cover, formats))
self.duplicates.append((mi, cover, orig_formats))
else:
formats = [f for f in formats if not f.lower().endswith('.opf')]
self.add_formats(id, formats)
else:
self.names.append(name)
@ -158,12 +210,12 @@ class DBAdder(Thread):
self.infos.append(mi)
return mi.title
def add_formats(self, id, formats):
def add_formats(self, id, formats, replace=True):
for path in formats:
fmt = os.path.splitext(path)[-1].replace('.', '').upper()
with open(path, 'rb') as f:
self.db.add_format(id, fmt, f, index_is_id=True,
notify=False)
notify=False, replace=replace)
class Adder(QObject):
@ -330,6 +382,11 @@ class Adder(QObject):
return getattr(getattr(self, 'db_adder', None), 'number_of_books_added',
0)
@property
def merged_books(self):
    '''
    The ``merged_books`` attribute of ``self.db_adder``.

    Falls back to an empty set when this object has no ``db_adder``
    attribute or the adder has no ``merged_books`` attribute.
    '''
    adder = getattr(self, 'db_adder', None)
    fallback = set([])
    return getattr(adder, 'merged_books', fallback)
@property
def critical(self):
return getattr(getattr(self, 'db_adder', None), 'critical',

View File

@ -44,6 +44,7 @@ class AddSave(QTabWidget, Ui_TabWidget):
self.filename_pattern = FilenamePattern(self)
self.metadata_box.layout().insertWidget(0, self.filename_pattern)
self.opt_swap_author_names.setChecked(prefs['swap_author_names'])
self.opt_add_formats_to_existing.setChecked(prefs['add_formats_to_existing'])
help = '\n'.join(textwrap.wrap(c.get_option('template').help, 75))
self.save_template.initialize('save_to_disk', opts.template, help)
self.send_template.initialize('send_to_device', opts.send_template, help)
@ -69,6 +70,7 @@ class AddSave(QTabWidget, Ui_TabWidget):
pattern = self.filename_pattern.commit()
prefs['filename_pattern'] = pattern
prefs['swap_author_names'] = bool(self.opt_swap_author_names.isChecked())
prefs['add_formats_to_existing'] = bool(self.opt_add_formats_to_existing.isChecked())
return True

View File

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>645</width>
<width>588</width>
<height>516</height>
</rect>
</property>
@ -49,6 +49,19 @@
</widget>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_add_formats_to_existing">
<property name="toolTip">
<string>If an existing book with a similar title and author is found that does not have the format being added, the format is added
to the existing book, instead of creating a new entry. If the existing book already has the format, then it is silently ignored.
Title match ignores leading articles (&quot;the&quot;, &quot;a&quot;, &quot;an&quot;), punctuation, case, etc. Author match is exact.</string>
</property>
<property name="text">
<string>If books with similar titles and authors are found, &amp;merge the new files automatically</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<widget class="QGroupBox" name="metadata_box">
<property name="title">
<string>&amp;Configure metadata from file name</string>

View File

@ -24,7 +24,7 @@ from PyQt4.QtSvg import QSvgRenderer
from calibre import prints, patheq, strftime
from calibre.constants import __version__, __appname__, isfrozen, islinux, \
iswindows, isosx, filesystem_encoding
iswindows, isosx, filesystem_encoding, preferred_encoding
from calibre.utils.filenames import ascii_filename
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.config import prefs, dynamic
@ -1244,6 +1244,13 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
self.library_view.model().books_added(self._adder.number_of_books_added)
if hasattr(self, 'db_images'):
self.db_images.reset()
if getattr(self._adder, 'merged_books', False):
books = u'\n'.join([x if isinstance(x, unicode) else
x.decode(preferred_encoding, 'replace') for x in
self._adder.merged_books])
info_dialog(self, _('Merged some books'),
_('Some duplicates were found and merged into the '
'following existing books:'), det_msg=books, show=True)
if getattr(self._adder, 'critical', None):
det_msg = []
for name, log in self._adder.critical.items():

View File

@ -998,12 +998,15 @@ class LibraryDatabase2(LibraryDatabase):
return self.add_format(index, format, stream,
index_is_id=index_is_id, path=path, notify=notify)
def add_format(self, index, format, stream, index_is_id=False, path=None, notify=True):
def add_format(self, index, format, stream, index_is_id=False, path=None,
notify=True, replace=True):
id = index if index_is_id else self.id(index)
if path is None:
path = os.path.join(self.library_path, self.path(id, index_is_id=True))
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
if name:
if not replace:
return False
self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format))
name = self.construct_file_name(id)
ext = ('.' + format.lower()) if format else ''
@ -1021,6 +1024,7 @@ class LibraryDatabase2(LibraryDatabase):
self.refresh_ids([id])
if notify:
self.notify('metadata', [id])
return True
def delete_book(self, id, notify=True):
'''

View File

@ -670,6 +670,8 @@ def _prefs():
help=_('The priority of worker processes'))
c.add_opt('swap_author_names', default=False,
help=_('Swap author first and last names when reading metadata'))
c.add_opt('add_formats_to_existing', default=False,
help=_('Add new formats to existing book records'))
c.add_opt('migrated', default=False, help='For Internal use. Don\'t modify.')
return c