From ff74cd460d30d9ccf97bc1abff3aa519a681fca1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 20 Aug 2009 21:05:47 -0400 Subject: [PATCH 001/120] Device drivers use file path template. --- src/calibre/devices/cybookg3/driver.py | 15 ++++--- src/calibre/devices/jetbook/driver.py | 23 +++++----- src/calibre/devices/prs500/books.py | 14 +++--- src/calibre/devices/prs500/driver.py | 6 +-- src/calibre/devices/prs505/books.py | 15 +++---- src/calibre/devices/prs505/driver.py | 20 +++++---- src/calibre/devices/usbms/device.py | 50 +++++---------------- src/calibre/devices/usbms/driver.py | 14 +++--- src/calibre/gui2/device.py | 60 ++++++++++---------------- 9 files changed, 86 insertions(+), 131 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 670438f94d..de5e96d053 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -45,24 +45,25 @@ class CYBOOKG3(USBMS): DELETE_EXTS = ['.mbp', '.dat', '_6090.t2b'] SUPPORTS_SUB_DIRS = True - def upload_books(self, files, names, on_card=None, end_session=True, - metadata=None): + def upload_books(self, files, metadatas, ids, on_card=None, + end_session=True): path = self._sanity_check(on_card, files) paths = [] - names = iter(names) - metadata = iter(metadata) + metadatas = iter(metadatas) + ids = iter(ids) for i, infile in enumerate(files): - mdata, fname = metadata.next(), names.next() - filepath = self.create_upload_path(path, mdata, fname) + mdata, id = metadatas.next(), ids.next() + ext = os.path.splitext(infile)[1] + filepath = self.create_upload_path(path, mdata, ext, id) paths.append(filepath) self.put_file(infile, filepath, replace_file=True) coverdata = None - cover = mdata.get('cover', None) + cover = mdata.cover if cover: coverdata = cover[2] diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index 8fcbe306a2..949438ae6f 100644 --- a/src/calibre/devices/jetbook/driver.py 
+++ b/src/calibre/devices/jetbook/driver.py @@ -15,7 +15,7 @@ from itertools import cycle from calibre.devices.usbms.driver import USBMS from calibre.utils.filenames import ascii_filename as sanitize -from calibre.ebooks.metadata import string_to_authors +from calibre.ebooks.metadata import authors_to_string, string_to_authors class JETBOOK(USBMS): name = 'Ectaco JetBook Device Interface' @@ -50,23 +50,22 @@ class JETBOOK(USBMS): r'(?P.+)#(?P.+)' ) - def upload_books(self, files, names, on_card=False, end_session=True, - metadata=None): - + def upload_books(self, files, metadatas, ids, on_card=None, + end_session=True): path = self._sanity_check(on_card, files) paths = [] - names = iter(names) - metadata = iter(metadata) + metadatas = iter(metadatas) + ids = iter(ids) for i, infile in enumerate(files): - mdata, fname = metadata.next(), names.next() - path = os.path.dirname(self.create_upload_path(path, mdata, fname)) + mdata, id = metadatas.next(), ids.next() + ext = os.path.splitext(infile)[1] + path = self.create_upload_path(path, mdata, ext, id) - author = sanitize(mdata.get('authors','Unknown')).replace(' ', '_') - title = sanitize(mdata.get('title', 'Unknown')).replace(' ', '_') - fileext = os.path.splitext(os.path.basename(fname))[1] - fname = '%s#%s%s' % (author, title, fileext) + author = sanitize(authors_to_string(mdata.authors)).replace(' ', '_') + title = sanitize(mdata.title).replace(' ', '_') + fname = '%s#%s%s' % (author, title, ext) filepath = os.path.join(path, fname) paths.append(filepath) diff --git a/src/calibre/devices/prs500/books.py b/src/calibre/devices/prs500/books.py index 5eb8d7f011..770c48caf9 100644 --- a/src/calibre/devices/prs500/books.py +++ b/src/calibre/devices/prs500/books.py @@ -9,6 +9,7 @@ from base64 import b64decode as decode from base64 import b64encode as encode import re +from calibre.ebooks.metadata import authors_to_string from calibre.devices.interface import BookList as _BookList from calibre.devices import strftime, 
strptime @@ -262,9 +263,9 @@ class BookList(_BookList): cid = self.max_id()+1 sourceid = str(self[0].sourceid) if len(self) else "1" attrs = { - "title" : info["title"], - 'titleSorter' : sortable_title(info['title']), - "author" : info["authors"] if info['authors'] else 'Unknown', \ + "title" : info.title, + 'titleSorter' : sortable_title(info.title), + "author" : authors_to_string(info.authors), \ "page":"0", "part":"0", "scale":"0", \ "sourceid":sourceid, "id":str(cid), "date":"", \ "mime":mime, "path":name, "size":str(size) @@ -273,7 +274,7 @@ class BookList(_BookList): node.setAttributeNode(self.document.createAttribute(attr)) node.setAttribute(attr, attrs[attr]) try: - w, h, data = info["cover"] + w, h, data = info.cover except TypeError: w, h, data = None, None, None @@ -290,10 +291,7 @@ class BookList(_BookList): book.datetime = ctime self.append(book) self.set_next_id(cid+1) - if self.prefix and info.has_key('tags'): # Playlists only supportted in main memory - if info.has_key('tag order'): - self.tag_order.update(info['tag order']) - self.set_playlists(book.id, info['tags']) + self.set_playlists(book.id, info.tags def playlist_by_title(self, title): diff --git a/src/calibre/devices/prs500/driver.py b/src/calibre/devices/prs500/driver.py index 4273101273..b6209f75af 100644 --- a/src/calibre/devices/prs500/driver.py +++ b/src/calibre/devices/prs500/driver.py @@ -863,14 +863,14 @@ class PRS500(DeviceConfig, DevicePlugin): self.upload_book_list(booklists[1], end_session=False) @safe - def upload_books(self, files, names, on_card=False, end_session=True, - metadata=None): + def upload_books(self, files, metadatas, ids, on_card=None, + end_session=True): card = self.card(end_session=False) prefix = card + '/' + self.CARD_PATH_PREFIX +'/' if on_card else '/Data/media/books/' if on_card and not self._exists(prefix)[0]: self.mkdir(prefix[:-1], False) paths, ctimes = [], [] - names = iter(names) + names = iter([m.title for m in metatdatas]) infiles = [file if 
hasattr(file, 'read') else open(file, 'rb') for file in files] for f in infiles: f.seek(0, 2) sizes = [f.tell() for f in infiles] diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py index 6e268e734a..4b8a952816 100644 --- a/src/calibre/devices/prs505/books.py +++ b/src/calibre/devices/prs505/books.py @@ -8,7 +8,7 @@ import xml.dom.minidom as dom from base64 import b64decode as decode from base64 import b64encode as encode - +from calibre.ebooks.metadata import authors_to_string from calibre.devices.interface import BookList as _BookList from calibre.devices import strftime as _strftime from calibre.devices import strptime @@ -194,9 +194,9 @@ class BookList(_BookList): except: sourceid = '1' attrs = { - "title" : info["title"], - 'titleSorter' : sortable_title(info['title']), - "author" : info["authors"] if info['authors'] else _('Unknown'), + "title" : info.title, + 'titleSorter' : sortable_title(info.title), + "author" : authors_to_string(info.authors), "page":"0", "part":"0", "scale":"0", \ "sourceid":sourceid, "id":str(cid), "date":"", \ "mime":mime, "path":name, "size":str(size) @@ -205,7 +205,7 @@ class BookList(_BookList): node.setAttributeNode(self.document.createAttribute(attr)) node.setAttribute(attr, attrs[attr]) try: - w, h, data = info["cover"] + w, h, data = info.cover except TypeError: w, h, data = None, None, None @@ -221,10 +221,7 @@ class BookList(_BookList): book = Book(node, self.mountpath, [], prefix=self.prefix) book.datetime = ctime self.append(book) - if info.has_key('tags'): - if info.has_key('tag order'): - self.tag_order.update(info['tag order']) - self.set_tags(book, info['tags']) + self.set_tags(book, info.tags) def _delete_book(self, node): nid = node.getAttribute('id') diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index d1e1535e36..b4fccd2548 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -109,20 +109,22 @@ 
class PRS505(CLI, Device): self.report_progress(1.0, _('Getting list of books on device...')) return bl - def upload_books(self, files, names, on_card=None, end_session=True, - metadata=None): + def upload_books(self, files, metadatas, ids, on_card=None, + end_session=True): path = self._sanity_check(on_card, files) - paths, ctimes, sizes = [], [], [] - names = iter(names) - metadata = iter(metadata) - for i, infile in enumerate(files): - mdata, fname = metadata.next(), names.next() - filepath = self.create_upload_path(path, mdata, fname) + paths = [] + metadatas = iter(metadatas) + ids = iter(ids) + for i, infile in enumerate(files): + mdata, id = metadatas.next(), ids.next() + ext = os.path.splitext(infile)[1] + filepath = self.create_upload_path(path, mdata, ext, id) paths.append(filepath) - self.put_file(infile, paths[-1], replace_file=True) + + self.put_file(infile, filepath, replace_file=True) ctimes.append(os.path.getctime(paths[-1])) sizes.append(os.stat(paths[-1]).st_size) diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 007d058941..f54e09e92f 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -23,7 +23,7 @@ from calibre.devices.interface import DevicePlugin from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.usbms.deviceconfig import DeviceConfig from calibre import iswindows, islinux, isosx, __appname__ -from calibre.utils.filenames import ascii_filename as sanitize, shorten_components_to +from calibre.utils.filenames import shorten_components_to class Device(DeviceConfig, DevicePlugin): @@ -667,46 +667,18 @@ class Device(DeviceConfig, DevicePlugin): raise FreeSpaceError(_("There is insufficient free space on the storage card")) return path - def create_upload_path(self, path, mdata, fname): - path = os.path.abspath(path) - newpath = path - extra_components = [] - - if self.SUPPORTS_SUB_DIRS and self.settings().use_subdirs: - if 'tags' 
in mdata.keys(): - for tag in mdata['tags']: - if tag.startswith(_('News')): - extra_components.append('news') - c = sanitize(mdata.get('title', '')) - if c: - extra_components.append(c) - c = sanitize(mdata.get('timestamp', '')) - if c: - extra_components.append(c) - break - elif tag.startswith('/'): - for c in tag.split('/'): - c = sanitize(c) - if not c: continue - extra_components.append(c) - break - - if not extra_components: - c = sanitize(mdata.get('authors', _('Unknown'))) - if c: - extra_components.append(c) - c = sanitize(mdata.get('title', _('Unknown'))) - if c: - extra_components.append(c) - newpath = os.path.join(newpath, c) - - fname = sanitize(fname) - extra_components.append(fname) - extra_components = [str(x) for x in extra_components] - components = shorten_components_to(250 - len(path), extra_components) - filepath = os.path.join(path, *components) + def create_upload_path(self, root, mdata, ext, id): + from calibre.library.save_to_disk import config, get_components + opts = config().parse() + components = get_components(opts.template, mdata, id, opts.timefmt, 250) + components = [str(x) for x in components] + components = shorten_components_to(250 - len(root), components) + filepath = '%s%s' % (os.path.join(root, *components), ext) filedir = os.path.dirname(filepath) + if not self.SUPPORTS_SUB_DIRS or not self.settings().use_subdirs: + filedir = root + filepath = os.path.join(root, os.path.basename(filepath)) if not os.path.exists(filedir): os.makedirs(filedir) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index 5650a2f10e..6cfe0ed132 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -79,19 +79,19 @@ class USBMS(CLI, Device): return bl - def upload_books(self, files, names, on_card=None, end_session=True, - metadata=None): + def upload_books(self, files, metadatas, ids, on_card=None, + end_session=True): path = self._sanity_check(on_card, files) paths = [] - 
names = iter(names) - metadata = iter(metadata) + metadatas = iter(metadatas) + ids = iter(ids) for i, infile in enumerate(files): - mdata, fname = metadata.next(), names.next() - filepath = self.create_upload_path(path, mdata, fname) - + mdata, id = metadatas.next(), ids.next() + ext = os.path.splitext(infile)[1] + filepath = self.create_upload_path(path, mdata, ext, id) paths.append(filepath) self.put_file(infile, filepath, replace_file=True) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 3a46352a70..75385008f0 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -214,18 +214,17 @@ class DeviceManager(Thread): return self.create_job(self._sync_booklists, done, args=[booklists], description=_('Send metadata to device')) - def _upload_books(self, files, names, on_card=None, metadata=None): + def _upload_books(self, files, metadata, ids, on_card=None): '''Upload books to device: ''' - return self.device.upload_books(files, names, on_card, - metadata=metadata, end_session=False) + return self.device.upload_books(files, metadata, ids, on_card, + end_session=False) - def upload_books(self, done, files, names, on_card=None, titles=None, - metadata=None): - desc = _('Upload %d books to device')%len(names) + def upload_books(self, done, files, metadata, ids, on_card=None, titles=None): + desc = _('Upload %d books to device')%len(files) if titles: desc += u':' + u', '.join(titles) - return self.create_job(self._upload_books, done, args=[files, names], - kwargs={'on_card':on_card,'metadata':metadata}, description=desc) + return self.create_job(self._upload_books, done, args=[files, metadata, ids], + kwargs={'on_card':on_card}, description=desc) def add_books_to_metadata(self, locations, metadata, booklists): self.device.add_books_to_metadata(locations, metadata, booklists) @@ -698,18 +697,18 @@ class DeviceGUI(object): dynamic.set('news_to_be_synced', set([])) return metadata = self.library_view.model().get_metadata(ids, - 
rows_are_ids=True) + rows_are_ids=True, full_metadata=True)[1] names = [] for mi in metadata: - prefix = ascii_filename(mi['title']) + prefix = ascii_filename(mi.title) if not isinstance(prefix, unicode): prefix = prefix.decode(preferred_encoding, 'replace') prefix = ascii_filename(prefix) names.append('%s_%d%s'%(prefix, id, os.path.splitext(f.name)[1])) - cdata = mi['cover'] + cdata = mi.cover if cdata: - mi['cover'] = self.cover_to_thumbnail(cdata) + mi.cover = self.cover_to_thumbnail(cdata) dynamic.set('news_to_be_synced', set([])) if config['upload_news_to_device'] and files: remove = ids if \ @@ -718,8 +717,7 @@ class DeviceGUI(object): self.location_view.model().free[1] : 'carda', self.location_view.model().free[2] : 'cardb' } on_card = space.get(sorted(space.keys(), reverse=True)[0], None) - self.upload_books(files, names, metadata, - on_card=on_card, + self.upload_books(files, metadata, ids, on_card=on_card, memory=[[f.name for f in files], remove]) self.status_bar.showMessage(_('Sending news to device.'), 5000) @@ -741,38 +739,28 @@ class DeviceGUI(object): else: _auto_ids = [] - metadata = self.library_view.model().get_metadata(ids, True) + metadata = self.library_view.model().get_metadata(ids, True, full_metadata=True)[1] ids = iter(ids) for mi in metadata: - cdata = mi['cover'] + cdata = mi.cover if cdata: mi['cover'] = self.cover_to_thumbnail(cdata) metadata = iter(metadata) files = [getattr(f, 'name', None) for f in _files] - bad, good, gf, names, remove_ids = [], [], [], [], [] + bad, mdata, gf, fids, remove_ids = [], [], [], [], [] for f in files: mi = metadata.next() id = ids.next() if f is None: - bad.append(mi['title']) + bad.append(mi.title) else: remove_ids.append(id) - good.append(mi) gf.append(f) - t = mi['title'] - if not t: - t = _('Unknown') - a = mi['authors'] - if not a: - a = _('Unknown') - prefix = ascii_filename(t+' - '+a) - if not isinstance(prefix, unicode): - prefix = prefix.decode(preferred_encoding, 'replace') - prefix = 
ascii_filename(prefix) - names.append('%s_%d%s'%(prefix, id, os.path.splitext(f)[1])) + mdata.append(mi) + fids.append(id) remove = remove_ids if delete_from_library else [] - self.upload_books(gf, names, good, on_card, memory=(_files, remove)) + self.upload_books(gf, mdata, fids, on_card, memory=(_files, remove)) self.status_bar.showMessage(_('Sending books to device.'), 5000) auto = [] @@ -834,17 +822,15 @@ class DeviceGUI(object): cp, fs = job.result self.location_view.model().update_devices(cp, fs) - def upload_books(self, files, names, metadata, on_card=None, memory=None): + def upload_books(self, files, metadata, ids, on_card=None, memory=None): ''' Upload books to device. :param files: List of either paths to files or file like objects ''' - titles = [i['title'] for i in metadata] + titles = [i.title for i in metadata] job = self.device_manager.upload_books( Dispatcher(self.books_uploaded), - files, names, on_card=on_card, - metadata=metadata, titles=titles - ) + files, metadata, ids, on_card=on_card, titles=titles) self.upload_memory[job] = (metadata, on_card, memory, files) def books_uploaded(self, job): @@ -857,7 +843,7 @@ class DeviceGUI(object): if isinstance(job.exception, FreeSpaceError): where = 'in main memory.' if 'memory' in str(job.exception) \ else 'on the storage card.' - titles = '\n'.join(['<li>'+mi['title']+'</li>' \ + titles = '\n'.join(['<li>'+mi.title+'</li>' \ for mi in metadata]) d = error_dialog(self, _('No space on device'), _('<p>Cannot upload books to device there ' From 819706e616b22a50bb36639a864cb622f31578fd Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 22 Aug 2009 18:29:42 -0400 Subject: [PATCH 002/120] Fix typo. 
--- src/calibre/devices/prs500/books.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/prs500/books.py b/src/calibre/devices/prs500/books.py index 770c48caf9..382dcf135d 100644 --- a/src/calibre/devices/prs500/books.py +++ b/src/calibre/devices/prs500/books.py @@ -291,7 +291,7 @@ class BookList(_BookList): book.datetime = ctime self.append(book) self.set_next_id(cid+1) - self.set_playlists(book.id, info.tags + self.set_playlists(book.id, info.tags) def playlist_by_title(self, title): From 857f55d2b199230a37895aac9936cf087b02bab0 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 22 Aug 2009 18:55:29 -0400 Subject: [PATCH 003/120] Fix bug #3251: Handle single line paragraphs in PDB files. --- src/calibre/ebooks/pdb/ereader/reader.py | 6 +-- src/calibre/ebooks/pdb/ereader/reader132.py | 4 +- src/calibre/ebooks/pdb/ereader/reader202.py | 4 +- src/calibre/ebooks/pdb/formatreader.py | 2 +- src/calibre/ebooks/pdb/input.py | 11 +++++- src/calibre/ebooks/pdb/palmdoc/reader.py | 7 ++-- src/calibre/ebooks/pdb/ztxt/reader.py | 7 ++-- src/calibre/ebooks/txt/input.py | 7 +--- src/calibre/ebooks/txt/processor.py | 6 ++- src/calibre/gui2/convert/pdb_input.py | 19 ++++++++++ src/calibre/gui2/convert/pdb_input.ui | 41 +++++++++++++++++++++ 11 files changed, 91 insertions(+), 23 deletions(-) create mode 100644 src/calibre/gui2/convert/pdb_input.py create mode 100644 src/calibre/gui2/convert/pdb_input.ui diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 7a3298122f..77ca8d6933 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -15,13 +15,13 @@ from calibre.ebooks.pdb.ereader.reader202 import Reader202 class Reader(FormatReader): - def __init__(self, header, stream, log, encoding=None): + def __init__(self, header, stream, log, options): record0_size = len(header.section_data(0)) if record0_size == 132: - 
self.reader = Reader132(header, stream, log, encoding) + self.reader = Reader132(header, stream, log, options) elif record0_size == 202: - self.reader = Reader202(header, stream, log, encoding) + self.reader = Reader202(header, stream, log, options) else: raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size) diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index a1d1f4294d..d44eb2c561 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -47,9 +47,9 @@ class HeaderRecord(object): class Reader132(FormatReader): - def __init__(self, header, stream, log, encoding=None): + def __init__(self, header, stream, log, options): self.log = log - self.encoding = encoding + self.encoding = options.input_encoding self.log.debug('132 byte header version found.') diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py index 5057df363e..18281a208e 100644 --- a/src/calibre/ebooks/pdb/ereader/reader202.py +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ -33,9 +33,9 @@ class HeaderRecord(object): class Reader202(FormatReader): - def __init__(self, header, stream, log, encoding=None): + def __init__(self, header, stream, log, options): self.log = log - self.encoding = encoding + self.encoding = options.input_encoding self.log.debug('202 byte header version found.') diff --git a/src/calibre/ebooks/pdb/formatreader.py b/src/calibre/ebooks/pdb/formatreader.py index bde6c9ae35..2251eaae04 100644 --- a/src/calibre/ebooks/pdb/formatreader.py +++ b/src/calibre/ebooks/pdb/formatreader.py @@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en' class FormatReader(object): - def __init__(self, header, stream, log, encoding=None): + def __init__(self, header, stream, log, options): raise NotImplementedError() def extract_content(self, output_dir): diff --git a/src/calibre/ebooks/pdb/input.py 
b/src/calibre/ebooks/pdb/input.py index 62ae24c7f0..3ad1a6121c 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre.customize.conversion import InputFormatPlugin +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader @@ -17,6 +17,13 @@ class PDBInput(InputFormatPlugin): description = 'Convert PDB to HTML' file_types = set(['pdb']) + options = set([ + OptionRecommendation(name='single_line_paras', recommended_value=False, + help=_('Normally calibre treats blank lines as paragraph markers. ' + 'With this option it will assume that every line represents ' + 'a paragraph instead.')), + ]) + def convert(self, stream, options, file_ext, log, accelerators): header = PdbHeaderReader(stream) @@ -27,7 +34,7 @@ class PDBInput(InputFormatPlugin): log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) - reader = Reader(header, stream, log, options.input_encoding) + reader = Reader(header, stream, log, options) opf = reader.extract_content(os.getcwd()) return opf diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 7e8f3b241c..e1935db566 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -31,10 +31,11 @@ class HeaderRecord(object): class Reader(FormatReader): - def __init__(self, header, stream, log, encoding=None): + def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = encoding + self.encoding = options.input_encoding + self.single_line_paras = options.single_line_paras self.sections = [] for i in range(header.num_sections): @@ -61,7 +62,7 @@ class Reader(FormatReader): txt += self.decompress_text(i) 
self.log.info('Converting text to OEB...') - html = txt_to_markdown(txt) + html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 0c334556e8..86c5abfe82 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -34,10 +34,11 @@ class HeaderRecord(object): class Reader(FormatReader): - def __init__(self, header, stream, log, encoding=None): + def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = encoding + self.encoding = options.input_encoding + self.single_line_paras = options.single_line_paras self.sections = [] for i in range(header.num_sections): @@ -76,7 +77,7 @@ class Reader(FormatReader): txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - html = txt_to_markdown(txt) + html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 493fdf3967..75dd516360 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -31,14 +31,9 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read().decode(ienc, 'replace') - if options.single_line_paras: - txt = txt.replace('\r\n', '\n') - txt = txt.replace('\r', '\n') - txt = txt.replace('\n', '\n\n') - log.debug('Running text though markdown conversion...') try: - html = txt_to_markdown(txt) + html = txt_to_markdown(txt, single_line_paras=options.single_line_paras) except RuntimeError: raise ValueError('This txt file has malformed markup, it cannot be' 'converted by calibre. 
See http://daringfireball.net/projects/markdown/syntax') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index ddb9b6a121..3005d633b8 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -13,7 +13,11 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -def txt_to_markdown(txt, title=''): +def txt_to_markdown(txt, title='', single_line_paras=False): + if single_line_paras: + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt = txt.replace('\n', '\n\n') md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], safe_mode=False,) diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py new file mode 100644 index 0000000000..cc7582c7f6 --- /dev/null +++ b/src/calibre/gui2/convert/pdb_input.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +from calibre.gui2.convert.pdb_input_ui import Ui_Form +from calibre.gui2.convert import Widget + +class PluginWidget(Widget, Ui_Form): + + TITLE = _('PDB Input') + HELP = _('Options specific to')+' PDB '+_('input') + + def __init__(self, parent, get_option, get_help, db=None, book_id=None): + Widget.__init__(self, parent, 'txt_input', + ['single_line_paras']) + self.db, self.book_id = db, book_id + self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui new file mode 100644 index 0000000000..191e749833 --- /dev/null +++ b/src/calibre/gui2/convert/pdb_input.ui @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<ui version="4.0"> + <class>Form</class> + <widget class="QWidget" name="Form"> + <property name="geometry"> + <rect> + <x>0</x> + <y>0</y> + <width>400</width> + <height>300</height> + </rect> + 
</property> + <property name="windowTitle"> + <string>Form</string> + </property> + <layout class="QGridLayout" name="gridLayout"> + <item row="1" column="0"> + <spacer name="verticalSpacer"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>20</width> + <height>213</height> + </size> + </property> + </spacer> + </item> + <item row="0" column="0"> + <widget class="QCheckBox" name="opt_single_line_paras"> + <property name="text"> + <string>Treat each &line as a paragraph</string> + </property> + </widget> + </item> + </layout> + </widget> + <resources/> + <connections/> +</ui> From 1f014caf4454c5831ffb450b05569018a247f71d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 22 Aug 2009 23:15:23 -0400 Subject: [PATCH 004/120] Remove excessive newlines from TXT output. --- src/calibre/ebooks/txt/txtml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 24a9fb0878..09f9d5d50c 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -90,8 +90,8 @@ class TXTMLizer(object): text = re.sub('[ ]+', ' ', text) # Remove excessive newlines. - #text = re.sub('\n[ ]+\n', '\n\n', text) - #text = re.sub('\n{3,}', '\n\n', text) + text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub('\n{5,}', '\n\n\n\n', text) # Replace spaces at the beginning and end of lines text = re.sub('(?imu)^[ ]+', '', text) From 5589a0644740387a3a024f10933cf060fad2f413 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 28 Aug 2009 22:36:40 -0400 Subject: [PATCH 005/120] fix #3322: PDB input parameter not saved. 
--- src/calibre/gui2/convert/pdb_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py index cc7582c7f6..4b0ba73fda 100644 --- a/src/calibre/gui2/convert/pdb_input.py +++ b/src/calibre/gui2/convert/pdb_input.py @@ -13,7 +13,7 @@ class PluginWidget(Widget, Ui_Form): HELP = _('Options specific to')+' PDB '+_('input') def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, 'txt_input', + Widget.__init__(self, parent, 'pdb_input', ['single_line_paras']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) From 813d9eb14f921a5926f29eda38ee9af438ec251f Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 29 Aug 2009 11:09:49 -0400 Subject: [PATCH 006/120] Fix Bug #3338: Handle strange characters in eReader metadata title. --- src/calibre/ebooks/pdb/ereader/reader132.py | 2 +- src/calibre/ebooks/pdb/ereader/reader202.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index d44eb2c561..806d62c977 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -105,7 +105,7 @@ class Reader132(FormatReader): if not os.path.exists(output_dir): os.makedirs(output_dir) - html = u'<html><head><title>%s' % self.mi.title + html = u'%s' % self.mi.title.decode('utf-8', 'replace') pml = u'' for i in range(1, self.header_record.num_text_pages + 1): diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py index 18281a208e..da8fadd5d9 100644 --- a/src/calibre/ebooks/pdb/ereader/reader202.py +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ -93,7 +93,7 @@ class Reader202(FormatReader): pml += self.get_text_page(i) html = u'%s%s' % \ - (self.mi.title, pml_to_html(pml)) + 
(self.mi.title.decode('utf-8', 'replace'), pml_to_html(pml)) with CurrentDir(output_dir): with open('index.html', 'wb') as index: From 6a55edf9cd3c0516e849ce4712de3da240f4f35b Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 31 Aug 2009 07:35:06 -0400 Subject: [PATCH 007/120] Update descripiton. --- src/calibre/gui2/dialogs/config/add_save.ui | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/dialogs/config/add_save.ui b/src/calibre/gui2/dialogs/config/add_save.ui index 513be73e54..ef1a867cd2 100644 --- a/src/calibre/gui2/dialogs/config/add_save.ui +++ b/src/calibre/gui2/dialogs/config/add_save.ui @@ -70,7 +70,7 @@ - Here you can control how calibre will save your books when you click the Save to Disk button: + Here you can control how calibre will save your books when you click the Save to Disk or Send to Device buttons: true From 13a4379063735c589c561a73a9493415d385960e Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 31 Aug 2009 21:03:00 -0400 Subject: [PATCH 008/120] Implement bug #3359: Make markdown processing of text files optional. 
--- src/calibre/ebooks/oeb/base.py | 4 ++-- src/calibre/ebooks/pdb/palmdoc/reader.py | 8 +++++--- src/calibre/ebooks/pdb/ztxt/reader.py | 7 +++++-- src/calibre/ebooks/txt/input.py | 23 ++++++++++++++++------- src/calibre/ebooks/txt/processor.py | 24 ++++++++++++++++-------- 5 files changed, 44 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 3f9e6a4d4a..2e06fffe4e 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -934,7 +934,7 @@ class Manifest(object): self.oeb.log.debug('Converting', self.href, '...') - from calibre.ebooks.txt.processor import txt_to_markdown + from calibre.ebooks.txt.processor import convert_markdown title = self.oeb.metadata.title if title: @@ -942,7 +942,7 @@ class Manifest(object): else: title = _('Unknown') - return self._parse_xhtml(txt_to_markdown(data, title)) + return self._parse_xhtml(convert_markdown(data, title)) def _parse_css(self, data): diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index e1935db566..8992382597 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -13,8 +13,8 @@ import struct from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import opf_writer -from calibre.ebooks.txt.processor import txt_to_markdown +from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ + opf_writer class HeaderRecord(object): ''' @@ -62,7 +62,9 @@ class Reader(FormatReader): txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) + if self.single_line_paras: + txt = separate_paragraphs(txt) + html = convert_basic(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git 
a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 86c5abfe82..664f498bee 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -12,7 +12,8 @@ import os, struct, zlib from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer +from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ + opf_writer SUPPORTED_VERSION = (1, 40) @@ -77,7 +78,9 @@ class Reader(FormatReader): txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) + if self.single_line_paras: + txt = separate_paragraphs(txt) + html = convert_basic(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5d84a1bde1..2b0245c98b 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,7 +7,8 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import txt_to_markdown +from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ + separate_paragraphs class TXTInput(InputFormatPlugin): @@ -21,6 +22,8 @@ class TXTInput(InputFormatPlugin): help=_('Normally calibre treats blank lines as paragraph markers. 
' 'With this option it will assume that every line represents ' 'a paragraph instead.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input though the markdown processor.')), ]) def convert(self, stream, options, file_ext, log, @@ -31,12 +34,18 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read().decode(ienc, 'replace') - log.debug('Running text though markdown conversion...') - try: - html = txt_to_markdown(txt, single_line_paras=options.single_line_paras) - except RuntimeError: - raise ValueError('This txt file has malformed markup, it cannot be' - 'converted by calibre. See http://daringfireball.net/projects/markdown/syntax') + if options.single_line_paras: + txt = separate_paragraphs(txt) + + if options.markdown: + log.debug('Running text though markdown conversion...') + try: + html = convert_markdown(txt) + except RuntimeError: + raise ValueError('This txt file has malformed markup, it cannot be' + 'converted by calibre. See http://daringfireball.net/projects/markdown/syntax') + else: + html = convert_basic(txt) from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 3005d633b8..94df216616 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -5,6 +5,7 @@ Read content from txt file. 
''' import os +import re from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator @@ -13,18 +14,25 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -def txt_to_markdown(txt, title='', single_line_paras=False): - if single_line_paras: - txt = txt.replace('\r\n', '\n') - txt = txt.replace('\r', '\n') - txt = txt.replace('\n', '\n\n') +HTML_TEMPLATE = u'%s%s' + +def convert_basic(txt, title=''): + lines = [] + for line in txt.splitlines(): + lines.append('

%s

' % line) + return HTML_TEMPLATE % (title, '\n'.join(lines)) + +def convert_markdown(txt, title=''): md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], safe_mode=False,) - html = u'%s%s' % (title, - md.convert(txt)) + return HTML_TEMPLATE % (title, md.convert(txt)) - return html +def separate_paragraphs(txt): + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt) + return txt def opf_writer(path, opf_name, manifest, spine, mi): opf = OPFCreator(path, mi) From 9a30868d8372d3b07f34938827734b3700f9b4ed Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 31 Aug 2009 21:05:47 -0400 Subject: [PATCH 009/120] Add markdown option to TXT input in GUI. --- src/calibre/gui2/convert/txt_input.py | 2 +- src/calibre/gui2/convert/txt_input.ui | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 71dbbe1fe2..3d17eefe0d 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, 'txt_input', - ['single_line_paras']) + ['single_line_paras', 'markdown']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 191e749833..353144b46b 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -14,7 +14,7 @@ Form
- + Qt::Vertical @@ -34,6 +34,13 @@
+ + + + Process using markdown + + + From 26a0327943421c677a9a395a6395f8d3082080e3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 1 Sep 2009 06:21:40 -0400 Subject: [PATCH 010/120] TXT input convert_basic fixes: Make it handle html reserved characters in the text and detect paragraphs correctly. --- src/calibre/ebooks/txt/processor.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 94df216616..f6503c0bc5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -7,6 +7,7 @@ Read content from txt file. import os import re +from calibre import prepare_string_for_xml from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator @@ -14,12 +15,28 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -HTML_TEMPLATE = u'%s%s' +HTML_TEMPLATE = u'%s\n%s\n' def convert_basic(txt, title=''): lines = [] + # Strip whitespace from the beginning and end of the line. Also replace + # all line breaks with \n. for line in txt.splitlines(): - lines.append('

%s

' % line) + lines.append(line.strip()) + txt = '\n'.join(lines) + + # Remove blank lines from the beginning and end of the document. + txt = re.sub('^\s+(?=.)', '', txt) + txt = re.sub('(?<=.)\s+$', '', txt) + # Remove excessive line breaks. + txt = re.sub('\n{3,}', '\n\n', txt) + + lines = [] + # Split into paragraphs based on having a blank line between text. + for line in txt.split('\n\n'): + if line.strip(): + lines.append('

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + return HTML_TEMPLATE % (title, '\n'.join(lines)) def convert_markdown(txt, title=''): From f08775dda72c26589f8c85f74102764ac6030b99 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 1 Sep 2009 17:27:24 -0400 Subject: [PATCH 011/120] HTML input: Use correct name for pdf line unwrapping option. Conversion preprocessor: Allow non pdf to html input to use the line unwrapping code. --- src/calibre/ebooks/conversion/preprocess.py | 22 ++++++++++----------- src/calibre/ebooks/html/input.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index cb2564ec0a..029b9752e1 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -223,16 +223,7 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - end_rules = [] - if getattr(self.extra_opts, 'unwrap_factor', None): - length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) - if length: - end_rules.append( - # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), - ) - - rules = self.PDFTOHTML + end_rules + rules = self.PDFTOHTML else: rules = [] @@ -246,7 +237,16 @@ class HTMLPreProcessor(object): (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') ) - for rule in self.PREPROCESS + pre_rules + rules: + end_rules = [] + if getattr(self.extra_opts, 'unwrap_factor', None): + length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) + if length: + end_rules.append( + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + ) + + for rule in self.PREPROCESS + pre_rules + rules + end_rules: html = 
rule[0].sub(rule[1], html) # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 92c2df9690..7b7bfdf3aa 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -262,7 +262,7 @@ class HTMLInput(InputFormatPlugin): ) ), - OptionRecommendation(name='pdf_line_length', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.5, help=_('Average line length for line breaking if the HTML is from a ' 'previous partial conversion of a PDF file.')), From fe267db425c06a60d2afc92ce11c7e1cdd7b1ca5 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 1 Sep 2009 19:45:22 -0400 Subject: [PATCH 012/120] Fix indent. --- src/calibre/ebooks/conversion/preprocess.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 029b9752e1..1aae939a06 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -237,14 +237,14 @@ class HTMLPreProcessor(object): (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') ) - end_rules = [] - if getattr(self.extra_opts, 'unwrap_factor', None): - length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) - if length: - end_rules.append( - # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), - ) + end_rules = [] + if getattr(self.extra_opts, 'unwrap_factor', None): + length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) + if length: + end_rules.append( + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + ) for rule in self.PREPROCESS + pre_rules + rules + end_rules: html = rule[0].sub(rule[1], html) From 
8c8b419e05b74ad4f5579b15ece7298cb8bf649f Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 1 Sep 2009 20:04:16 -0400 Subject: [PATCH 013/120] Fix bug #3380: TXT input encoding ignored. --- src/calibre/ebooks/txt/input.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 2b0245c98b..47b03181f0 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -51,6 +51,7 @@ class TXTInput(InputFormatPlugin): html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) + options.input_encoding = 'utf-8' base = os.getcwdu() if hasattr(stream, 'name'): base = os.path.dirname(stream.name) From 34e9857ab0d36acb5f9cd091e0200210f0bdf4c4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 17:05:11 -0400 Subject: [PATCH 014/120] TXT output: Optimize string manipulation. --- src/calibre/ebooks/txt/newlines.py | 2 +- src/calibre/ebooks/txt/txtml.py | 51 +++++++++++++++++++----------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/txt/newlines.py b/src/calibre/ebooks/txt/newlines.py index 983d356206..ae766a216f 100644 --- a/src/calibre/ebooks/txt/newlines.py +++ b/src/calibre/ebooks/txt/newlines.py @@ -19,7 +19,7 @@ class TxtNewlines(object): self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) def specified_newlines(newline, text): - if newline == os.linesep: + if newline == '\n': return text return text.replace(os.linesep, newline) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 206dff50ed..284cc22896 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -8,7 +8,8 @@ __docformat__ = 'restructuredtext en' Transform OEB content into plain text ''' -import os, re +import os +import re from lxml import etree @@ -43,15 +44,15 @@ class TXTMLizer(object): return self.mlize_spine() 
def mlize_spine(self): - output = u'' - output += self.get_toc() + output = [u''] + output.append(self.get_toc()) for item in self.oeb_book.spine: self.log.debug('Converting %s to TXT...' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = self.remove_newlines(content) - output += self.dump_text(etree.fromstring(content), stylizer) - output = self.cleanup_text(output) + output.append(self.get_text(etree.fromstring(content), stylizer)) + output = self.cleanup_text(u''.join(output)) return output @@ -64,13 +65,13 @@ class TXTMLizer(object): return text def get_toc(self): - toc = u'' + toc = [u''] if getattr(self.opts, 'inline_toc', None): self.log.debug('Generating table of contents...') - toc += u'%s\n\n' % _(u'Table of Contents:') + toc.append(u'%s\n\n' % _(u'Table of Contents:')) for item in self.oeb_book.toc: - toc += u'* %s\n\n' % item.title - return toc + toc.append(u'* %s\n\n' % item.title) + return ''.join(toc) def cleanup_text(self, text): self.log.debug('\tClean up text...') @@ -99,6 +100,17 @@ class TXTMLizer(object): return text + def get_text(self, elem, stylizer): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + @end: The last two characters of the text from the previous element. + This is used to determine if a blank line is needed when starting + a new block element. + ''' + + return u''.join(self.dump_text(elem, stylizer)) + def dump_text(self, elem, stylizer, end=''): ''' @elem: The element in the etree that we are working on. 
@@ -110,14 +122,14 @@ class TXTMLizer(object): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return u'' + return [''] - text = u'' + text = [''] style = stylizer.style(elem) if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return u'' + return [''] tag = barename(elem.tag) in_block = False @@ -125,20 +137,23 @@ class TXTMLizer(object): # Are we in a paragraph block? if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: in_block = True - if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text += os.linesep + os.linesep + if not end.endswith('\n\n') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text.append('\n\n') # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text += elem.text + text.append(elem.text) for item in elem: - text += self.dump_text(item, stylizer, text[-2:]) + en = u'' + if len(text) >= 2: + en = text[-1][-2:] + text += self.dump_text(item, stylizer, en) if in_block: - text += os.linesep + os.linesep + text.append('\n\n') if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': - text += elem.tail + text.append(elem.tail) return text From 8245d7d7e84ce7ec74ba00b94a34bad67ad44ed1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 17:56:54 -0400 Subject: [PATCH 015/120] TXT Output: Option to not add a blank line between paragraphs. Option to indent the beginning of each paragraph by a tab. 
--- src/calibre/ebooks/txt/output.py | 6 ++++++ src/calibre/ebooks/txt/txtml.py | 15 +++++++++++---- src/calibre/gui2/convert/txt_output.ui | 18 ++++++++++++++++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 6f0a768b8f..b3bda7fa9d 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -33,6 +33,12 @@ class TXTOutput(OutputFormatPlugin): OptionRecommendation(name='inline_toc', recommended_value=False, level=OptionRecommendation.LOW, help=_('Add Table of Contents to beginning of the book.')), + OptionRecommendation(name='flush_paras', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not add a blank line between paragraphs.')), + OptionRecommendation(name='indent_paras', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Add a tab at the beginning of each paragraph.')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 284cc22896..10c132e1e1 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -41,6 +41,7 @@ class TXTMLizer(object): self.log.info('Converting XHTML to TXT...') self.oeb_book = oeb_book self.opts = opts + return self.mlize_spine() def mlize_spine(self): @@ -92,12 +93,18 @@ class TXTMLizer(object): # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) - text = re.sub('\n{3,}', '\n\n', text) + if self.opts.flush_paras: + text = re.sub('\n{2,}', '\n', text) + else: + text = re.sub('\n{3,}', '\n\n', text) # Replace spaces at the beginning and end of lines text = re.sub('(?imu)^[ ]+', '', text) text = re.sub('(?imu)[ ]+$', '', text) + if self.opts.indent_paras: + text = re.sub('(?imu)^(?=.)', '\t', text) + return text def get_text(self, elem, stylizer): @@ -137,8 +144,8 @@ class TXTMLizer(object): # Are we in a paragraph block? 
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: in_block = True - if not end.endswith('\n\n') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text.append('\n\n') + if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text.append(u'\n\n') # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': @@ -151,7 +158,7 @@ class TXTMLizer(object): text += self.dump_text(item, stylizer, en) if in_block: - text.append('\n\n') + text.append(u'\n\n') if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': text.append(elem.tail) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 6e62040533..900198aca9 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -27,7 +27,7 @@ - + Qt::Vertical @@ -40,13 +40,27 @@ - + &Inline TOC + + + + Do not add a blank line between paragraphs. + + + + + + + Add a tab at the beginning of each paragraph + + + From 7fc881735f7f59e80276a332432f658cce91b917 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 18:16:27 -0400 Subject: [PATCH 016/120] PML output: Optimize string manipulation. 
--- src/calibre/ebooks/pml/pmlml.py | 65 +++++++++++++++++---------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 2438fd9bef..72b55d00b1 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -85,11 +85,11 @@ class PMLMLizer(object): def pmlmlize_spine(self): self.image_hrefs = {} self.link_hrefs = {} - output = u'' - output += self.get_cover_page() - output += u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk' - output += self.get_text() - output = output.replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) + output = [u''] + output.append(self.get_cover_page()) + output.append(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk') + output.append(self.get_text()) + output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) output = self.clean_text(output) return output @@ -101,29 +101,29 @@ class PMLMLizer(object): item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output def get_toc(self): - toc = u'' + toc = [u''] if self.opts.inline_toc: self.log.debug('Generating table of contents...') - toc += u'\\X0%s\\X0\n\n' % _('Table of Contents:') + toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:')) for item in self.oeb_book.toc: if item.href in self.link_hrefs.keys(): - toc += '* \\q="#%s"%s\\q\n' % (self.link_hrefs[item.href], item.title) + toc.append('* \\q="#%s"%s\\q\n' % (self.link_hrefs[item.href], item.title)) else: self.oeb.warn('Ignoring toc item: %s not found in document.' 
% item) - return toc + return ''.join(toc) def get_text(self): - text = u'' + text = [u''] for item in self.oeb_book.spine: self.log.debug('Converting %s to PML markup...' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - text += self.add_page_anchor(item) + text.append(self.add_page_anchor(item)) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) - return text + return ''.join(text) def add_page_anchor(self, page): return self.get_anchor(page, '') @@ -163,14 +163,14 @@ class PMLMLizer(object): def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return u'' + return [u''] - text = u'' + text = [u''] style = stylizer.style(elem) if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return u'' + return [u''] tag = barename(elem.tag) tag_count = 0 @@ -187,24 +187,25 @@ class PMLMLizer(object): if elem.attrib.get('src', None): if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') - text += '\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])] + text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])]) if tag == 'hr': - text += '\\w' + w = '\\w' width = elem.get('width') if width: - text += '="%s%%"' % width + w += '="%s%%"' % width else: - text += '="50%"' + w += '="50%"' + text.append(w) # Process style information that needs holds a single tag # Commented out because every page in an OEB book starts with this style #if style['page-break-before'] == 'always': - # text += '\\p' + # text.append('\\p') pml_tag = TAG_MAP.get(tag, None) if pml_tag and pml_tag not in tag_stack: tag_count += 1 - text += '\\%s' % pml_tag + text.append('\\%s' % pml_tag) tag_stack.append(pml_tag) # Special 
processing of tags that require an argument. @@ -219,27 +220,27 @@ class PMLMLizer(object): if href not in self.link_hrefs.keys(): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) href = self.link_hrefs[href] - text += '\\q="#%s"' % href + text.append('\\q="#%s"' % href) tag_count += 1 tag_stack.append('q') # Anchor ids id_name = elem.get('id') if id_name: - text += self.get_anchor(page, id_name) + text.append(self.get_anchor(page, id_name)) # Processes style information for s in STYLES: style_tag = s[1].get(style[s[0]], None) if style_tag and style_tag not in tag_stack: tag_count += 1 - text += '\\%s' % style_tag + text.append('\\%s' % style_tag) tag_stack.append(style_tag) # margin # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text += self.elem_text(elem, tag_stack) + text.append(self.elem_text(elem, tag_stack)) for item in elem: text += self.dump_text(item, stylizer, page, tag_stack) @@ -249,16 +250,16 @@ class PMLMLizer(object): close_tag_list.insert(0, tag_stack.pop()) text += self.close_tags(close_tag_list) if tag in SEPARATE_TAGS: - text += os.linesep + os.linesep + text.append(os.linesep + os.linesep) if 'block' not in tag_stack: - text += os.linesep + os.linesep + text.append(os.linesep + os.linesep) #if style['page-break-after'] == 'always': - # text += '\\p' + # text.append('\\p') if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': - text += self.elem_tail(elem, tag_stack) + text.append(self.elem_tail(elem, tag_stack)) return text @@ -276,10 +277,10 @@ class PMLMLizer(object): return text def close_tags(self, tags): - text = u'' + text = [u''] for i in range(0, len(tags)): tag = tags.pop() if tag != 'block': - text += '\\%s' % tag + text.append('\\%s' % tag) return text From 7c9b132a7e079e83e69562b955a8748f82e31e01 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 18:34:41 -0400 Subject: [PATCH 017/120] RB output: Optimize 
string manipulation. --- src/calibre/ebooks/rb/rbml.py | 52 +++++++++++++++++------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index 679ccaa39b..e9c8d98c10 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -71,12 +71,12 @@ class RBMLizer(object): def mlize_spine(self): self.link_hrefs = {} - output = u'' - output += self.get_cover_page() - output += u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk' - output += self.get_text() - output += u'' - output = output.replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) + output = [u''] + output.append(self.get_cover_page()) + output.append(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk') + output.append(self.get_text()) + output.append(u'') + output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) output = self.clean_text(output) return output @@ -92,26 +92,26 @@ class RBMLizer(object): return output def get_toc(self): - toc = u'' + toc = [u''] if self.opts.inline_toc: self.log.debug('Generating table of contents...') - toc += u'

%s

    \n' % _('Table of Contents:') + toc.append(u'

    %s

      \n' % _('Table of Contents:')) for item in self.oeb_book.toc: if item.href in self.link_hrefs.keys(): - toc += '
    • %s
    • \n' % (self.link_hrefs[item.href], item.title) + toc.append('
    • %s
    • \n' % (self.link_hrefs[item.href], item.title)) else: self.oeb.warn('Ignoring toc item: %s not found in document.' % item) - toc += '
    ' - return toc + toc.append('
') + return ''.join(toc) def get_text(self): - output = u'' + output = [u''] for item in self.oeb_book.spine: self.log.debug('Converting %s to RocketBook HTML...' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.add_page_anchor(item) + output.append(self.add_page_anchor(item)) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) - return output + return ''.join(output) def add_page_anchor(self, page): return self.get_anchor(page, '') @@ -135,14 +135,14 @@ class RBMLizer(object): def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return u'' + return [u''] - text = u'' + text = [u''] style = stylizer.style(elem) if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return u'' + return [u''] tag = barename(elem.tag) tag_count = 0 @@ -153,12 +153,12 @@ class RBMLizer(object): if elem.attrib.get('src', None): if page.abshref(elem.attrib['src']) not in self.name_map.keys(): self.name_map[page.abshref(elem.attrib['src'])] = unique_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys(), self.name_map.keys()) - text += '' % self.name_map[page.abshref(elem.attrib['src'])] + text.append('' % self.name_map[page.abshref(elem.attrib['src'])]) rb_tag = tag.upper() if tag in TAGS else None if rb_tag: tag_count += 1 - text += '<%s>' % rb_tag + text.append('<%s>' % rb_tag) tag_stack.append(rb_tag) # Anchors links @@ -172,14 +172,14 @@ class RBMLizer(object): if href not in self.link_hrefs.keys(): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) href = self.link_hrefs[href] - text += '' % href + text.append('' % href) tag_count += 1 tag_stack.append('A') # Anchor ids id_name = elem.get('id') if id_name: - text += self.get_anchor(page, id_name) + text.append(self.get_anchor(page, id_name)) # Processes style information for s in 
STYLES: @@ -187,12 +187,12 @@ class RBMLizer(object): if style_tag: style_tag = style_tag.upper() tag_count += 1 - text += '<%s>' % style_tag + text.append('<%s>' % style_tag) tag_stack.append(style_tag) # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text += prepare_string_for_xml(elem.text) + text.append(prepare_string_for_xml(elem.text)) for item in elem: text += self.dump_text(item, stylizer, page, tag_stack) @@ -204,14 +204,14 @@ class RBMLizer(object): text += self.close_tags(close_tag_list) if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': - text += prepare_string_for_xml(elem.tail) + text.append(prepare_string_for_xml(elem.tail)) return text def close_tags(self, tags): - text = u'' + text = [u''] for i in range(0, len(tags)): tag = tags.pop() - text += '' % tag + text.append('' % tag) return text From 017688ee5deefa3198c9349d2a98fa239b392281 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 18:46:47 -0400 Subject: [PATCH 018/120] FB2 output: Optimize string manipulation. TXT Output: Fix merge issue. 
--- src/calibre/ebooks/fb2/fb2ml.py | 66 ++++++++++++++++----------------- src/calibre/ebooks/txt/txtml.py | 2 +- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 3bc3424c39..36e65f6f0e 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -67,14 +67,14 @@ class FB2MLizer(object): def fb2mlize_spine(self): self.image_hrefs = {} self.link_hrefs = {} - output = self.fb2_header() - output += self.get_cover_page() - output += u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk' - output += self.get_text() - output += self.fb2_body_footer() - output += self.fb2mlize_images() - output += self.fb2_footer() - output = output.replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) + output = [self.fb2_header()] + output.append(self.get_cover_page()) + output.append(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk') + output.append(self.get_text()) + output.append(self.fb2_body_footer()) + output.append(self.fb2mlize_images()) + output.append(self.fb2_footer()) + output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) return u'\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) def fb2_header(self): @@ -117,25 +117,25 @@ class FB2MLizer(object): return output def get_toc(self): - toc = u'' + toc = [u''] if self.opts.inline_toc: self.log.debug('Generating table of contents...') - toc += u'

%s

' % _('Table of Contents:') + toc.append(u'

%s

' % _('Table of Contents:')) for item in self.oeb_book.toc: if item.href in self.link_hrefs.keys(): - toc += '

%s

\n' % (self.link_hrefs[item.href], item.title) + toc.append('

%s

\n' % (self.link_hrefs[item.href], item.title)) else: self.oeb.warn('Ignoring toc item: %s not found in document.' % item) - return toc + return ''.join(toc) def get_text(self): - text = u'' + text = [u''] for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - text += self.add_page_anchor(item) + text.append(self.add_page_anchor(item)) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) - return text + return ''.join(text) def fb2_body_footer(self): return u'\n\n' @@ -155,7 +155,7 @@ class FB2MLizer(object): return '' % aid def fb2mlize_images(self): - images = u'' + images = [u''] for item in self.oeb_book.manifest: if item.media_type in OEB_RASTER_IMAGES: try: @@ -174,23 +174,23 @@ class FB2MLizer(object): col = 1 col += 1 data += char - images += '%s\n' % (self.image_hrefs.get(item.href, '0000.JPEG'), item.media_type, data) + images.append('%s\n' % (self.image_hrefs.get(item.href, '0000.JPEG'), item.media_type, data)) except Exception as e: self.log.error('Error: Could not include file %s becuase ' \ '%s.' 
% (item.href, e)) - return images + return ''.join(images) def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return u'' + return [u''] - fb2_text = u'' + fb2_text = [u''] style = stylizer.style(elem) if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return u'' + return [u''] tag = barename(elem.tag) tag_count = 0 @@ -199,33 +199,33 @@ class FB2MLizer(object): if elem.attrib.get('src', None): if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): self.image_hrefs[page.abshref(elem.attrib['src'])] = '%s.jpg' % len(self.image_hrefs.keys()) - fb2_text += '' % self.image_hrefs[page.abshref(elem.attrib['src'])] + fb2_text.append('' % self.image_hrefs[page.abshref(elem.attrib['src'])]) if tag in TAG_LINKS: href = elem.get('href') if href: href = prepare_string_for_xml(page.abshref(href)) if '://' in href: - fb2_text += '' % href + fb2_text.append('' % href) else: if '#' not in href: href += '#' if href not in self.link_hrefs.keys(): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) href = self.link_hrefs[href] - fb2_text += '' % href + fb2_text.append('' % href) tag_count += 1 tag_stack.append('a') # Anchor ids id_name = elem.get('id') if id_name: - fb2_text += self.get_anchor(page, id_name) + fb2_text.append(self.get_anchor(page, id_name)) fb2_tag = TAG_MAP.get(tag, None) if fb2_tag and fb2_tag not in tag_stack: tag_count += 1 - fb2_text += '<%s>' % fb2_tag + fb2_text.append('<%s>' % fb2_tag) tag_stack.append(fb2_tag) # Processes style information @@ -233,15 +233,15 @@ class FB2MLizer(object): style_tag = s[1].get(style[s[0]], None) if style_tag: tag_count += 1 - fb2_text += '<%s>' % style_tag + fb2_text.append('<%s>' % style_tag) tag_stack.append(style_tag) if tag in TAG_SPACE: if not fb2_text or fb2_text[-1] != ' ': - fb2_text += ' ' + fb2_text.append(' ') if hasattr(elem, 'text') and elem.text 
!= None: - fb2_text += prepare_string_for_xml(elem.text) + fb2_text.append(prepare_string_for_xml(elem.text)) for item in elem: fb2_text += self.dump_text(item, stylizer, page, tag_stack) @@ -253,16 +253,16 @@ class FB2MLizer(object): if hasattr(elem, 'tail') and elem.tail != None: if 'p' not in tag_stack: - fb2_text += '

%s

' % prepare_string_for_xml(elem.tail) + fb2_text.append('

%s

' % prepare_string_for_xml(elem.tail)) else: - fb2_text += prepare_string_for_xml(elem.tail) + fb2_text.append(prepare_string_for_xml(elem.tail)) return fb2_text def close_tags(self, tags): - fb2_text = u'' + fb2_text = [u''] for i in range(0, len(tags)): fb2_tag = tags.pop() - fb2_text += '' % fb2_tag + fb2_text.append('' % fb2_tag) return fb2_text diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index eff239668d..c705bcf221 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -52,7 +52,7 @@ class TXTMLizer(object): stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = self.remove_newlines(content) - output += self.dump_text(etree.fromstring(content), stylizer)) + output += self.dump_text(etree.fromstring(content), stylizer) output = self.cleanup_text(u''.join(output)) return output From 1214b476a7a798dd46bdf13b2c6a1b76e0e507ff Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 20:22:18 -0400 Subject: [PATCH 019/120] GUI TXT Output: Correct import. 
--- src/calibre/gui2/convert/txt_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index c2474ac4b8..eca85f1292 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' from calibre.gui2.convert.txt_output_ui import Ui_Form from calibre.gui2.convert import Widget -from calibre.ebooks.txt.writer import TxtNewlines +from calibre.ebooks.txt.newlines import TxtNewlines from calibre.gui2.widgets import BasicComboModel newline_model = None From 392d9033b5689df1a002c7f2ce1c340300985a8f Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 2 Sep 2009 21:04:17 -0400 Subject: [PATCH 020/120] Implement bug #3232: Ask the user if they want to reconvert already converted files. --- src/calibre/gui2/tools.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 91a25ca426..a6dec46ab2 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -12,7 +12,7 @@ import cPickle from PyQt4.Qt import QDialog from calibre.ptempfile import PersistentTemporaryFile -from calibre.gui2 import warning_dialog +from calibre.gui2 import warning_dialog, question_dialog from calibre.gui2.convert.single import NoSupportedInputFormats from calibre.gui2.convert.single import Config as SingleConfig from calibre.gui2.convert.bulk import BulkConfig @@ -42,6 +42,9 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, out_format result = d.exec_() if result == QDialog.Accepted: + if not convert_existing(parent, db, [book_id], d.output_format): + continue + mi = db.get_metadata(book_id, True) in_file = db.format_abspath(book_id, d.input_format, True) @@ -100,6 +103,7 @@ def convert_bulk_ebook(parent, db, book_ids, out_format=None): output_format = d.output_format recs = 
cPickle.loads(d.recommendations) + book_ids = convert_existing(parent, db, book_ids, output_format) for i, book_id in enumerate(book_ids): temp_files = [] @@ -186,4 +190,15 @@ def fetch_scheduled_recipe(recipe, script): return 'gui_convert', args, _('Fetch news from ')+recipe.title, fmt.upper(), [pt] +def convert_existing(parent, db, book_ids, output_format): + already_converted_ids = [] + already_converted_titles = [] + for book_id in book_ids: + if db.has_format(book_id, output_format, index_is_id=True): + already_converted_ids.append(book_id) + already_converted_titles.append(db.get_metadata(book_id, True).title) + if not question_dialog(parent, _('Convert existing'), _('The following books have already been converted to %s format. Do you wish to reconvert them?' % output_format), '\n'.join(already_converted_titles)): + book_ids = [x for x in book_ids if x not in already_converted_ids] + + return book_ids From ebb2cad20aefbd3602327a14b6f63194b293db72 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 3 Sep 2009 07:22:25 -0400 Subject: [PATCH 021/120] Fix bug #2913: crop PDF works. 
--- src/calibre/ebooks/pdf/manipulate/crop.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py index de18dc66dc..8957320280 100644 --- a/src/calibre/ebooks/pdf/manipulate/crop.py +++ b/src/calibre/ebooks/pdf/manipulate/crop.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, James Beal , ' \ @@ -10,7 +9,9 @@ __docformat__ = 'restructuredtext en' Crop a pdf file ''' -import sys, re +import sys +import re +from decimal import Decimal from optparse import OptionGroup, Option from calibre.ebooks.metadata.meta import metadata_from_formats @@ -108,11 +109,11 @@ def crop_pdf(pdf_path, opts, metadata=None): mo = bounding_regex.search(blines.next()) if mo == None: raise Exception('Error in bounding file %s' % opts.bounding) - page.mediaBox.upperRight = (float(mo.group('top_x')), float(mo.group('top_y'))) - page.mediaBox.lowerLeft = (float(mo.group('bottom_x')), float(mo.group('bottom_y'))) + page.mediaBox.upperRight = (float(mo.group('top_x')), Decimal(mo.group('top_y'))) + page.mediaBox.lowerLeft = (float(mo.group('bottom_x')), Decimal(mo.group('bottom_y'))) else: - page.mediaBox.upperRight = (page.bleedBox.getUpperRight_x() - float(opts.top_right_x), page.bleedBox.getUpperRight_y() - float(opts.top_right_y)) - page.mediaBox.lowerLeft = (page.bleedBox.getLowerLeft_x() + float(opts.bottom_left_x), page.bleedBox.getLowerLeft_y() + float(opts.bottom_left_y)) + page.mediaBox.upperRight = (page.bleedBox.getUpperRight_x() - Decimal(opts.top_right_x), page.bleedBox.getUpperRight_y() - Decimal(opts.top_right_y)) + page.mediaBox.lowerLeft = (page.bleedBox.getLowerLeft_x() + Decimal(opts.bottom_left_x), page.bleedBox.getLowerLeft_y() + Decimal(opts.bottom_left_y)) output_pdf.addPage(page) with open(opts.output, 'wb') as output_file: From ca124fba0354d92c1caf2402fad11d923e5dc3bc Mon Sep 17 
00:00:00 2001 From: John Schember Date: Fri, 4 Sep 2009 21:54:30 -0400 Subject: [PATCH 022/120] Fix bug #3405: PML small caps makes characters capital. --- src/calibre/ebooks/pml/pmlconverter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 1c3c749f76..2c97f509fa 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -34,7 +34,7 @@ PML_HTML_RULES = [ (re.compile(r'\\B(?P.*?)\\B', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sp(?P.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sb(?P.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''), (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), From 5196f6d006d4c8a3c52aa8f7e7f8f353ab1456a9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Sep 2009 08:12:27 -0400 Subject: [PATCH 023/120] Only ask to convert existing when there are already converted books. 
--- src/calibre/gui2/tools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 7e1655b425..5670031e68 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -198,10 +198,11 @@ def convert_existing(parent, db, book_ids, output_format): already_converted_ids.append(book_id) already_converted_titles.append(db.get_metadata(book_id, True).title) - if not question_dialog(parent, _('Convert existing'), - _('The following books have already been converted to %s format. ' - 'Do you wish to reconvert them?') % output_format, - '\n'.join(already_converted_titles)): - book_ids = [x for x in book_ids if x not in already_converted_ids] + if already_converted_ids: + if not question_dialog(parent, _('Convert existing'), + _('The following books have already been converted to %s format. ' + 'Do you wish to reconvert them?') % output_format, + '\n'.join(already_converted_titles)): + book_ids = [x for x in book_ids if x not in already_converted_ids] return book_ids From 0bba611fe0d763fa6323962c304416448527ee9a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Sep 2009 08:44:32 -0400 Subject: [PATCH 024/120] Fine tune scrolling in the viewer. 
--- src/calibre/gui2/viewer/documentview.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index 3cda941be6..b8dcbb8a7f 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -541,11 +541,6 @@ class DocumentView(QWebView): if self.manager is not None: self.manager.scrolled(self.scroll_fraction) - def wheel_event(self, down=True): - QWebView.wheelEvent(self, - QWheelEvent(QPoint(100, 100), (-120 if down else 120), - Qt.NoButton, Qt.NoModifier)) - def next_page(self): delta_y = self.document.window_height - 25 if self.document.at_bottom: @@ -654,9 +649,9 @@ class DocumentView(QWebView): else: self.scroll_to(1) elif key in [Qt.Key_J]: - self.wheel_event() + self.scroll_by(y=15) elif key in [Qt.Key_K]: - self.wheel_event(down=False) + self.scroll_by(y=-15) elif key in [Qt.Key_H]: self.scroll_by(x=-15) elif key in [Qt.Key_L]: From ec48f4029ba346bc10be45644bfbe7feefe0629a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Sep 2009 16:48:32 -0400 Subject: [PATCH 025/120] GUI: Regex builder for removing header and footer. 
--- src/calibre/gui2/convert/__init__.py | 6 +- src/calibre/gui2/convert/regex_builder.py | 138 ++++++++++++++++++ src/calibre/gui2/convert/regex_builder.ui | 80 ++++++++++ .../gui2/convert/structure_detection.py | 9 +- .../gui2/convert/structure_detection.ui | 45 ++---- .../convert/{xpath_edit.ui => xexp_edit.ui} | 0 src/calibre/gui2/convert/xpath_wizard.py | 2 +- 7 files changed, 247 insertions(+), 33 deletions(-) create mode 100644 src/calibre/gui2/convert/regex_builder.py create mode 100644 src/calibre/gui2/convert/regex_builder.ui rename src/calibre/gui2/convert/{xpath_edit.ui => xexp_edit.ui} (100%) diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index bbab3a1edd..a96008b1c3 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -77,6 +77,7 @@ class Widget(QWidget): def get_value(self, g): from calibre.gui2.convert.xpath_wizard import XPathEdit + from calibre.gui2.convert.regex_builder import RegexEdit ret = self.get_value_handler(g) if ret != 'this is a dummy return value, xcswx1avcx4x': return ret @@ -94,12 +95,15 @@ class Widget(QWidget): return bool(g.isChecked()) elif isinstance(g, XPathEdit): return g.xpath if g.xpath else None + elif isinstance(g, RegexEdit): + return g.regex if g.regex else None else: raise Exception('Can\'t get value from %s'%type(g)) def set_value(self, g, val): from calibre.gui2.convert.xpath_wizard import XPathEdit + from calibre.gui2.convert.regex_builder import RegexEdit if self.set_value_handler(g, val): return if isinstance(g, (QSpinBox, QDoubleSpinBox)): @@ -116,7 +120,7 @@ class Widget(QWidget): g.setCurrentIndex(idx) elif isinstance(g, QCheckBox): g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked) - elif isinstance(g, XPathEdit): + elif isinstance(g, (XPathEdit, RegexEdit)): g.edit.setText(val if val else '') else: raise Exception('Can\'t set value %s in %s'%(repr(val), diff --git a/src/calibre/gui2/convert/regex_builder.py 
b/src/calibre/gui2/convert/regex_builder.py new file mode 100644 index 0000000000..07c63de93a --- /dev/null +++ b/src/calibre/gui2/convert/regex_builder.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import re + +from PyQt4.QtCore import SIGNAL, Qt +from PyQt4.QtGui import QDialog, QWidget, QDialogButtonBox, QFileDialog, \ + QBrush, QSyntaxHighlighter, QTextCharFormat + +from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder +from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit +from calibre.gui2 import qstring_to_unicode +from calibre.gui2 import error_dialog +from calibre.ebooks.oeb.iterator import EbookIterator +from calibre.gui2.dialogs.choose_format import ChooseFormatDialog + +class RegexHighlighter(QSyntaxHighlighter): + + def __init__(self, *args): + QSyntaxHighlighter.__init__(self, *args) + + self.regex = u'' + + def update_regex(self, regex): + self.regex = qstring_to_unicode(regex) + self.rehighlight() + + def highlightBlock(self, text): + valid_regex = True + text = qstring_to_unicode(text) + format = QTextCharFormat() + format.setBackground(QBrush(Qt.yellow)) + + if self.regex: + try: + for mo in re.finditer(self.regex, text): + self.setFormat(mo.start(), mo.end() - mo.start(), format) + except: + valid_regex = False + self.emit(SIGNAL('regex_valid(PyQt_PyObject)'), valid_regex) + +class RegexBuilder(QDialog, Ui_RegexBuilder): + + def __init__(self, db, book_id, regex, *args): + QDialog.__init__(self, *args) + self.setupUi(self) + + self.regex.setText(regex) + self.regex_valid(True) + self.highlighter = RegexHighlighter(self.preview.document()) + + if not db or not book_id: + self.button_box.addButton(QDialogButtonBox.Open) + else: + self.select_format(db, book_id) + + self.connect(self.button_box, SIGNAL('clicked(QAbstractButton*)'), self.button_clicked) + self.connect(self.regex, SIGNAL('textChanged(QString)'), 
self.highlighter.update_regex) + self.connect(self.highlighter, SIGNAL('regex_valid(PyQt_PyObject)'), self.regex_valid) + + def regex_valid(self, valid): + if valid: + self.regex.setStyleSheet('QLineEdit { color: black; background-color: white; }') + else: + self.regex.setStyleSheet('QLineEdit { color: black; background-color: rgb(255,0,0,20%); }') + + def select_format(self, db, book_id): + format = None + formats = db.formats(book_id, index_is_id=True).upper().split(',') + if len(formats) == 1: + format = formats[0] + elif len(formats) > 1: + d = ChooseFormatDialog(self, _('Choose the format to view'), formats) + d.exec_() + if d.result() == QDialog.Accepted: + format = d.format() + + if not format: + error_dialog(self, _('No formats available'), _('Cannot build regex using the GUI builder without a book.')) + QDialog.reject() + else: + self.open_book(db.format_abspath(book_id, format, index_is_id=True)) + + def open_book(self, pathtoebook): + self.iterator = EbookIterator(pathtoebook) + self.iterator.__enter__() + text = [u''] + for path in self.iterator.spine: + html = open(path, 'rb').read().decode(path.encoding, 'replace') + text.append(html) + self.preview.setPlainText('\n\n'.join(text)) + + def button_clicked(self, button): + if button == self.button_box.button(QDialogButtonBox.Open): + name = QFileDialog.getOpenFileName(self, _('Open book'), _('~')) + if name: + self.open_book(qstring_to_unicode(name)) + if button == self.button_box.button(QDialogButtonBox.Ok): + self.accept() + +class RegexEdit(QWidget, Ui_Edit): + + def __init__(self, parent=None): + QWidget.__init__(self, parent) + self.setupUi(self) + + self.book_id = None + self.db = None + + self.connect(self.button, SIGNAL('clicked()'), self.builder) + + def builder(self): + bld = RegexBuilder(self.db, self.book_id, self.edit.text(), self) + if bld.exec_() == bld.Accepted: + self.edit.setText(bld.regex.text()) + + def set_msg(self, msg): + self.msg.setText(msg) + + def set_book_id(self, book_id): + 
self.book_id = book_id + + def set_db(self, db): + self.db = db + + @property + def text(self): + return unicode(self.edit.text()) + + @property + def regex(self): + return self.text + + def check(self): + return True diff --git a/src/calibre/gui2/convert/regex_builder.ui b/src/calibre/gui2/convert/regex_builder.ui new file mode 100644 index 0000000000..3448c4dded --- /dev/null +++ b/src/calibre/gui2/convert/regex_builder.ui @@ -0,0 +1,80 @@ + + + RegexBuilder + + + + 0 + 0 + 662 + 505 + + + + Regex Builder + + + + + + Preview + + + + + + false + + + true + + + Qt::TextSelectableByMouse + + + + + + + + + + Qt::Horizontal + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + + Regex: + + + + + + + + + + + + button_box + rejected() + RegexBuilder + reject() + + + 316 + 260 + + + 286 + 274 + + + + + diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py index 00c5e30d6b..65e6a1d62a 100644 --- a/src/calibre/gui2/convert/structure_detection.py +++ b/src/calibre/gui2/convert/structure_detection.py @@ -34,11 +34,18 @@ class StructureDetectionWidget(Widget, Ui_Form): self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):')) self.opt_page_breaks_before.set_msg(_('Insert page breaks before ' '(XPath expression):')) + self.opt_header_regex.set_msg(_('Header regular expression:')) + self.opt_header_regex.set_book_id(book_id) + self.opt_header_regex.set_db(db) + self.opt_footer_regex.set_msg(_('Footer regular expression:')) + self.opt_footer_regex.set_book_id(book_id) + self.opt_footer_regex.set_db(db) + def pre_commit_check(self): for x in ('header_regex', 'footer_regex'): x = getattr(self, 'opt_'+x) try: - pat = unicode(x.text()) + pat = unicode(x.regex) re.compile(pat) except Exception, err: error_dialog(self, _('Invalid regular expression'), diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui index 6952abce96..e4414473f5 100644 --- 
a/src/calibre/gui2/convert/structure_detection.ui +++ b/src/calibre/gui2/convert/structure_detection.ui @@ -28,8 +28,7 @@
- - + @@ -45,27 +44,17 @@ - - - - &Footer regular expression: - - - opt_footer_regex - - - - + &Preprocess input file to possibly improve structure detection - + - + Qt::Vertical @@ -78,17 +67,7 @@ - - - - &Header regular expression: - - - opt_header_regex - - - - + Remove F&ooter @@ -102,11 +81,11 @@ - - + + - - + + @@ -117,6 +96,12 @@
convert/xpath_wizard.h
1 + + RegexEdit + QWidget +
regex_builder.h
+ 1 +
diff --git a/src/calibre/gui2/convert/xpath_edit.ui b/src/calibre/gui2/convert/xexp_edit.ui similarity index 100% rename from src/calibre/gui2/convert/xpath_edit.ui rename to src/calibre/gui2/convert/xexp_edit.ui diff --git a/src/calibre/gui2/convert/xpath_wizard.py b/src/calibre/gui2/convert/xpath_wizard.py index 9b8e44ddaa..ef42a876d3 100644 --- a/src/calibre/gui2/convert/xpath_wizard.py +++ b/src/calibre/gui2/convert/xpath_wizard.py @@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en' from PyQt4.Qt import QDialog, QWidget, SIGNAL, Qt, QDialogButtonBox, QVBoxLayout from calibre.gui2.convert.xpath_wizard_ui import Ui_Form -from calibre.gui2.convert.xpath_edit_ui import Ui_Form as Ui_Edit +from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit class WizardWidget(QWidget, Ui_Form): From 2de625b3e1355737a54185deb3bb31ee6a56ddd8 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Sep 2009 12:59:21 -0400 Subject: [PATCH 026/120] Implement #3418: Handle print style formatting for TXT input. --- src/calibre/ebooks/pdb/input.py | 6 ++++++ src/calibre/ebooks/pdb/palmdoc/reader.py | 9 ++++++--- src/calibre/ebooks/pdb/ztxt/reader.py | 9 ++++++--- src/calibre/ebooks/txt/input.py | 13 +++++++++++-- src/calibre/ebooks/txt/processor.py | 6 +++++- src/calibre/gui2/convert/pdb_input.py | 2 +- src/calibre/gui2/convert/pdb_input.ui | 9 ++++++++- src/calibre/gui2/convert/txt_input.py | 2 +- src/calibre/gui2/convert/txt_input.ui | 13 ++++++++++--- 9 files changed, 54 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 3ad1a6121c..8f4751b42b 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -22,6 +22,12 @@ class PDBInput(InputFormatPlugin): help=_('Normally calibre treats blank lines as paragraph markers. 
' 'With this option it will assume that every line represents ' 'a paragraph instead.')), + OptionRecommendation(name='print_formatted_paras', recommended_value=False, + help=_('Normally calibre treats blank lines as paragraph markers. ' + 'With this option it will assume that every line starting with ' + 'an indent (either a tab or 2+ spaces) represents a paragraph.' + 'Paragraphs end when the next line that starts with an indent ' + 'is reached.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 8992382597..aaa121f1ed 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -13,8 +13,8 @@ import struct from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ - opf_writer +from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ + separate_paragraphs_single_line class HeaderRecord(object): ''' @@ -36,6 +36,7 @@ class Reader(FormatReader): self.log = log self.encoding = options.input_encoding self.single_line_paras = options.single_line_paras + self.print_formatted_paras = options.print_formatted_paras self.sections = [] for i in range(header.num_sections): @@ -63,7 +64,9 @@ class Reader(FormatReader): self.log.info('Converting text to OEB...') if self.single_line_paras: - txt = separate_paragraphs(txt) + txt = separate_paragraphs_single_line(txt) + if self.print_formatted_paras: + txt = separate_paragraphs_print_formatted(txt) html = convert_basic(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 664f498bee..4379159d81 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ 
b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -12,8 +12,8 @@ import os, struct, zlib from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ - opf_writer +from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ + separate_paragraphs_single_line SUPPORTED_VERSION = (1, 40) @@ -40,6 +40,7 @@ class Reader(FormatReader): self.log = log self.encoding = options.input_encoding self.single_line_paras = options.single_line_paras + self.print_formatted_paras = options.print_formatted_paras self.sections = [] for i in range(header.num_sections): @@ -79,7 +80,9 @@ class Reader(FormatReader): self.log.info('Converting text to OEB...') if self.single_line_paras: - txt = separate_paragraphs(txt) + txt = separate_paragraphs_single_line(txt) + if self.print_formatted_paras: + txt = separate_paragraphs_print_formatted(txt) html = convert_basic(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index f52bfa6fb5..eb86113f7a 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -8,7 +8,7 @@ import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ - separate_paragraphs + separate_paragraphs_single_line, separate_paragraphs_print_formatted class TXTInput(InputFormatPlugin): @@ -22,6 +22,12 @@ class TXTInput(InputFormatPlugin): help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line represents ' 'a paragraph instead.')), + OptionRecommendation(name='print_formatted_paras', recommended_value=False, + help=_('Normally calibre treats blank lines as paragraph markers. 
' + 'With this option it will assume that every line starting with ' + 'an indent (either a tab or 2+ spaces) represents a paragraph.' + 'Paragraphs end when the next line that starts with an indent ' + 'is reached.')), OptionRecommendation(name='markdown', recommended_value=False, help=_('Run the text input through the markdown pre-processor. To ' 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), @@ -35,8 +41,11 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read().decode(ienc, 'replace') + # Adjust paragraph formatting as requested if options.single_line_paras: - txt = separate_paragraphs(txt) + txt = separate_paragraphs_single_line(txt) + if options.print_formatted_paras: + txt = separate_paragraphs_print_formatted(txt) if options.markdown: log.debug('Running text though markdown conversion...') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f6503c0bc5..9d0e1283c1 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -45,12 +45,16 @@ def convert_markdown(txt, title=''): safe_mode=False,) return HTML_TEMPLATE % (title, md.convert(txt)) -def separate_paragraphs(txt): +def separate_paragraphs_single_line(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt) return txt +def separate_paragraphs_print_formatted(txt): + txt = re.sub('(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt) + return txt + def opf_writer(path, opf_name, manifest, spine, mi): opf = OPFCreator(path, mi) opf.create_manifest(manifest) diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py index 4b0ba73fda..058f589856 100644 --- a/src/calibre/gui2/convert/pdb_input.py +++ b/src/calibre/gui2/convert/pdb_input.py @@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): 
Widget.__init__(self, parent, 'pdb_input', - ['single_line_paras']) + ['single_line_paras', 'print_formatted_paras']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui index 191e749833..2b632b1a33 100644 --- a/src/calibre/gui2/convert/pdb_input.ui +++ b/src/calibre/gui2/convert/pdb_input.ui @@ -14,7 +14,7 @@ Form - + Qt::Vertical @@ -34,6 +34,13 @@
+ + + + Assume print formatting + + + diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 3d17eefe0d..505a916f81 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, 'txt_input', - ['single_line_paras', 'markdown']) + ['single_line_paras', 'print_formatted_paras', 'markdown']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 8c22ff721e..94cc1ad25f 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -14,7 +14,7 @@ Form - + Qt::Vertical @@ -34,14 +34,14 @@ - + Process using markdown - + <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. @@ -51,6 +51,13 @@ + + + + Assume print formatting + + + From bd1b37f3a96bbd4a24bc760ca543ec0e5d7278a9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Sep 2009 14:09:43 -0400 Subject: [PATCH 027/120] Add missing import. Allow TXT output to split lines based on a maximum line length value. 
--- src/calibre/ebooks/pdb/palmdoc/reader.py | 2 +- src/calibre/ebooks/pdb/ztxt/reader.py | 2 +- src/calibre/ebooks/txt/output.py | 11 ++++++++ src/calibre/ebooks/txt/txtml.py | 34 ++++++++++++++++++++++++ src/calibre/gui2/convert/txt_output.py | 5 ++-- src/calibre/gui2/convert/txt_output.ui | 28 ++++++++++++++++--- 6 files changed, 74 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index aaa121f1ed..0a57e3f51a 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -14,7 +14,7 @@ import struct from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line + separate_paragraphs_single_line, separate_paragraphs_print_formatted class HeaderRecord(object): ''' diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 4379159d81..86fb9d868c 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -13,7 +13,7 @@ import os, struct, zlib from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line + separate_paragraphs_single_line, separate_paragraphs_print_formatted SUPPORTED_VERSION = (1, 40) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index b3bda7fa9d..32bde90fe8 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -36,6 +36,17 @@ class TXTOutput(OutputFormatPlugin): OptionRecommendation(name='flush_paras', recommended_value=False, level=OptionRecommendation.LOW, help=_('Do not add a blank line between paragraphs.')), + OptionRecommendation(name='max_line_length', + recommended_value=0, 
level=OptionRecommendation.LOW, + help=_('The maximum number of characters per line. This splits on ' + 'the first space before the specified value. If no space is found ' + 'the line will be broken at the space after and will exceed the ' + 'specified value. Also, there is a minimum of 25 characters. ' + 'Use 0 to disable line splitting.')), + OptionRecommendation(name='force_max_line_length', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Force splitting on the max-line-length value when no space ' + 'is present. Also allows max-line-length to be below the minimum')), OptionRecommendation(name='indent_paras', recommended_value=False, level=OptionRecommendation.LOW, help=_('Add a tab at the beginning of each paragraph.')), diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index c705bcf221..63a5cdc8af 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -105,6 +105,40 @@ class TXTMLizer(object): if self.opts.indent_paras: text = re.sub('(?imu)^(?=.)', '\t', text) + if self.opts.max_line_length: + max_length = self.opts.max_line_length + if self.opts.max_line_length < 25 and not self.opts.force_max_line_length: + max_length = 25 + short_lines = [] + lines = text.splitlines() + for line in lines: + while len(line) > max_length: + space = line.rfind(' ', 0, max_length) + if space != -1: + # Space was found. + short_lines.append(line[:space]) + line = line[space + 1:] + else: + # Space was not found. + if self.opts.force_max_line_length: + # Force breaking at max_lenght. + short_lines.append(line[:max_length]) + line = line[max_length:] + else: + # Look for the first space after max_length. + space = line.find(' ', max_length, len(line)) + if space != -1: + # Space was found. + short_lines.append(line[:space]) + line = line[space + 1:] + else: + # No space was found cannot break line. 
+ short_lines.append(line) + line = '' + # Add the text that was less than max_lengh to the list + short_lines.append(line) + text = '\n'.join(short_lines) + return text def dump_text(self, elem, stylizer, end=''): diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index eca85f1292..2fc7f19908 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -17,8 +17,9 @@ class PluginWidget(Widget, Ui_Form): HELP = _('Options specific to')+' TXT '+_('output') def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, 'txt_output', ['newline', 'inline_toc', - 'flush_paras', 'indent_paras']) + Widget.__init__(self, parent, 'txt_output', + ['newline', 'max_line_length', 'force_max_line_length', + 'inline_toc', 'flush_paras', 'indent_paras']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 900198aca9..8e5429b0ce 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -27,7 +27,7 @@ - + Qt::Vertical @@ -40,27 +40,47 @@ - + &Inline TOC - + Do not add a blank line between paragraphs. - + Add a tab at the beginning of each paragraph + + + + + + + &Maximum line length: + + + opt_max_line_length + + + + + + + Force maximum line lenght + + + From 6d585f30904cb6e9d31100366f4b9edd712dae9d Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 7 Sep 2009 10:34:36 -0400 Subject: [PATCH 028/120] Fix bug #3424: Load defaults and book specific options in GUI bulk convert. 
--- src/calibre/gui2/tools.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 5670031e68..52273adbd3 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -18,6 +18,8 @@ from calibre.gui2.convert.single import Config as SingleConfig from calibre.gui2.convert.bulk import BulkConfig from calibre.customize.conversion import OptionRecommendation from calibre.utils.config import prefs +from calibre.ebooks.conversion.config import GuiRecommendations, \ + load_defaults, load_specifics, save_specifics def convert_single_ebook(parent, db, book_ids, auto_conversion=False, out_format=None): changed = False @@ -56,7 +58,6 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, out_format desc = _('Convert book %d of %d (%s)') % (i + 1, total, repr(mi.title)) recs = cPickle.loads(d.recommendations) - args = [in_file, out_file.name, recs] if d.opf_file is not None: recs.append(('read_metadata_from_opf', d.opf_file.name, OptionRecommendation.HIGH)) @@ -65,6 +66,7 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, out_format recs.append(('cover', d.cover_file.name, OptionRecommendation.HIGH)) temp_files.append(d.cover_file) + args = [in_file, out_file.name, recs] temp_files.append(out_file) jobs.append(('gui_convert', args, desc, d.output_format.upper(), book_id, temp_files)) @@ -101,7 +103,7 @@ def convert_bulk_ebook(parent, db, book_ids, out_format=None): return jobs, changed, bad output_format = d.output_format - recs = cPickle.loads(d.recommendations) + user_recs = cPickle.loads(d.recommendations) book_ids = convert_existing(parent, db, book_ids, output_format) for i, book_id in enumerate(book_ids): @@ -119,7 +121,17 @@ def convert_bulk_ebook(parent, db, book_ids, out_format=None): out_file.close() temp_files = [] - lrecs = list(recs) + combined_recs = GuiRecommendations() + default_recs = load_defaults('%s_input' % 
d.input_format) + specific_recs = load_specifics(db, book_id) + for key in default_recs: + combined_recs[key] = default_recs[key] + for key in specific_recs: + combined_recs[key] = specific_recs[key] + for item in user_recs: + combined_recs[item[0]] = item[1] + save_specifics(db, book_id, combined_recs) + lrecs = list(combined_recs.to_recommendations()) if d.opf_file is not None: lrecs.append(('read_metadata_from_opf', d.opf_file.name, From 9cf5d0a5f8c053eb9ab41779f980c1ce9cc0891b Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 9 Sep 2009 06:52:55 -0400 Subject: [PATCH 029/120] Update prs descriptions. --- src/calibre/devices/prs505/driver.py | 6 +++--- src/calibre/devices/prs700/driver.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index 07260ca31f..58ad12c078 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -21,16 +21,16 @@ from calibre import __appname__ class PRS505(CLI, Device): - name = 'PRS-505 Device Interface' + name = 'PRS-300/505 Device Interface' gui_name = 'SONY Pocket Edition' - description = _('Communicate with the Sony PRS-505 eBook reader.') + description = _('Communicate with the Sony PRS-300/505 eBook reader.') author = _('Kovid Goyal and John Schember') supported_platforms = ['windows', 'osx', 'linux'] FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] VENDOR_ID = [0x054c] #: SONY Vendor Id - PRODUCT_ID = [0x031e] #: Product Id for the PRS-505 + PRODUCT_ID = [0x031e] #: Product Id for the PRS 300 and 505 BCD = [0x229, 0x1000] VENDOR_NAME = 'SONY' diff --git a/src/calibre/devices/prs700/driver.py b/src/calibre/devices/prs700/driver.py index 804f400c8a..807e04dc26 100644 --- a/src/calibre/devices/prs700/driver.py +++ b/src/calibre/devices/prs700/driver.py @@ -13,8 +13,8 @@ import re class PRS700(PRS505): - name = 'PRS-700 Device Interface' - description = _('Communicate with the Sony 
PRS-700 eBook reader.') + name = 'PRS-600/700 Device Interface' + description = _('Communicate with the Sony PRS-600/700 eBook reader.') author = _('Kovid Goyal and John Schember') gui_name = 'SONY Touch edition' supported_platforms = ['windows', 'osx', 'linux'] From a1f05532cea92b6498ff471e9ede03e807673189 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 12 Sep 2009 20:04:29 -0400 Subject: [PATCH 030/120] Untested fix for bug #3472 --- src/calibre/ebooks/pdb/output.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/ebooks/pdb/output.py b/src/calibre/ebooks/pdb/output.py index 06f2f20d10..3b4065213e 100644 --- a/src/calibre/ebooks/pdb/output.py +++ b/src/calibre/ebooks/pdb/output.py @@ -47,6 +47,11 @@ class PDBOutput(OutputFormatPlugin): if Writer is None: raise PDBError('No writer available for format %s.' % format) + setattr(opts, 'flush_paras', False) + setattr(opts, 'max_line_length', 0) + setattr(opts, 'force_max_line_length', False) + setattr(opts, 'indent_paras', False) + writer = Writer(opts, log) out_stream.seek(0) From 9991c41fcbc30e5643e60dcf5733c349c7987ba6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 21 Sep 2009 17:26:37 -0400 Subject: [PATCH 031/120] TXT Output: Fix specified_newlines to change the line ending type correctly. 
--- src/calibre/ebooks/txt/newlines.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/newlines.py b/src/calibre/ebooks/txt/newlines.py index ae766a216f..d7e97654b4 100644 --- a/src/calibre/ebooks/txt/newlines.py +++ b/src/calibre/ebooks/txt/newlines.py @@ -19,7 +19,11 @@ class TxtNewlines(object): self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) def specified_newlines(newline, text): + # Convert all newlines to \n + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + if newline == '\n': return text - return text.replace(os.linesep, newline) + return text.replace('\n', newline) From 6e600b0e56d7153bc13eaa277fa5cd529ebafbcb Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 22 Sep 2009 17:22:18 -0400 Subject: [PATCH 032/120] Don't read invalid data from title in PDB header. --- src/calibre/ebooks/pdb/header.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 3436c262d8..753c5e29b9 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -30,7 +30,7 @@ class PdbHeaderReader(object): def name(self): self.stream.seek(0) - return self.stream.read(32).replace('\x00', '') + return re.sub('[^-A-Za-z0-9 ]+', '_', self.stream.read(32).replace('\x00', '')) def full_section_info(self, number): if number not in range(0, self.num_sections): From 0956c683f031ea7e61435db6fd1fdbe312fb9e75 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 22 Sep 2009 20:28:04 -0400 Subject: [PATCH 033/120] TXT Output: Table support. 
--- src/calibre/ebooks/txt/txtml.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 63a5cdc8af..6957e53d43 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -26,12 +26,18 @@ BLOCK_TAGS = [ 'h5', 'h6', 'li', + 'tr', ] BLOCK_STYLES = [ 'block', ] +SPACE_TAGS = [ + 'span', + 'td', +] + class TXTMLizer(object): def __init__(self, log): @@ -170,6 +176,10 @@ class TXTMLizer(object): if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': text.append(u'\n\n') + if tag in SPACE_TAGS: + if not end.endswith('u ') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text.append(u' ') + # Process tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': text.append(elem.text) From 72fd1a1d590dfd59193039da0c039f0234d33cff Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 23 Sep 2009 19:51:28 -0400 Subject: [PATCH 034/120] Fix bug #3567: Limit text length used for TXT metadata. --- src/calibre/ebooks/metadata/txt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py index 8dbc0c1453..79713774e3 100644 --- a/src/calibre/ebooks/metadata/txt.py +++ b/src/calibre/ebooks/metadata/txt.py @@ -22,6 +22,8 @@ def get_metadata(stream, extract_cover=True): else: mdata += line + mdata = mdata[:100] + mo = re.search('(?u)^[ ]*(?P.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata) if mo != None: mi.title = mo.group('title') From fd65635a014095cb418c7d8eaba851b716edb6c3 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 30 Sep 2009 08:24:41 -0400 Subject: [PATCH 035/120] Fix bug #3620: Don't put a space before span tags. 
--- src/calibre/ebooks/txt/txtml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 6957e53d43..59c3ea671a 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -34,7 +34,6 @@ BLOCK_STYLES = [ ] SPACE_TAGS = [ - 'span', 'td', ] From 7eba89285e33d729c0c26da4d454b4233a1a6346 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 3 Oct 2009 13:33:35 -0400 Subject: [PATCH 036/120] Possible fix for style and images not included in produced pdf files on Windows. --- src/calibre/ebooks/pdf/writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index b87aba7bc0..bf11394c02 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -111,7 +111,8 @@ class PDFWriter(QObject): self.logger.debug('Processing %s...' % item) - self.view.load(QUrl(item)) + #self.view.load(QUrl(item)) + self.view.setHtml(open(item, 'r+b').read().decode('utf-8'), QUrl(item)) def _render_html(self, ok): if ok: From d6984b83c3b4240ecece296bfc8d0fac751c1efd Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 3 Oct 2009 13:46:22 -0400 Subject: [PATCH 037/120] Use fromLocalFile with QUrl. --- src/calibre/ebooks/pdf/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index bf11394c02..e10c13d7d3 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -112,7 +112,7 @@ class PDFWriter(QObject): self.logger.debug('Processing %s...' 
% item) #self.view.load(QUrl(item)) - self.view.setHtml(open(item, 'r+b').read().decode('utf-8'), QUrl(item)) + self.view.setHtml(open(item, 'r+b').read().decode('utf-8'), QUrl.fromLocalFile(item)) def _render_html(self, ok): if ok: From 62317ca8cfa72d773c092cd5bbedd9b76ee09520 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 8 Oct 2009 08:38:40 -0400 Subject: [PATCH 038/120] Fix FB2 output. --- src/calibre/ebooks/fb2/fb2ml.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index d2e90b6d44..ff914568d2 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -30,6 +30,7 @@ TAG_MAP = { 'i' : 'emphasis', 'p' : 'p', 'li' : 'p', + 'br' : 'empty-line', } TAG_SPACE = [ @@ -74,6 +75,7 @@ class FB2MLizer(object): output.append(self.fb2mlize_images()) output.append(self.fb2_footer()) output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) + return output return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) def fb2_header(self): @@ -112,7 +114,7 @@ class FB2MLizer(object): item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output def get_toc(self): @@ -151,7 +153,7 @@ class FB2MLizer(object): if aid not in self.link_hrefs.keys(): self.link_hrefs[aid] = 'calibre_link-%s' % len(self.link_hrefs.keys()) aid = self.link_hrefs[aid] - return '<v id="%s"></v>' % aid + return '<a id="%s" />' % aid def fb2mlize_images(self): images = [u''] @@ -204,6 +206,7 @@ class FB2MLizer(object): href = elem.get('href') if href: href = 
prepare_string_for_xml(page.abshref(href)) + href = href.replace('"', '"') if '://' in href: fb2_text.append('<a xlink:href="%s">' % href) else: @@ -240,7 +243,10 @@ class FB2MLizer(object): fb2_text.append(' ') if hasattr(elem, 'text') and elem.text != None: - fb2_text.append(prepare_string_for_xml(elem.text)) + if 'p' not in tag_stack: + fb2_text.append('<p>%s</p>' % prepare_string_for_xml(elem.text)) + else: + fb2_text.append(prepare_string_for_xml(elem.text)) for item in elem: fb2_text += self.dump_text(item, stylizer, page, tag_stack) From 61e8c4222a6c7d8628ddfb156b7fb995b2179eea Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 10 Oct 2009 16:35:31 -0400 Subject: [PATCH 039/120] Add XML header to fb2 output. --- src/calibre/ebooks/fb2/fb2ml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index ff914568d2..aaf8361b99 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -75,7 +75,6 @@ class FB2MLizer(object): output.append(self.fb2mlize_images()) output.append(self.fb2_footer()) output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) - return output return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) def fb2_header(self): From 552735c41eb6c8974025c2e47b4eea96b21c93fa Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 10 Oct 2009 23:38:00 -0400 Subject: [PATCH 040/120] Update eReader PDB code to produce files that are closer to what DropBook produces: Set the text size record, Write image size, Handle 1.5.2 sidebar and footnote changes. Fix PML output stripping backslash character. 
--- src/calibre/ebooks/pdb/ereader/reader132.py | 1 - src/calibre/ebooks/pdb/ereader/writer.py | 111 ++++++++++++++------ src/calibre/ebooks/pml/pmlconverter.py | 5 +- 3 files changed, 80 insertions(+), 37 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index 98dbe13790..49fdfb8980 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -34,7 +34,6 @@ class HeaderRecord(object): self.has_metadata, = struct.unpack('>H', raw[24:26]) self.footnote_rec, = struct.unpack('>H', raw[28:30]) self.sidebar_rec, = struct.unpack('>H', raw[30:32]) - self.bookmark_offset, = struct.unpack('>H', raw[32:34]) self.image_data_offset, = struct.unpack('>H', raw[40:42]) self.metadata_offset, = struct.unpack('>H', raw[44:46]) self.footnote_offset, = struct.unpack('>H', raw[48:50]) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 2f4e3bf16f..8a88c6a689 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -28,7 +28,7 @@ IDENTITY = 'PNRdPPrs' # This is an arbitrary number that is small enough to work. The actual maximum # record size is unknown. -MAX_RECORD_SIZE = 3560 +MAX_RECORD_SIZE = 8192 class Writer(FormatWriter): @@ -37,13 +37,28 @@ class Writer(FormatWriter): self.log = log def write_content(self, oeb_book, out_stream, metadata=None): - text, image_hrefs = self._text(oeb_book) + text, image_hrefs, text_sizes = self._text(oeb_book) images = self._images(oeb_book.manifest, image_hrefs) metadata = [self._metadata(metadata)] - hr = [self._header_record(len(text), len(images))] - sections = hr+text+images+metadata+['MeTaInFo\x00'] + ''' + Record order as generated by Dropbook. + 1. eReader Header + 2. Compressed text + 3. Small font page index + 4. Large font page index + 5. Chapter index + 6. Links index + 7. Images + 8. 
(Extrapolation: there should be one more record type here though yet uncovered what it might be). + 9. Metadata + 10. Sidebar records + 11. Footnote records + 12. Text block size record + 13. "MeTaInFo\x00" word record + ''' + sections = hr+text+images+metadata+[text_sizes]+['MeTaInFo\x00'] lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections] @@ -62,12 +77,38 @@ class Writer(FormatWriter): pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') pml_pages = [] - for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): - pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])) + text_sizes = '' + index = 0 + while index < len(pml): + ''' + Split on the space character closest to MAX_RECORD_SIZE when possible. + ''' + split = pml.rfind(' ', index, MAX_RECORD_SIZE) + if split == -1: + len_end = len(pml[index:]) + if len_end > MAX_RECORD_SIZE: + split = MAX_RECORD_SIZE + else: + split = len_end + if split == 0: + split = 1 + pml_pages.append(zlib.compress(pml[index:index+split])) + text_sizes += struct.pack('>H', split) + index += split - return pml_pages, pmlmlizer.image_hrefs + return pml_pages, pmlmlizer.image_hrefs, text_sizes def _images(self, manifest, image_hrefs): + ''' + Image format. + + 0-4 : 'PNG '. There must be a space after PNG. + 4-36 : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes + 36-58 : Unknown. + 58-60 : Width. + 60-62 : Height. + 62-...: Raw image data in 8 bit PNG format. 
+ ''' images = [] for item in manifest: @@ -82,6 +123,8 @@ class Writer(FormatWriter): header = 'PNG ' header += image_hrefs[item.href].ljust(32, '\x00')[:32] + header = header.ljust(58, '\x00') + header += struct.pack('>HH', im.size[0], im.size[1]) header = header.ljust(62, '\x00') if len(data) + len(header) < 65505: @@ -126,7 +169,7 @@ class Writer(FormatWriter): text_items = the number of text pages image_items = the number of images ''' - version = 10 # Zlib compression + compression = 10 # zlib compression. non_text_offset = text_items + 1 if image_items > 0: @@ -140,33 +183,33 @@ class Writer(FormatWriter): record = '' - record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM - record += struct.pack('>H', 0) # [2:4] - record += struct.pack('>H', 0) # [4:6] + record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM + record += struct.pack('>H', 0) # [2:4] # Unknown. + record += struct.pack('>H', 0) # [4:6] # Unknown. record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. 
Somehow represents the cp1252 encoding of the text - record += struct.pack('>H', 0) # [8:10] - record += struct.pack('>H', 0) # [10:12] - record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset - record += struct.pack('>H', 0) # [14:16] - record += struct.pack('>H', 0) # [16:18] - record += struct.pack('>H', 0) # [18:20] - record += struct.pack('>H', image_items) # [20:22] # Number of images - record += struct.pack('>H', 0) # [22:24] - record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not - record += struct.pack('>H', 0) # [26:28] - record += struct.pack('>H', 0) # [28:30] # footnote_rec - record += struct.pack('>H', 0) # [30:32] # sidebar_rec - record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset - record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC - record += struct.pack('>H', 0) # [36:38] - record += struct.pack('>H', 0) # [38:40] - record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images - record += struct.pack('>H', 0) # [42:44] - record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images - record += struct.pack('>H', 0) # [46:48] - record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images - record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images - record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset + record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built. + record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built. + record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start. + record += struct.pack('>H', 0) # [14:16] # Number of chapter index records. 
+ record += struct.pack('>H', 0) # [16:18] # Number of small font page index records. + record += struct.pack('>H', 0) # [18:20] # Number of large font page index records. + record += struct.pack('>H', image_items) # [20:22] # Number of images. + record += struct.pack('>H', 0) # [22:24] # Number of links. + record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not. + record += struct.pack('>H', 0) # [26:28] # Unknown. + record += struct.pack('>H', 0) # [28:30] # Number of Footnotes. + record += struct.pack('>H', 0) # [30:32] # Number of Sidebars. + record += struct.pack('>H', last_data_offset) # [32:34] # Chapter index offset. + record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC. + record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none. + record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none. + record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none. + record += struct.pack('>H', image_data_offset) # [42:44] # Links offset. This will be the last data offset if there are none. + record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none. + record += struct.pack('>H', 0) # [46:48] # Unknown. + record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none. + record += struct.pack('>H', last_data_offset) # [50:52] # Sidebar offset. This will be the last data offset if there are none. + record += struct.pack('>H', last_data_offset) # [52:54] # Last data offset. 
for i in range(54, 132, 2): record += struct.pack('>H', 0) # [54:132] diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index b4ab238da9..ca7721350c 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -64,7 +64,7 @@ PML_HTML_RULES = [ (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''), (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''), # Remove invalid single item pml codes. - (re.compile(r'(?<=[^\\])\\.'), lambda match: ''), + (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''), # Replace \\ with \. (re.compile(r'\\\\'), lambda match: '\\'), @@ -78,6 +78,7 @@ def pml_to_html(pml): return html def footnote_sidebar_to_html(id, pml): + if id.startswith('\x01'): + id = id[2:] html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml)) return html - From 4b2f26f123b5d0ee0172c4960e3442adedeecb07 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 11 Oct 2009 08:48:07 -0400 Subject: [PATCH 041/120] PML input cleanup. Generate chapter and link index with eReader PDB output. 
--- src/calibre/ebooks/pdb/ereader/writer.py | 93 ++++++++++++++++++------ src/calibre/ebooks/pml/pmlconverter.py | 6 +- 2 files changed, 74 insertions(+), 25 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 8a88c6a689..1a172ea07d 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -8,6 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' +import re import struct import zlib @@ -37,10 +38,15 @@ class Writer(FormatWriter): self.log = log def write_content(self, oeb_book, out_stream, metadata=None): - text, image_hrefs, text_sizes = self._text(oeb_book) - images = self._images(oeb_book.manifest, image_hrefs) + pmlmlizer = PMLMLizer(self.log) + pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') + + text, text_sizes = self._text(pml) + chapter_index = self._chapter_index(pml) + link_index = self._link_index(pml) + images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] - hr = [self._header_record(len(text), len(images))] + hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))] ''' Record order as generated by Dropbook. @@ -58,7 +64,7 @@ class Writer(FormatWriter): 12. Text block size record 13. 
"MeTaInFo\x00" word record ''' - sections = hr+text+images+metadata+[text_sizes]+['MeTaInFo\x00'] + sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00'] lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections] @@ -72,10 +78,7 @@ class Writer(FormatWriter): else: out_stream.write(item) - def _text(self, oeb_book): - pmlmlizer = PMLMLizer(self.log) - pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') - + def _text(self, pml): pml_pages = [] text_sizes = '' index = 0 @@ -96,7 +99,38 @@ class Writer(FormatWriter): text_sizes += struct.pack('>H', split) index += split - return pml_pages, pmlmlizer.image_hrefs, text_sizes + return pml_pages, text_sizes + + def _index_item(self, mo): + index = '' + if 'text' in mo.groupdict().keys(): + index += struct.pack('>L', mo.start('text')) + # Strip all PML tags from text + text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text')) + text = re.sub(r'\\\\', r'\\', mo.group('text')) + if 'val' in mo.groupdict().keys(): + text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text) + index += text + index += '\x00' + return index + + def _chapter_index(self, pml): + chapter_marks = [ + r'(?s)\\x(?P<text>.+?)\\x', + r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', + r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', + ] + index = '' + for chapter_mark in chapter_marks: + for mo in re.finditer(chapter_mark, pml): + index += self._index_item(mo) + return index + + def _link_index(self, pml): + index = '' + for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml): + index += self._index_item(mo) + return index def _images(self, manifest, image_hrefs): ''' @@ -164,23 +198,38 @@ class Writer(FormatWriter): return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn) - def _header_record(self, text_items, image_items): + def _header_record(self, text_count, chapter_count, link_count, image_count): ''' - text_items = the number of text 
pages - image_items = the number of images + text_count = the number of text pages + image_count = the number of images ''' compression = 10 # zlib compression. - non_text_offset = text_items + 1 + non_text_offset = text_count + 1 - if image_items > 0: - image_data_offset = text_items + 1 - meta_data_offset = image_data_offset + image_items + if chapter_count > 0: + chapter_offset = text_count + 1 + else: + chapter_offset = text_count + + if link_count > 0: + link_offset = chapter_offset + 1 + else: + link_offset = chapter_offset + + if image_count > 0: + image_data_offset = link_offset + 1 + meta_data_offset = image_data_offset + image_count last_data_offset = meta_data_offset + 1 else: - meta_data_offset = text_items + 1 + meta_data_offset = link_offset + 1 last_data_offset = meta_data_offset + 1 image_data_offset = last_data_offset + if chapter_count <= 0: + chapter_offset = last_data_offset + if link_count <= 0: + link_offset = last_data_offset + record = '' record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM @@ -190,21 +239,21 @@ class Writer(FormatWriter): record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built. record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built. record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start. - record += struct.pack('>H', 0) # [14:16] # Number of chapter index records. + record += struct.pack('>H', chapter_count) # [14:16] # Number of chapter index records. record += struct.pack('>H', 0) # [16:18] # Number of small font page index records. record += struct.pack('>H', 0) # [18:20] # Number of large font page index records. - record += struct.pack('>H', image_items) # [20:22] # Number of images. - record += struct.pack('>H', 0) # [22:24] # Number of links. + record += struct.pack('>H', image_count) # [20:22] # Number of images. 
+ record += struct.pack('>H', link_count) # [22:24] # Number of links. record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not. record += struct.pack('>H', 0) # [26:28] # Unknown. record += struct.pack('>H', 0) # [28:30] # Number of Footnotes. record += struct.pack('>H', 0) # [30:32] # Number of Sidebars. - record += struct.pack('>H', last_data_offset) # [32:34] # Chapter index offset. + record += struct.pack('>H', chapter_offset) # [32:34] # Chapter index offset. record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC. record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none. record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none. record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none. - record += struct.pack('>H', image_data_offset) # [42:44] # Links offset. This will be the last data offset if there are none. + record += struct.pack('>H', link_offset) # [42:44] # Links offset. This will be the last data offset if there are none. record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none. record += struct.pack('>H', 0) # [46:48] # Unknown. record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none. 
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index ca7721350c..3e1b3b4828 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -18,10 +18,10 @@ PML_HTML_RULES = [ (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<span style="text-align: center; display: block; margin: auto;">%s</span>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<span style="text-align: right; display: block;">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<span style="text-decoration: underline;">%s</span>' % match.group('text') if match.group('text') else ''), 
(re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''), (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''), From 599de056d01ba2e87b4fdda280a304fc2c055ad7 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 11 Oct 2009 10:32:38 -0400 Subject: [PATCH 042/120] eReader PDB output: proper length of indexes and do not try to add them if they are not avaliable. PML Outpu: cleanup. PML Input: read unicode and entity PML tags correctly. --- src/calibre/ebooks/pdb/ereader/writer.py | 12 ++++++++---- src/calibre/ebooks/pml/pmlconverter.py | 4 ++-- src/calibre/ebooks/pml/pmlml.py | 5 +++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1a172ea07d..1e108d113b 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -43,10 +43,14 @@ class Writer(FormatWriter): text, text_sizes = self._text(pml) chapter_index = self._chapter_index(pml) + chapter_index = [chapter_index] if chapter_index != '' else [] link_index = self._link_index(pml) + link_index = [link_index] if link_index != '' else [] images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] - hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))] + chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0 + link_index_count = len(link_index[0].split('\x00')) - 1 if len(link_index) >= 1 else 0 + hr = [self._header_record(len(text), chapter_index_count, link_index_count, len(images))] ''' Record order 
as generated by Dropbook. @@ -64,7 +68,7 @@ class Writer(FormatWriter): 12. Text block size record 13. "MeTaInFo\x00" word record ''' - sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00'] + sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00'] lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections] @@ -106,8 +110,8 @@ class Writer(FormatWriter): if 'text' in mo.groupdict().keys(): index += struct.pack('>L', mo.start('text')) # Strip all PML tags from text - text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text')) - text = re.sub(r'\\\\', r'\\', mo.group('text')) + text = re.sub(r'\\.', '', mo.group('text')) + # Add appropriate spacing to denote the various levels of headings if 'val' in mo.groupdict().keys(): text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text) index += text diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 3e1b3b4828..c72a21a5f9 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -35,8 +35,8 @@ PML_HTML_RULES = [ (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''), - (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')), - (re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), + (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')), + (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P<name>.+?)"'), 
lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')), diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 72b55d00b1..b6a62e7c1f 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -154,10 +154,15 @@ class PMLMLizer(object): for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') + # Turn all html entities into unicode. This should not be necessary as + # lxml should have already done this but we want to be sure it happens. for entity in set(re.findall('&.+?;', text)): mo = re.search('(%s)' % entity[1:-1], text) text = text.replace(entity, entity_to_unicode(mo)) + # Turn all unicode characters into their PML hex equivelent + text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text) + return text def dump_text(self, elem, stylizer, page, tag_stack=[]): From 86a7524b1175bae9807a45aa87f4b6737d020224 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 11 Oct 2009 16:28:04 -0400 Subject: [PATCH 043/120] eReader PDB Output: Disable index generation. 
--- src/calibre/ebooks/pdb/ereader/writer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1e108d113b..b8f2cddd0b 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -42,10 +42,12 @@ class Writer(FormatWriter): pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') text, text_sizes = self._text(pml) - chapter_index = self._chapter_index(pml) - chapter_index = [chapter_index] if chapter_index != '' else [] - link_index = self._link_index(pml) - link_index = [link_index] if link_index != '' else [] + #chapter_index = self._chapter_index(pml) + #chapter_index = [chapter_index] if chapter_index != '' else [] + chapter_index = [] + #link_index = self._link_index(pml) + #link_index = [link_index] if link_index != '' else [] + link_index = [] images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0 From 1424435bff6674ecc52e8d5f13b0d0801b595573 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 11 Oct 2009 20:34:33 -0400 Subject: [PATCH 044/120] eReader PDB Output: Generate chapter and link indexes properly. 
--- src/calibre/ebooks/pdb/ereader/writer.py | 42 +++++++++++------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index b8f2cddd0b..a1203aa9f2 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -42,17 +42,11 @@ class Writer(FormatWriter): pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') text, text_sizes = self._text(pml) - #chapter_index = self._chapter_index(pml) - #chapter_index = [chapter_index] if chapter_index != '' else [] - chapter_index = [] - #link_index = self._link_index(pml) - #link_index = [link_index] if link_index != '' else [] - link_index = [] + chapter_index = self._chapter_index(pml) + link_index = self._link_index(pml) images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] - chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0 - link_index_count = len(link_index[0].split('\x00')) - 1 if len(link_index) >= 1 else 0 - hr = [self._header_record(len(text), chapter_index_count, link_index_count, len(images))] + hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))] ''' Record order as generated by Dropbook. 
@@ -110,12 +104,15 @@ class Writer(FormatWriter): def _index_item(self, mo): index = '' if 'text' in mo.groupdict().keys(): - index += struct.pack('>L', mo.start('text')) + index += struct.pack('>L', mo.start()) + text = mo.group('text') # Strip all PML tags from text - text = re.sub(r'\\.', '', mo.group('text')) + text = re.sub(r'\\U[0-9a-z]{4}', '', text) + text = re.sub(r'\\a\d{3}', '', text) + text = re.sub(r'\\.', '', text) # Add appropriate spacing to denote the various levels of headings if 'val' in mo.groupdict().keys(): - text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text) + text = '%s%s' % (' ' * 4 * int(mo.group('val')), text) index += text index += '\x00' return index @@ -126,16 +123,16 @@ class Writer(FormatWriter): r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', ] - index = '' + index = [] for chapter_mark in chapter_marks: for mo in re.finditer(chapter_mark, pml): - index += self._index_item(mo) + index.append(self._index_item(mo)) return index def _link_index(self, pml): - index = '' + index = [] for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml): - index += self._index_item(mo) + index.append(self._index_item(mo)) return index def _images(self, manifest, image_hrefs): @@ -213,27 +210,26 @@ class Writer(FormatWriter): non_text_offset = text_count + 1 if chapter_count > 0: - chapter_offset = text_count + 1 + chapter_offset = non_text_offset else: chapter_offset = text_count - if link_count > 0: - link_offset = chapter_offset + 1 + link_offset = chapter_offset + chapter_count else: link_offset = chapter_offset if image_count > 0: - image_data_offset = link_offset + 1 + image_data_offset = link_offset + link_count meta_data_offset = image_data_offset + image_count last_data_offset = meta_data_offset + 1 else: - meta_data_offset = link_offset + 1 + meta_data_offset = link_offset + link_count last_data_offset = meta_data_offset + 1 image_data_offset = last_data_offset - if chapter_count <= 0: + if 
chapter_count == 0: chapter_offset = last_data_offset - if link_count <= 0: + if link_count == 0: link_offset = last_data_offset record = '' From d373b5d71e0a01a9dae226573f35e4f9a84ecd21 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 11 Oct 2009 20:39:50 -0400 Subject: [PATCH 045/120] eReader PDB Output: Generate eReader header correctly when no chapter and link indexes are present. --- src/calibre/ebooks/pdb/ereader/writer.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index a1203aa9f2..263f6964bf 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -209,14 +209,8 @@ class Writer(FormatWriter): compression = 10 # zlib compression. non_text_offset = text_count + 1 - if chapter_count > 0: - chapter_offset = non_text_offset - else: - chapter_offset = text_count - if link_count > 0: - link_offset = chapter_offset + chapter_count - else: - link_offset = chapter_offset + chapter_offset = non_text_offset + link_offset = chapter_offset + chapter_count if image_count > 0: image_data_offset = link_offset + link_count From 1b2efaaf6f81a05dffbbbf41e34de907f1922716 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 12 Oct 2009 20:12:38 -0400 Subject: [PATCH 046/120] TCR input. 
--- src/calibre/customize/builtins.py | 2 + src/calibre/ebooks/tcr/__init__.py | 5 +++ src/calibre/ebooks/tcr/input.py | 72 ++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 src/calibre/ebooks/tcr/__init__.py create mode 100644 src/calibre/ebooks/tcr/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 11317bc312..1660e890fc 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -337,6 +337,7 @@ from calibre.ebooks.pml.input import PMLInput from calibre.ebooks.rb.input import RBInput from calibre.web.feeds.input import RecipeInput from calibre.ebooks.rtf.input import RTFInput +from calibre.ebooks.tcr.input import TCRInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lrf.input import LRFInput @@ -385,6 +386,7 @@ plugins += [ RBInput, RecipeInput, RTFInput, + TCRInput, TXTInput, LRFInput, ] diff --git a/src/calibre/ebooks/tcr/__init__.py b/src/calibre/ebooks/tcr/__init__.py new file mode 100644 index 0000000000..9e2aad729c --- /dev/null +++ b/src/calibre/ebooks/tcr/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py new file mode 100644 index 0000000000..066d97a421 --- /dev/null +++ b/src/calibre/ebooks/tcr/input.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ + separate_paragraphs_single_line, separate_paragraphs_print_formatted + +class TCRInput(InputFormatPlugin): + + name = 'TCR Input' + author = 'John Schember' + description = 'Convert TCR 
files to HTML' + file_types = set(['tcr']) + + options = set([ + OptionRecommendation(name='single_line_paras', recommended_value=False, + help=_('Normally calibre treats blank lines as paragraph markers. ' + 'With this option it will assume that every line represents ' + 'a paragraph instead.')), + OptionRecommendation(name='print_formatted_paras', recommended_value=False, + help=_('Normally calibre treats blank lines as paragraph markers. ' + 'With this option it will assume that every line starting with ' + 'an indent (either a tab or 2+ spaces) represents a paragraph. ' + 'Paragraphs end when the next line that starts with an indent ' + 'is reached.')), + ]) + + def convert(self, stream, options, file_ext, log, accelerators): + txt = [] + + log.debug('Checking TCR header...') + if stream.read(9) != '!!8-Bit!!': + raise ValueError('File %s contaions an invalid TCR header.' % stream.name) + + log.debug('Building string dictionary...') + # Dictionary codes that the file contents are broken down into. + entries = [] + for i in xrange(256): + entry_len = ord(stream.read(1)) + entries.append(stream.read(entry_len)) + + log.info('Decompressing text...') + # Map the values in the file to locations in the string list. 
+ entry_loc = stream.read(1) + while entry_loc != '': # EOF + txt.append(entries[ord(entry_loc)]) + entry_loc = stream.read(1) + + ienc = options.input_encoding if options.input_encoding else 'utf-8' + txt = ''.join(txt).decode(ienc, 'replace') + + log.info('Converting text to OEB...') + if options.single_line_paras: + txt = separate_paragraphs_single_line(txt) + if options.print_formatted_paras: + txt = separate_paragraphs_print_formatted(txt) + html = convert_basic(txt) + with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index: + index.write(html.encode('utf-8')) + + from calibre.ebooks.metadata.meta import get_metadata + mi = get_metadata(stream, 'tcr') + manifest = [('index.html', None)] + spine = ['index.html'] + opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) + + return os.path.join(os.getcwd(), 'metadata.opf') From 2782f49dca43a8c57de2240db44d16d1c5938672 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 14 Oct 2009 06:51:26 -0400 Subject: [PATCH 047/120] Regex Builder: Initially highlight the already set regex. --- src/calibre/gui2/convert/regex_builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index 58c4efbe1d..0ef4f29202 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -51,6 +51,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): self.regex.setText(regex) self.regex_valid(True) self.highlighter = RegexHighlighter(self.preview.document()) + self.highlighter.update_regex(regex) if not db or not book_id: self.button_box.addButton(QDialogButtonBox.Open) From da700ca9330ca603cf7b987a63638497271df056 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 14 Oct 2009 17:43:43 -0400 Subject: [PATCH 048/120] Regex Builder: Show correct input step. 
--- src/calibre/ebooks/oeb/iterator.py | 4 ++-- src/calibre/gui2/convert/regex_builder.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 33cc96f08b..05bbe7410d 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -122,7 +122,7 @@ class EbookIterator(object): else: print 'Loaded embedded font:', repr(family) - def __enter__(self): + def __enter__(self, raw_only=False): self.delete_on_exit = [] self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() @@ -139,7 +139,7 @@ class EbookIterator(object): plumber.opts, plumber.input_fmt, self.log, {}, self.base) - if plumber.input_fmt.lower() in ('pdf', 'rb'): + if not raw_only and plumber.input_fmt.lower() in ('pdf', 'rb'): self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts, plumber.input_plugin) if hasattr(self.pathtoopf, 'manifest'): diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index 0ef4f29202..20da8d7aaf 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -87,7 +87,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): def open_book(self, pathtoebook): self.iterator = EbookIterator(pathtoebook) - self.iterator.__enter__() + self.iterator.__enter__(raw_only=True) text = [u''] for path in self.iterator.spine: html = open(path, 'rb').read().decode(path.encoding, 'replace') From c26fd05fce100316da46e91a8de03eeed41ad557 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 17 Oct 2009 14:17:26 -0400 Subject: [PATCH 049/120] Opus driver generate t2b files and delete helper files when removing a book from the device. 
--- src/calibre/devices/cybookg3/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 20a7b259ee..6b5e5ff4ed 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -79,7 +79,7 @@ class CYBOOKG3(USBMS): return zip(paths, cycle([on_card])) -class CYBOOK_OPUS(USBMS): +class CYBOOK_OPUS(CYBOOKG3): name = 'Cybook Opus Device Interface' gui_name = 'Cybook Opus' From df6d759b3852e77c066ea007815177db079dfa01 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 19 Oct 2009 07:15:27 -0400 Subject: [PATCH 050/120] TCR Output. Move TCR decompression to TCR compression file. --- src/calibre/customize/builtins.py | 2 + src/calibre/ebooks/compression/tcr.py | 126 ++++++++++++++++++++++++++ src/calibre/ebooks/tcr/input.py | 22 +---- src/calibre/ebooks/tcr/output.py | 58 ++++++++++++ 4 files changed, 188 insertions(+), 20 deletions(-) create mode 100644 src/calibre/ebooks/compression/tcr.py create mode 100644 src/calibre/ebooks/tcr/output.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 1660e890fc..e52d693bb5 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -352,6 +352,7 @@ from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.output import PMLOutput from calibre.ebooks.rb.output import RBOutput from calibre.ebooks.rtf.output import RTFOutput +from calibre.ebooks.tcr.output import TCROutput from calibre.ebooks.txt.output import TXTOutput from calibre.customize.profiles import input_profiles, output_profiles @@ -402,6 +403,7 @@ plugins += [ PMLOutput, RBOutput, RTFOutput, + TCROutput, TXTOutput, ] plugins += [ diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py new file mode 100644 index 0000000000..40bed613ec --- /dev/null +++ b/src/calibre/ebooks/compression/tcr.py 
@@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import re + +def decompress(stream): + txt = [] + stream.seek(0) + if stream.read(9) != '!!8-Bit!!': + raise ValueError('File %s contaions an invalid TCR header.' % stream.name) + + # Codes that the file contents are broken down into. + entries = [] + for i in xrange(256): + entry_len = ord(stream.read(1)) + entries.append(stream.read(entry_len)) + + # Map the values in the file to locations in the string list. + entry_loc = stream.read(1) + while entry_loc != '': # EOF + txt.append(entries[ord(entry_loc)]) + entry_loc = stream.read(1) + + return ''.join(txt) + + +def compress(txt, level=5): + ''' + TCR compression takes the form header+code_list+coded_text. + The header is always "!!8-Bit!!". The code list is a list of 256 strings. + The list takes the form 1 byte length and then a string. Each position in + The list corresponds to a code found in the file. The coded text is + string of characters vaules. for instance the character Q represents the + value 81 which corresponds to the string in the code list at position 81. + ''' + # Turn each unique character into a coded value. + # The code of the string at a given position are represented by the position + # they occupy in the list. + codes = list(set(re.findall('(?msu).', txt))) + for i in range(len(codes), 256): + codes.append('') + # Set the compression level. + if level <= 1: + new_length = 256 + if level >= 10: + new_length = 1 + else: + new_length = int(256 * (10 - level) * .1) + new_length = 1 if new_length < 1 else new_length + # Replace txt with codes. + coded_txt = '' + for c in txt: + coded_txt += chr(codes.index(c)) + txt = coded_txt + # Start compressing the text. 
+ new = True + merged = True + while new or merged: + # Merge codes that always follow another code + merge = [] + merged = False + for i in xrange(256): + if codes[i] != '': + # Find all codes that are next to i. + fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt))) + # 1 if only one code comes after i. + if len(fall) == 1: + # We are searching codes and each code is always 1 character. + j = ord(fall[0][1:2]) + # Only merge if the total length of the string represented by + # code is less than 256. + if len(codes[i]) + len(codes[j]) < 256: + merge.append((i, j)) + if merge: + merged = True + for i, j in merge: + # Merge the string for j into the string for i. + if i == j: + # Don't use += here just in case something goes wrong. This + # will prevent out of control memory consumption. This is + # unecessary but when creating this routine it happened due + # to an error. + codes[i] = codes[i] + codes[i] + else: + codes[i] = codes[i] + codes[j] + txt = txt.replace(chr(i)+chr(j), chr(i)) + if chr(j) not in txt: + codes[j] = '' + new = False + if '' in codes: + # Create a list of codes based on combinations of codes that are next + # to each other. The amount of savings for the new code is calculated. + new_codes = [] + for c in list(set(re.findall('(?msu)..', txt))): + i = ord(c[0:1]) + j = ord(c[1:2]) + if codes[i]+codes[j] in codes: + continue + savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j]) + if savings > 2 and len(codes[i]) + len(codes[j]) < 256: + new_codes.append((savings, i, j, codes[i], codes[j])) + if new_codes: + new = True + # Sort the codes from highest savings to lowest. + new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0) + # The shorter new_length the more chances time merging will happen + # giving more changes for better codes to be created. However, + # the shorter new_lengh the longer it will take to compress. 
+ new_codes = new_codes[:new_length] + for code in new_codes: + if '' not in codes: + break + c = codes.index('') + codes[c] = code[3]+code[4] + txt = txt.replace(chr(code[1])+chr(code[2]), chr(c)) + # Generate the code dictionary. + header = [] + for code in codes: + header.append(chr(len(code))+code) + for i in xrange(len(header), 256): + header.append(chr(0)) + # Join the identifier with the dictionary and coded text. + return '!!8-Bit!!'+''.join(header)+txt diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 066d97a421..67fa6ac66e 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -9,6 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted +from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -31,28 +32,9 @@ class TCRInput(InputFormatPlugin): ]) def convert(self, stream, options, file_ext, log, accelerators): - txt = [] - - log.debug('Checking TCR header...') - if stream.read(9) != '!!8-Bit!!': - raise ValueError('File %s contaions an invalid TCR header.' % stream.name) - - log.debug('Building string dictionary...') - # Dictionary codes that the file contents are broken down into. - entries = [] - for i in xrange(256): - entry_len = ord(stream.read(1)) - entries.append(stream.read(entry_len)) - log.info('Decompressing text...') - # Map the values in the file to locations in the string list. 
- entry_loc = stream.read(1) - while entry_loc != '': # EOF - txt.append(entries[ord(entry_loc)]) - entry_loc = stream.read(1) - ienc = options.input_encoding if options.input_encoding else 'utf-8' - txt = ''.join(txt).decode(ienc, 'replace') + txt = decompress(stream).decode(ienc, 'replace') log.info('Converting text to OEB...') if options.single_line_paras: diff --git a/src/calibre/ebooks/tcr/output.py b/src/calibre/ebooks/tcr/output.py new file mode 100644 index 0000000000..8aed995c44 --- /dev/null +++ b/src/calibre/ebooks/tcr/output.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin, \ + OptionRecommendation +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.compression.tcr import compress + +class TCROutput(OutputFormatPlugin): + + name = 'TCR Output' + author = 'John Schember' + file_type = 'tcr' + + options = set([ + OptionRecommendation(name='output_encoding', recommended_value='utf-8', + level=OptionRecommendation.LOW, + help=_('Specify the character encoding of the output document. ' \ + 'The default is utf-8.')), + OptionRecommendation(name='compression_level', recommended_value=5, + level=OptionRecommendation.LOW, + help=_('Speciy the compression level to use. Scale 1 - 10. 
1 ' \ + 'being the lowest compression but the fastest and 10 being the ' \ + 'highest compression but the slowest.')), + ]) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + setattr(opts, 'flush_paras', False) + setattr(opts, 'max_line_length', 0) + setattr(opts, 'force_max_line_length', False) + setattr(opts, 'indent_paras', False) + + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace') + + log.info('Compressing text...') + txt = compress(txt, opts.compression_level) + + out_stream.seek(0) + out_stream.truncate() + out_stream.write(txt) + + if close: + out_stream.close() From d58f7a92d673ba8f5612263a7e76afc719faeffc Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 20 Oct 2009 18:37:22 -0400 Subject: [PATCH 051/120] Fix bug #3806: eReader PDB footnote and sidebar anchors set properly during conversion. 
--- src/calibre/ebooks/pml/pmlconverter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index c72a21a5f9..7133e3f251 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -41,14 +41,14 @@ PML_HTML_RULES = [ (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), - (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.*?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.*?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.*?)\\Fn'), lambda match: '<a href="#fns-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.*?)\\Sd'), lambda match: '<a href="#fns-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), # Just italicize index items as that is how the eReader software renders them. 
(re.compile(r'\\I(?P<text>.*?)\\I', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), # Sidebar and Footnotes - (re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), # eReader files are one paragraph per line. # This forces the lines to wrap properly. @@ -80,5 +80,5 @@ def pml_to_html(pml): def footnote_sidebar_to_html(id, pml): if id.startswith('\x01'): id = id[2:] - html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml)) + html = '<div id="fns-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml)) return html From 8a63b67c7aae0bba9cc40455858438785bb7e17d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 21 Oct 2009 19:49:59 -0400 Subject: [PATCH 052/120] PML Input: Fix \w and \s tags. 
--- src/calibre/ebooks/pml/pmlconverter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 7133e3f251..dafe1e4f6a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -26,9 +26,9 @@ PML_HTML_RULES = [ (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''), (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')), + (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s" />' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), - (re.compile(r'\\s'), lambda match: ''), + (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. 
(re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 175%%">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), From feb4feaae781e0ba42fc7bb39ee5c9f310faf449 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 21 Oct 2009 21:11:17 -0400 Subject: [PATCH 053/120] PML Output: Use set cover as cover or use first image as cover if no cover is specified. --- src/calibre/ebooks/pml/pmlml.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index b6a62e7c1f..7b1813256e 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -95,6 +95,9 @@ class PMLMLizer(object): def get_cover_page(self): output = u'' + if 'cover' in self.oeb_book.guide: + output += '\\m="cover.png"\n' + self.image_hrefs[self.oeb_book.guide['cover'].href] = 'cover.png' if 'titlepage' in self.oeb_book.guide: self.log.debug('Generating title page...') href = self.oeb_book.guide['titlepage'].href @@ -191,7 +194,10 @@ class PMLMLizer(object): if tag in IMAGE_TAGS: if elem.attrib.get('src', None): if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): - self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') + if len(self.image_hrefs.keys()) == 0: + self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png' + else: + self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])]) if tag == 'hr': w = '\\w' From 53a97fc98c633f1f8941831939586ee383aa6461 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 23 Oct 2009 
21:11:16 -0400 Subject: [PATCH 054/120] PML Output: Add .png to image names. Fix removing excessive newlines from PML output. PMLZ Output: Name images correctly. --- src/calibre/ebooks/pdb/ereader/__init__.py | 6 +-- src/calibre/ebooks/pml/output.py | 16 +++---- src/calibre/ebooks/pml/pmlml.py | 50 ++++++++++------------ 3 files changed, 32 insertions(+), 40 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index 3f08b068cb..89560c9448 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -16,11 +16,11 @@ def image_name(name, taken_names=[]): cut = len(name) - 32 names = name[:10] namee = name[10+cut:] - name = names + namee + name = '%s%s.png' % (names, namee) while name in taken_names: - for i in xrange(9999999999999999999999999999999): - name = '%s%s' % (name[:-len('%s' % i)], i) + for i in xrange(999999999999999999999999999): + name = '%s%s.png' % (name[:-len('%s' % i)], i) name = name.ljust(32, '\x00')[:32] diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index 360e63c98e..774fc4c8d1 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -18,7 +18,7 @@ from calibre.customize.conversion import OutputFormatPlugin from calibre.customize.conversion import OptionRecommendation from calibre.ptempfile import TemporaryDirectory from calibre.utils.zipfile import ZipFile -from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES from calibre.ebooks.pml.pmlml import PMLMLizer class PMLOutput(OutputFormatPlugin): @@ -40,28 +40,26 @@ class PMLOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): with TemporaryDirectory('_pmlz_output') as tdir: pmlmlizer = PMLMLizer(log) - content = pmlmlizer.extract_content(oeb_book, opts) + pml = unicode(pmlmlizer.extract_content(oeb_book, opts)) with open(os.path.join(tdir, 
'index.pml'), 'wb') as out: - out.write(content.encode(opts.output_encoding, 'replace')) + out.write(pml.encode(opts.output_encoding, 'replace')) - self.write_images(oeb_book.manifest, tdir) + self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir) log.debug('Compressing output...') pmlz = ZipFile(output_path, 'w') pmlz.add_dir(tdir) - def write_images(self, manifest, out_dir): + def write_images(self, manifest, image_hrefs, out_dir): for item in manifest: - if item.media_type in OEB_IMAGES: + if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys(): im = Image.open(cStringIO.StringIO(item.data)) data = cStringIO.StringIO() im.save(data, 'PNG') data = data.getvalue() - name = os.path.splitext(os.path.basename(item.href))[0] + '.png' - path = os.path.join(out_dir, name) + path = os.path.join(out_dir, image_hrefs[item.href]) with open(path, 'wb') as out: out.write(data) - diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 7b1813256e..862f0ea0ae 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' Transform OEB content into PML markup ''' -import os import re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -138,16 +137,13 @@ class PMLMLizer(object): aid = self.link_hrefs[aid] return u'\\Q="%s"' % aid + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + return text + def clean_text(self, text): - # Remove excess spaces at beginning and end of lines - text = re.sub('(?m)^[ ]+', '', text) - text = re.sub('(?m)[ ]+$', '', text) - - # Remove excessive newlines - text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) - text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) - text = re.sub('[ ]{2,}', ' ', text) - # Remove excessive \p tags text = re.sub(r'\\p\s*\\p', '', 
text) @@ -166,6 +162,17 @@ class PMLMLizer(object): # Turn all unicode characters into their PML hex equivelent text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text) + # Remove excess spaces at beginning and end of lines + text = re.sub('(?m)^[ ]+', '', text) + text = re.sub('(?m)[ ]+$', '', text) + + # Remove excessive spaces + text = re.sub('[ ]{2,}', ' ', text) + + # Remove excessive newlines + text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub('\n\n\n+', '\n\n', text) + return text def dump_text(self, elem, stylizer, page, tag_stack=[]): @@ -197,7 +204,7 @@ class PMLMLizer(object): if len(self.image_hrefs.keys()) == 0: self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png' else: - self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') + self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])]) if tag == 'hr': w = '\\w' @@ -251,7 +258,7 @@ class PMLMLizer(object): # Proccess tags that contain text. 
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text.append(self.elem_text(elem, tag_stack)) + text.append(self.remove_newlines(elem.text)) for item in elem: text += self.dump_text(item, stylizer, page, tag_stack) @@ -261,32 +268,19 @@ class PMLMLizer(object): close_tag_list.insert(0, tag_stack.pop()) text += self.close_tags(close_tag_list) if tag in SEPARATE_TAGS: - text.append(os.linesep + os.linesep) + text.append('\n\n') if 'block' not in tag_stack: - text.append(os.linesep + os.linesep) + text.append('\n\n') #if style['page-break-after'] == 'always': # text.append('\\p') if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': - text.append(self.elem_tail(elem, tag_stack)) + text.append(self.remove_newlines(elem.tail)) return text - def elem_text(self, elem, tag_stack): - return self.block_text(elem.text, 'block' in tag_stack) - - def elem_tail(self, elem, tag_stack): - return self.block_text(elem.tail, 'block' in tag_stack) - - def block_text(self, text, in_block): - if in_block: - text = text.replace('\n\r', ' ') - text = text.replace('\n', ' ') - text = text.replace('\r', ' ') - return text - def close_tags(self, tags): text = [u''] for i in range(0, len(tags)): From 4d3af2132d481c0994476dc1c25943ac0bea5428 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 25 Oct 2009 10:16:44 -0400 Subject: [PATCH 055/120] Fix bug with removing spaces to agressively. 
--- src/calibre/ebooks/fb2/fb2ml.py | 4 ++-- src/calibre/ebooks/pml/pmlml.py | 4 ++-- src/calibre/ebooks/rb/rbml.py | 4 ++-- src/calibre/ebooks/txt/txtml.py | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index aaf8361b99..16c822d263 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -241,7 +241,7 @@ class FB2MLizer(object): if not fb2_text or fb2_text[-1] != ' ': fb2_text.append(' ') - if hasattr(elem, 'text') and elem.text != None: + if hasattr(elem, 'text') and elem.text: if 'p' not in tag_stack: fb2_text.append('<p>%s</p>' % prepare_string_for_xml(elem.text)) else: @@ -255,7 +255,7 @@ class FB2MLizer(object): close_tag_list.insert(0, tag_stack.pop()) fb2_text += self.close_tags(close_tag_list) - if hasattr(elem, 'tail') and elem.tail != None: + if hasattr(elem, 'tail') and elem.tail: if 'p' not in tag_stack: fb2_text.append('<p>%s</p>' % prepare_string_for_xml(elem.tail)) else: diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 862f0ea0ae..27e88eb48b 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -257,7 +257,7 @@ class PMLMLizer(object): # margin # Proccess tags that contain text. 
- if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + if hasattr(elem, 'text') and elem.text: text.append(self.remove_newlines(elem.text)) for item in elem: @@ -276,7 +276,7 @@ class PMLMLizer(object): #if style['page-break-after'] == 'always': # text.append('\\p') - if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + if hasattr(elem, 'tail') and elem.tail: text.append(self.remove_newlines(elem.tail)) return text diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index 945e21c994..c293880343 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -191,7 +191,7 @@ class RBMLizer(object): tag_stack.append(style_tag) # Proccess tags that contain text. - if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + if hasattr(elem, 'text') and elem.text: text.append(prepare_string_for_xml(elem.text)) for item in elem: @@ -203,7 +203,7 @@ class RBMLizer(object): text += self.close_tags(close_tag_list) - if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + if hasattr(elem, 'tail') and elem.tail: text.append(prepare_string_for_xml(elem.tail)) return text diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 59c3ea671a..45383675b4 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -94,7 +94,7 @@ class TXTMLizer(object): text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text) # Remove multiple spaces. - text = re.sub('[ ]+', ' ', text) + text = re.sub('[ ]{2,}', ' ', text) # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) @@ -172,15 +172,15 @@ class TXTMLizer(object): # Are we in a paragraph block? 
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: in_block = True - if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text: text.append(u'\n\n') if tag in SPACE_TAGS: - if not end.endswith('u ') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + if not end.endswith('u ') and hasattr(elem, 'text') and elem.text: text.append(u' ') # Process tags that contain text. - if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + if hasattr(elem, 'text') and elem.text: text.append(elem.text) for item in elem: @@ -192,7 +192,7 @@ class TXTMLizer(object): if in_block: text.append(u'\n\n') - if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + if hasattr(elem, 'tail') and elem.tail: text.append(elem.tail) return text From abe52807cb074c5ed10622ca303146b49c3ce630 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 26 Oct 2009 19:16:52 -0400 Subject: [PATCH 056/120] Add to conversion documentation. --- src/calibre/manual/conversion.rst | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 1f23d43419..ac25dc8121 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -306,6 +306,11 @@ the headers and footers and remove them. Remember that they operate on the inter by the conversion pipeline. There is also a wizard to help you customize the regular expressions for your document. +The header and footer regular expressions are used in conjunction with the remove header and footer options. +If the remove option is not enabled the regular expression will not be applied to remove the matched text. +As stated the removal works by using a python regular expression. All matched text is simply removed from +the document. 
You can learn more about regular expressions and the syntax at http://docs.python.org/library/re.html + Miscellaneous ~~~~~~~~~~~~~~ @@ -419,7 +424,61 @@ generating the Table of Contents much simpler. It is called BookCreator and is a Convert TXT documents ~~~~~~~~~~~~~~~~~~~~~~ +TXT documents can contain any imaginable layout. Since TXT documents provide no way to explicitly mark parts of +the text, by default |app| only groups parts of the document into paragraphs. The default is to assume one or +more blank lines are a paragraph boundary. + +.. code-block:: txt + + This is the first. + + This is the + second paragraph. + +TXT input supports a number of options to differentiate how paragraphs are detected. + +:guilabel:`Treat each line as a paragraph` + Assumes that every line is a paragraph. + + .. code-block:: txt + + This is the first. + This is the second. + This is the third. + +:guilabel:`Assume print formatting` + Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when + the next line that starts with an indent is reached. + + .. code-block:: txt + + This is the + first. + This is the second. + + This is the + third. + +:guilabel:`Process using markdown` + |app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown + allows for basic formatting to be added to the document and is an easy way to differentiate non-paragraph + elements such as chapter headings. Marking chapter headings with # and setting the chapter XPath detection + expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document. + You can learn more about the markdown syntax at http://daringfireball.net/projects/markdown/syntax + Convert PDF documents ~~~~~~~~~~~~~~~~~~~~~~ +PDF documents are one of the worst formats to convert from. They are a fixed page size and text placement format. 
+Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap +paragraphs using a configurable :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length at which a line should be unwrapped. Valid values are a decimal +between 0 and 1. The default is 0.5, which is the median line length. Lower this value to include more +text in the unwrapping. Increase to include less. + +Also, they often have headers and footers as part of the document that will become included with the text. +Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not +removed from the text it can throw off the paragraph unwrapping. + +Some limitations of PDF input: complex, multi-column, and image-based documents are not supported. +Extraction of SVG images from within the document is also not supported. From 68e3acd43ab05ae8ba417b35840e067b0115ca10 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 26 Oct 2009 19:55:10 -0400 Subject: [PATCH 057/120] Get header and footer regex matching working better. 
--- src/calibre/ebooks/conversion/preprocess.py | 11 +++++------ src/calibre/ebooks/oeb/iterator.py | 4 ++-- src/calibre/gui2/convert/regex_builder.py | 6 +++--- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 39ca28e87f..b63c7ca861 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -228,17 +228,16 @@ class HTMLPreProcessor(object): else: rules = [] - pre_rules = [] + end_rules = [] if getattr(self.extra_opts, 'remove_header', None): - pre_rules.append( + end_rules.append( (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') ) if getattr(self.extra_opts, 'remove_footer', None): - pre_rules.append( + end_rules.append( (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') ) - - end_rules = [] + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) if length: @@ -247,7 +246,7 @@ class HTMLPreProcessor(object): (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) - for rule in self.PREPROCESS + pre_rules + rules + end_rules: + for rule in self.PREPROCESS + rules + end_rules: html = rule[0].sub(rule[1], html) # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 762b14c3e5..565ceed519 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -123,7 +123,7 @@ class EbookIterator(object): else: print 'Loaded embedded font:', repr(family) - def __enter__(self, raw_only=False): + def __enter__(self, processed=False): self.delete_on_exit = [] self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() @@ -140,7 +140,7 @@ class EbookIterator(object): plumber.opts, plumber.input_fmt, 
self.log, {}, self.base) - if not raw_only and plumber.input_fmt.lower() in ('pdf', 'rb'): + if processed or plumber.input_fmt.lower() in ('pdf', 'rb'): self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts, plumber.input_plugin) if hasattr(self.pathtoopf, 'manifest'): diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index 20da8d7aaf..b1d8fbcbd5 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -87,12 +87,12 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): def open_book(self, pathtoebook): self.iterator = EbookIterator(pathtoebook) - self.iterator.__enter__(raw_only=True) + self.iterator.__enter__(processed=True) text = [u''] for path in self.iterator.spine: - html = open(path, 'rb').read().decode(path.encoding, 'replace') + html = open(path, 'rb').read().decode('utf-8', 'replace') text.append(html) - self.preview.setPlainText('\n\n'.join(text)) + self.preview.setPlainText('\n---\n'.join(text)) def button_clicked(self, button): if button == self.button_box.button(QDialogButtonBox.Open): From 66f7802f9ebe3464f9de2ce4013d98637329261e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 27 Oct 2009 07:44:55 -0400 Subject: [PATCH 058/120] Replace non-breaking spaces with spaces. --- src/calibre/ebooks/pml/pmlml.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 27e88eb48b..aa608496c7 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -153,6 +153,10 @@ class PMLMLizer(object): for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') + # Replace bad characters. + text = text.replace(u'\xc2', '') + text = text.replace(u'\xa0', ' ') + # Turn all html entities into unicode. This should not be necessary as # lxml should have already done this but we want to be sure it happens. 
for entity in set(re.findall('&.+?;', text)): From 4cc5e18606af68984d61894f43d6af82e789ca3e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 31 Oct 2009 11:33:30 -0400 Subject: [PATCH 059/120] Remove stray setup. Revert create_upload_path to work with the custom path when sending to device. --- setup/installer/osx/freeze.py | 1 - src/calibre/devices/usbms/device.py | 53 ++++------------------------- 2 files changed, 7 insertions(+), 47 deletions(-) diff --git a/setup/installer/osx/freeze.py b/setup/installer/osx/freeze.py index f30a037703..bc764d25d2 100644 --- a/setup/installer/osx/freeze.py +++ b/setup/installer/osx/freeze.py @@ -10,7 +10,6 @@ from setup import __version__ as VERSION, __appname__ as APPNAME, SRC, Command, try: from setuptools import setup except: - setup class setup: pass diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 0799f6779b..86f8de8e39 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -702,52 +702,13 @@ class Device(DeviceConfig, DevicePlugin): raise FreeSpaceError(_("There is insufficient free space on the storage card")) return path - def create_upload_path(self, path, mdata, fname): - path = os.path.abspath(path) - newpath = path - extra_components = [] - - if self.SUPPORTS_SUB_DIRS and self.settings().use_subdirs: - if 'tags' in mdata.keys(): - for tag in mdata['tags']: - if tag.startswith(_('News')): - extra_components.append('news') - c = sanitize(mdata.get('title', '')) - if c: - extra_components.append(c) - c = sanitize(mdata.get('timestamp', '')) - if c: - extra_components.append(c) - break - elif tag.startswith('/'): - for c in tag.split('/'): - c = sanitize(c) - if not c: continue - extra_components.append(c) - break - - if not extra_components: - c = sanitize(mdata.get('authors', _('Unknown'))) - if c: - extra_components.append(c) - c = sanitize(mdata.get('title', _('Unknown'))) - if c: - 
extra_components.append(c) - newpath = os.path.join(newpath, c) - - fname = sanitize(fname) - extra_components.append(fname) - extra_components = [str(x) for x in extra_components] - def remove_trailing_periods(x): - ans = x - while ans.endswith('.'): - ans = ans[:-1] - if not ans: - ans = 'x' - return ans - extra_components = list(map(remove_trailing_periods, extra_components)) - components = shorten_components_to(250 - len(path), extra_components) - filepath = os.path.join(path, *components) + def create_upload_path(self, root, mdata, ext, id): + from calibre.library.save_to_disk import config, get_components + opts = config().parse() + components = get_components(opts.template, mdata, id, opts.timefmt, 250) + components = [str(x) for x in components] + components = shorten_components_to(250 - len(root), components) + filepath = '%s%s' % (os.path.join(root, *components), ext) filedir = os.path.dirname(filepath) if not self.SUPPORTS_SUB_DIRS or not self.settings().use_subdirs: From c6eec96b7ea7e70cd52a82009c761e758743c50e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 31 Oct 2009 12:11:23 -0400 Subject: [PATCH 060/120] Add user specified cover page support to FB2 and RB Output. Fix bug when adding images to RB Output. 
--- src/calibre/ebooks/fb2/fb2ml.py | 3 +++ src/calibre/ebooks/rb/rbml.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 16c822d263..78ecc94681 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -107,6 +107,9 @@ class FB2MLizer(object): def get_cover_page(self): output = u'' + if 'cover' in self.oeb_book.guide: + output += '<image xlink:href="#cover.jpg" />' + self.image_hrefs[self.oeb_book.guide['cover'].href] = 'cover.jpg' if 'titlepage' in self.oeb_book.guide: self.log.debug('Generating cover page...') href = self.oeb_book.guide['titlepage'].href diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index c293880343..5574aa94b6 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -82,13 +82,16 @@ class RBMLizer(object): def get_cover_page(self): output = u'' + if 'cover' in self.oeb_book.guide: + if self.name_map.get(self.oeb_book.guide['cover'].href, None): + output += '<IMG SRC="%s">' % self.name_map[self.oeb_book.guide['cover'].href] if 'titlepage' in self.oeb_book.guide: self.log.debug('Generating cover page...') href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output def get_toc(self): @@ -152,7 +155,7 @@ class RBMLizer(object): if tag in IMAGE_TAGS: if elem.attrib.get('src', None): if page.abshref(elem.attrib['src']) not in self.name_map.keys(): - self.name_map[page.abshref(elem.attrib['src'])] = unique_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys(), self.name_map.keys()) + self.name_map[page.abshref(elem.attrib['src'])] = unique_name('%s' % 
len(self.name_map.keys()), self.name_map.keys()) text.append('<IMG SRC="%s">' % self.name_map[page.abshref(elem.attrib['src'])]) rb_tag = tag.upper() if tag in TAGS else None From f11d876a9d99b3ee90d780a02e116d7c36d98496 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 11 Nov 2009 12:07:56 -0500 Subject: [PATCH 061/120] Integrate WayneD's PML input image fix. --- src/calibre/ebooks/pml/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index 270c8a7b0f..4128e4aa38 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -79,7 +79,7 @@ class PMLInput(InputFormatPlugin): pimg_name = os.path.basename(img) pimg_path = os.path.join(os.getcwd(), 'images', pimg_name) - images.append(pimg_name) + images.append('images/' + pimg_name) shutil.move(img, pimg_path) else: From 3fba659ba3421cc92f229ef2495f32621be7dd98 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 11 Nov 2009 12:19:57 -0500 Subject: [PATCH 062/120] Integrate WayneD's PML input parsing fixes. 
--- src/calibre/ebooks/pml/input.py | 16 ++++++- src/calibre/ebooks/pml/pmlconverter.py | 58 ++++++++++++++++---------- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index 4128e4aa38..f2d00742ba 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -42,9 +42,23 @@ class PMLInput(InputFormatPlugin): if self.options.input_encoding: ienc = self.options.input_encoding + style = ''' +<style> +.s {font-size: 1em} +.l {font-size: 1.5em} +.k {font-size: 0.75em} +.c {text-align: center; margin: auto} +.r {text-align: right} +.t {margin-left: 5%} +.p {page-break-after: always} +.x {page-break-before: always} +</style> +''' self.log.debug('Converting PML to HTML...') html = pml_to_html(pml_stream.read().decode(ienc)) - html_stream.write('<html><head><title /></head><body>' + html.encode('utf-8', 'replace') + '</body></html>') + html_stream.write('<html><head><title />%s</head><body>' % style) + html_stream.write(html.encode('utf-8', 'replace')) + html_stream.write('</body></html>') if pclose: pml_stream.close() diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index dafe1e4f6a..1b42f99cc1 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -14,27 +14,38 @@ from calibre import my_unichr from calibre.ebooks.pdb.ereader import image_name PML_HTML_RULES = [ - (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'), - (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), + # Any literal <, &, 
and > chars be escaped to avoid HTML issues (though + # <footnote> and <sidebar> tags are handled specially later). + (re.compile(r'&'), lambda match: '&'), + (re.compile(r'<'), lambda match: '<'), + (re.compile(r'>'), lambda match: '>'), + + # NOP-process all \x escapes, turning \\ into \ This keeps the regex + # parsing simple while making sure that we don't try to honor \\x as \x + # (and also makes sure we DO honor \\\x as \ followed by \x). + (re.compile(r'\\(.)'), lambda match: '\' if match.group(1) == '\\' else '\\' + match.group(1)), + + (re.compile(r'\\p'), lambda match: '<br /><br class="p" />'), + (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 class="x">%s</h1>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<span style="text-align: center; display: block; margin: auto;">%s</span>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<span style="text-align: right; display: block;">%s</span>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div class="c">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div class="r">%s</div>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<span style="text-decoration: underline;">%s</span>' % match.group('text') if 
match.group('text') else ''), + (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<u>%s</u>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s" />' % match.group('val')), + (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div class="t">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''), + (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), - (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span class="s">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. 
- (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 175%%">%s</span>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span class="l">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''), + (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span class="k">%s</span>' % match.group('text').upper() if match.group('text') else ''), (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')), @@ -47,8 +58,8 @@ PML_HTML_RULES = [ (re.compile(r'\\I(?P<text>.*?)\\I', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), # Sidebar and Footnotes - (re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + 
(re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), # eReader files are one paragraph per line. # This forces the lines to wrap properly. @@ -58,16 +69,17 @@ PML_HTML_RULES = [ # Ensure empty lines carry over. (re.compile('(\r\n|\n|\r){3}'), lambda match: '<br />'), - # Remove unmatched plm codes. - (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''), - (re.compile(r'(?<=[^\\])\\X[0-4]'), lambda match: ''), - (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''), - (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''), - # Remove invalid single item pml codes. - (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''), + # Try to fix some of the misordering of character-attribute tags. + (re.compile(r'(?P<ch>(<(i|u|b|del|sup|sub)( [^>]+)?>)+)(?P<close>(</(div|span)>)+)'), lambda match: match.group('close') + match.group('ch')), + (re.compile(r'(?P<ch>(<(i|u|b|del|sup|sub|span)( [^>]+)?>)+)(?P<blk>(<(div|h\d)( [^>]+)?>)+)'), lambda match: match.group('blk') + match.group('ch')), - # Replace \\ with \. - (re.compile(r'\\\\'), lambda match: '\\'), + # Remove unmatched plm codes. + (re.compile(r'\\X[0-4]'), lambda match: ''), + (re.compile(r'\\T="\d+%*"'), lambda match: ''), + (re.compile(r'\\Sp'), lambda match: ''), + (re.compile(r'\\Sb'), lambda match: ''), + # Remove invalid single item pml codes. + (re.compile(r'\\.'), lambda match: ''), ] def pml_to_html(pml): From 155fd8a9e62a8d41f3b5d88cb03ddc787aac2125 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 12 Nov 2009 13:23:10 -0500 Subject: [PATCH 063/120] PML metadata reader. 
--- src/calibre/customize/builtins.py | 11 +++++++ src/calibre/ebooks/metadata/pml.py | 53 ++++++++++++++++++++++++++++++ src/calibre/ebooks/pml/input.py | 3 +- 3 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/metadata/pml.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 22ae0d4b04..061a4409a6 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -197,6 +197,17 @@ class PDFMetadataReader(MetadataReaderPlugin): return get_quick_metadata(stream) return get_metadata(stream) +class PMLMetadataReader(MetadataReaderPlugin): + + name = 'Read PML metadata' + file_types = set(['pml', 'pmlz']) + description = _('Read metadata from %s files') % 'PML' + author = 'John Schember' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.pml import get_metadata + return get_metadata(stream) + class RARMetadataReader(MetadataReaderPlugin): name = 'Read RAR metadata' diff --git a/src/calibre/ebooks/metadata/pml.py b/src/calibre/ebooks/metadata/pml.py new file mode 100644 index 0000000000..57ca29172a --- /dev/null +++ b/src/calibre/ebooks/metadata/pml.py @@ -0,0 +1,53 @@ +'''Read meta information from TXT files''' + +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' + +import os +import glob +import re + +from calibre.ebooks.metadata import MetaInformation +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.zipfile import ZipFile + +def get_metadata(stream, extract_cover=True): + """ Return metadata as a L{MetaInfo} object """ + mi = MetaInformation(_('Unknown'), [_('Unknown')]) + stream.seek(0) + + pml = '' + if stream.name.endswith('.pmlz'): + with TemporaryDirectory('_unpmlz') as tdir: + zf = ZipFile(stream) + zf.extractall(tdir) + + pmls = glob.glob(os.path.join(tdir, '*.pml')) + for p in pmls: + with open(p, 'r+b') as p_stream: + pml += p_stream.read() + 
else: + pml = stream.read() + + for comment in re.findall(r'(?mus)\\v.*?\\v', pml): + m = re.search(r'TITLE="(.*?)"', comment) + if m: + mi.title = m.group(1).strip().decode('cp1252', 'replace') + m = re.search(r'AUTHOR="(.*?)"', comment) + if m: + if mi.authors == [_('Unknown')]: + mi.authors = [] + mi.authors.append(m.group(1).strip().decode('cp1252', 'replace')) + m = re.search(r'PUBLISHER="(.*?)"', comment) + if m: + mi.publisher = m.group(1).strip().decode('cp1252', 'replace') + m = re.search(r'COPYRIGHT="(.*?)"', comment) + if m: + mi.rights = m.group(1).strip().decode('cp1252', 'replace') + m = re.search(r'ISBN="(.*?)"', comment) + if m: + mi.isbn = m.group(1).strip().decode('cp1252', 'replace') + + return mi diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index f2d00742ba..2475e40c34 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -31,6 +31,7 @@ class PMLInput(InputFormatPlugin): pclose = True else: pml_stream = pml_path + pml_stream.seek(0) if not hasattr(html_path, 'write'): html_stream = open(html_path, 'wb') @@ -38,7 +39,7 @@ class PMLInput(InputFormatPlugin): else: html_stream = html_path - ienc = pml_stream.encoding if pml_stream.encoding else 'utf-8' + ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252' if self.options.input_encoding: ienc = self.options.input_encoding From c2d9b2e6ad47409cf135d7b619bf5aea96b6cfa5 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 13 Nov 2009 11:41:09 -0500 Subject: [PATCH 064/120] Fix bug #3941: Handle input with <br> separated paragraphs better. 
--- src/calibre/ebooks/fb2/fb2ml.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 78ecc94681..41b93f6d6b 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -30,7 +30,7 @@ TAG_MAP = { 'i' : 'emphasis', 'p' : 'p', 'li' : 'p', - 'br' : 'empty-line', + 'br' : 'p', } TAG_SPACE = [ @@ -227,8 +227,14 @@ class FB2MLizer(object): fb2_text.append(self.get_anchor(page, id_name)) fb2_tag = TAG_MAP.get(tag, None) - if fb2_tag and fb2_tag not in tag_stack: - tag_count += 1 + if fb2_tag: + if fb2_tag not in tag_stack: + tag_count += 1 + else: + tag_stack.reverse() + tag_stack.remove(fb2_tag) + tag_stack.reverse() + fb2_text.append('</%s>' % fb2_tag) fb2_text.append('<%s>' % fb2_tag) tag_stack.append(fb2_tag) From 4d5f1894b158802b6c48911918e1308058361f84 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 14 Nov 2009 16:54:21 -0500 Subject: [PATCH 065/120] Fix pml parsing changes as they break PDB eReader input badly. 
--- src/calibre/ebooks/pml/input.py | 16 +--------------- src/calibre/ebooks/pml/pmlconverter.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index 2475e40c34..ead6c988f4 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -43,23 +43,9 @@ class PMLInput(InputFormatPlugin): if self.options.input_encoding: ienc = self.options.input_encoding - style = ''' -<style> -.s {font-size: 1em} -.l {font-size: 1.5em} -.k {font-size: 0.75em} -.c {text-align: center; margin: auto} -.r {text-align: right} -.t {margin-left: 5%} -.p {page-break-after: always} -.x {page-break-before: always} -</style> -''' self.log.debug('Converting PML to HTML...') html = pml_to_html(pml_stream.read().decode(ienc)) - html_stream.write('<html><head><title />%s</head><body>' % style) - html_stream.write(html.encode('utf-8', 'replace')) - html_stream.write('</body></html>') + html_stream.write('<html><head><title /></head><body>%s</body></html>' % html.encode('utf-8', 'replace')) if pclose: pml_stream.close() diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 1b42f99cc1..140317c9df 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -25,27 +25,27 @@ PML_HTML_RULES = [ # (and also makes sure we DO honor \\\x as \ followed by \x). 
(re.compile(r'\\(.)'), lambda match: '\' if match.group(1) == '\\' else '\\' + match.group(1)), - (re.compile(r'\\p'), lambda match: '<br /><br class="p" />'), - (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 class="x">%s</h1>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'), + (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div class="c">%s</div>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div class="r">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right;">%s</div>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<u>%s</u>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if 
match.group('text') else ''), - (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div class="t">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%;">%s</div>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''), (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), - (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span class="s">%s</span>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span style="font-size: 1em;">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. 
- (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span class="l">%s</span>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 1.5em;">%s</span>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span class="k">%s</span>' % match.group('text').upper() if match.group('text') else ''), + (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 0.75em;">%s</span>' % match.group('text').upper() if match.group('text') else ''), (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')), From fb7ecb5cf21738ebc37cb89ac25c64144dfcae24 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 15 Nov 2009 19:38:12 -0500 Subject: [PATCH 066/120] Fix typos. 
--- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/pml/pmlconverter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index b63c7ca861..29ce0e4296 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -243,7 +243,7 @@ class HTMLPreProcessor(object): if length: end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + rules + end_rules: diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 140317c9df..1505e5fc4b 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -35,7 +35,7 @@ PML_HTML_RULES = [ (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<u>%s</u>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%;">%s</div>' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''), (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else 
''), (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), From 2bdc1afa259044ab94e583d3831445bb31e1c6b8 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 21 Nov 2009 20:48:35 -0500 Subject: [PATCH 067/120] tweaks. --- src/calibre/ebooks/pdb/ereader/reader.py | 4 ++-- src/calibre/ebooks/pml/pmlconverter.py | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 77ca8d6933..ad1df98793 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -31,5 +31,5 @@ class Reader(FormatReader): def dump_pml(self): return self.reader.dump_pml() - def dump_images(self): - return self.reader.dump_images() + def dump_images(self, out_dir): + return self.reader.dump_images(out_dir) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 1505e5fc4b..b0d9ce1ec7 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -10,16 +10,10 @@ __docformat__ = 'restructuredtext en' import re -from calibre import my_unichr +from calibre import my_unichr, prepare_string_for_xml from calibre.ebooks.pdb.ereader import image_name PML_HTML_RULES = [ - # Any literal <, &, and > chars be escaped to avoid HTML issues (though - # <footnote> and <sidebar> tags are handled specially later). - (re.compile(r'&'), lambda match: '&'), - (re.compile(r'<'), lambda match: '<'), - (re.compile(r'>'), lambda match: '>'), - # NOP-process all \x escapes, turning \\ into \ This keeps the regex # parsing simple while making sure that we don't try to honor \\x as \x # (and also makes sure we DO honor \\\x as \ followed by \x). 
@@ -83,7 +77,7 @@ PML_HTML_RULES = [ ] def pml_to_html(pml): - html = pml + html = prepare_string_for_xml(pml) for rule in PML_HTML_RULES: html = rule[0].sub(rule[1], html) From c569ba843fbfc01013ddd0c10683c7bcd2294169 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 21 Nov 2009 21:20:15 -0500 Subject: [PATCH 068/120] Line oriented simple state machine for PML parsing. --- src/calibre/ebooks/pml/pmlconverter.py | 435 +++++++++++++++++++++---- 1 file changed, 373 insertions(+), 62 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index b0d9ce1ec7..05cf488617 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -9,79 +9,390 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' import re +import StringIO from calibre import my_unichr, prepare_string_for_xml from calibre.ebooks.pdb.ereader import image_name -PML_HTML_RULES = [ - # NOP-process all \x escapes, turning \\ into \ This keeps the regex - # parsing simple while making sure that we don't try to honor \\x as \x - # (and also makes sure we DO honor \\\x as \ followed by \x). 
- (re.compile(r'\\(.)'), lambda match: '\' if match.group(1) == '\\' else '\\' + match.group(1)), +class PML_HTMLizer(object): - (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'), - (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), - (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right;">%s</div>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<u>%s</u>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % 
match.group('val')), - (re.compile(r'\\n'), lambda match: ''), - (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span style="font-size: 1em;">%s</span>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. - (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 1.5em;">%s</span>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 0.75em;">%s</span>' % match.group('text').upper() if match.group('text') else ''), - (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')), - (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), - (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')), - (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')), - (re.compile(r'\\-'), lambda match: ''), - (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.*?)\\Fn'), lambda match: '<a href="#fns-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - 
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.*?)\\Sd'), lambda match: '<a href="#fns-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - # Just italicize index items as that is how the eReader software renders them. - (re.compile(r'\\I(?P<text>.*?)\\I', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''), + STATES = [ + 'i', + 'u', + 'd', + 'b', + 'sp', + 'sb', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'a', + 'c', + 'r', + 't', + 's', + 'l', + 'T', + 'Fn', + 'Sd', + 'FS' + ] - # Sidebar and Footnotes - (re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="fns-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''), + STATES_VALUE_REQ = [ + 'a', + 'T', + 'FS' + ] - # eReader files are one paragraph per line. - # This forces the lines to wrap properly. - (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')), - # Remove empty <p>'s. - (re.compile('<p>[ ]*</p>'), lambda match: ''), - # Ensure empty lines carry over. 
- (re.compile('(\r\n|\n|\r){3}'), lambda match: '<br />'), + STATES_TAGS = { + 'h1': ('<h1 style="page-break-after: always;">', '</h1>'), + 'h2': ('<h2>', '</h2>'), + 'h3': ('<h3>', '</h3>'), + 'h4': ('<h4>', '</h4>'), + 'h5': ('<h5>', '</h5>'), + 'h6': ('<h6>', '</h6>'), + 'sp': ('<sup>', '</sup>'), + 'sb': ('<sub>', '</sub>'), + 'a': ('<a href="%s">', '</a>'), + 'c': ('<div style="text-align: center; margin: auto;">', '</div>'), + 'r': ('<div style="text-align: right;">', '</div>'), + 't': ('<div style="margin-left: 5%;">', '</div>'), + 'T': ('<div style="margin-left: %s;">', '</div>'), + 'i': ('<span style="font-style : italic;">', '</span>'), + 'u': ('<span style="text-decoration : underline;">', '</span>'), + 'd': ('<span style="text-decoration: line-through;">', '</span>'), + 'b': ('<span style="font-weight: bold;">', '</span>'), + 's': ('<span style="font-size: 75%">', '</span>'), + 'l': ('<span style="font-size: 125%">', '</span>'), + 'FS': ('<div id="%s">', '</div>'), + } - # Try to fix some of the misordering of character-attribute tags. - (re.compile(r'(?P<ch>(<(i|u|b|del|sup|sub)( [^>]+)?>)+)(?P<close>(</(div|span)>)+)'), lambda match: match.group('close') + match.group('ch')), - (re.compile(r'(?P<ch>(<(i|u|b|del|sup|sub|span)( [^>]+)?>)+)(?P<blk>(<(div|h\d)( [^>]+)?>)+)'), lambda match: match.group('blk') + match.group('ch')), + CODE_STATES = { + 'q': 'a', + 'x': 'h1', + 'X0': 'h2', + 'X1': 'h3', + 'X2': 'h4', + 'X3': 'h5', + 'X4': 'h6', + 'Sp': 'sp', + 'Sb': 'sb', + 'c': 'c', + 'r': 'r', + 't': 't', + 'T': 'T', + 'i': 'i', + 'I': 'i', + 'u': 'u', + 'o': 'd', + 'b': 'b', + 'B': 'b', + 's': 's', + 'l': 'l', + 'Fn': 'a', + 'Sd': 'a', + 'FN': 'FS', + 'SB': 'FS', + } - # Remove unmatched plm codes. - (re.compile(r'\\X[0-4]'), lambda match: ''), - (re.compile(r'\\T="\d+%*"'), lambda match: ''), - (re.compile(r'\\Sp'), lambda match: ''), - (re.compile(r'\\Sb'), lambda match: ''), - # Remove invalid single item pml codes. 
- (re.compile(r'\\.'), lambda match: ''), -] + DIV_STATES = [ + 'c', + 'r', + 't', + 'T', + 'FS', + ] -def pml_to_html(pml): - html = prepare_string_for_xml(pml) - for rule in PML_HTML_RULES: - html = rule[0].sub(rule[1], html) + SPAN_STATES = [ + 's', + 'l', + 'i', + 'u', + 'd', + 'b', + ] - return html + def __init__(self, close_all): + self.close_all = close_all + self.state = {} + + def prepare_pml(self, pml): + # Remove comments + pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml) + # Footnotes and Sidebars + pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + + pml = prepare_string_for_xml(pml) + + pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml) + pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml) + + return pml + + def prepare_line(self, line): + line = re.sub(r'[ ]{2,}', ' ', line) + line = re.sub(r'^[ ]*(?=.)', '', line) + line = re.sub(r'(?<=.)[ ]*$', '', line) + line = re.sub(r'^[ ]*$', '', line) + + return line + + def start_line(self): + start = u'' + + for key, val in self.state.items(): + if val[0]: + if key not in self.STATES_VALUE_REQ: + start += self.STATES_TAGS[key][0] + else: + start += self.STATES_TAGS[key][0] % val[1] + + return u'<p>%s' % start + + def end_line(self): + end = u'' + + for key, val in self.state.items(): + if val[0]: + if key == 'T': + self.state['T'][0] = False + end += self.STATES_TAGS[key][1] + + return u'%s</p>' % end + + def process_code_simple(self, code): + if code not in self.CODE_STATES.keys(): + return u'' + + text = u'' + + if self.state[self.CODE_STATES[code]][0]: + 
text = self.STATES_TAGS[self.CODE_STATES[code]][1] + else: + text = self.STATES_TAGS[self.CODE_STATES[code]][0] + + self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] + + return text + + def process_code_link(self, stream, pre=''): + text = u'' + + href = self.code_value(stream) + if pre: + href = '#%s-%s' % (pre, href) + + if self.state['a'][0]: + text = self.STATES_TAGS['a'][1] + else: + text = self.STATES_TAGS['a'][0] % href + self.state['a'][1] = href + + self.state['a'][0] = not self.state['a'][0] + + return text + + def process_code_div_span(self, code, stream): + if self.close_all: + return self.process_code_div_span_call(code, stream) + else: + return self.process_code_div_span_ind(code, stream) + + def process_code_div_span_ind(self, code, stream): + text = u'' + ds = [] + + code = self.CODE_STATES[code] + + if code in self.DIV_STATES: + ds = self.DIV_STATES[:] + elif code in self.SPAN_STATES: + ds = self.SPAN_STATES[:] + + if self.state[code][0]: + # Close all. + for c in ds: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Reopen the based on state. + del ds[ds.index(code)] + for c in ds: + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text = self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + self.state[code][0] = not self.state[code][0] + + return text + + def process_code_div_span_call(self, code, stream): + text = u'' + divs = self.DIV_STATES[:] + spans = self.SPAN_STATES[:] + + code = self.CODE_STATES[code] + + if self.state[code][0]: + # Close all divs then spans. + for c in spans+divs: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Reopen the based on state. 
Open divs then spans + if code in self.DIV_STATES: + del divs[divs.index(code)] + if code in self.SPAN_STATES: + del spans[spans.index(code)] + for c in divs+spans: + if state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text = self.STATES_TAGS[code][0] % val + state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + self.state[code][0] = not self.state[code][0] + + return text + + def code_value(self, stream): + value = u'' + open = False + + c = stream.read(1) + while c != '': + if open and c != '"': + value += c + if c == '"': + if not open: + open = True + else: + break + c = stream.read(1) + + return value.strip() + + def parse_pml(self, pml): + pml = self.prepare_pml(pml) + output = [] + + self.state = {} + for s in self.STATES: + self.state[s] = [False, '']; + + for line in pml.splitlines(): + if not line: + continue + parsed = [] + empty = True + + # Must use StringIO, cStringIO does not support unicode + line = StringIO.StringIO(self.prepare_line(line)) + parsed.append(self.start_line()) + + c = line.read(1) + while c != '': + text = u'' + + if c == '\\': + c = line.read(1) + + if c == 'x': + text = self.process_code_simple(c) + elif c in 'XS': + l = line.read(1) + if '%s%s' % (c, l) == 'Sd': + text = self.process_code_link(line, 'fns') + elif '%s%s' % (c, l) == 'SB': + text = self.process_code_div_span('SB', line) + else: + text = self.process_code_simple('%s%s' % (c, l)) + elif c == 'q': + text = self.process_code_link(line) + elif c in 'crtTiIuobB': + text = self.process_code_div_span(c, line) + elif c in 'sl': + close = u'' + if c == 's' and self.state['l']: + close = self.process_code_div_span('l', line) + if c == 'l' and self.state['s']: + close = self.process_code_div_span('s', line) + text = self.process_code_div_span(c, line) + text = close+text + elif c 
== 'm': + empty = False + src = self.code_value(line) + text = '<img src="images/%s" />' % image_name(src).strip('\x00') + elif c == 'Q': + empty = False + id = self.code_value(line) + text = '<span id="%s"></span>' % id + elif c == 'p': + empty = False + text = '<br /><br style="page-break-after: always;" />' + elif c == 'C': + # This should be made to create a TOC entry + line.read(1) + self.code_value(line) + elif c == 'n': + pass + elif c == 'F': + l = line.read(1) + if '%s%s' % (c, l) == 'Fn': + text = self.process_code_link(line, 'fns') + elif '%s%s' % (c, l) == 'FN': + text = self.process_code_div_span('FN', line) + elif c == 'w': + empty = False + text = '<hr width="%s" />' % self.code_value(line) + elif c == '-': + empty = False + text = '­' + elif c == '\\': + empty = False + text = '\\' + else: + if c != ' ': + empty = False + text = c + parsed.append(text) + c = line.read(1) + + if not empty: + text = self.end_line() + parsed.append(text) + output.append(u''.join(parsed)) + line.close() + + return u'\n'.join(output) + + +def pml_to_html(pml, close_all=False): + ''' + close_all will close div all div and span tags when one is closed and then + re-open the appropriate ones. + ''' + + hizer = PML_HTMLizer(close_all) + return hizer.parse_pml(pml) def footnote_sidebar_to_html(id, pml): if id.startswith('\x01'): From 07f9db1b2008410b6518dc7bb3800ecef030e42d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 22 Nov 2009 12:00:00 -0500 Subject: [PATCH 069/120] PML Input: Create Toc from CX tags. 
--- src/calibre/ebooks/pml/input.py | 20 ++++++++++++++------ src/calibre/ebooks/pml/pmlconverter.py | 17 ++++++++++++++--- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index ead6c988f4..eac2e99e05 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -11,7 +11,8 @@ import shutil from calibre.customize.conversion import InputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.utils.zipfile import ZipFile -from calibre.ebooks.pml.pmlconverter import pml_to_html +from calibre.ebooks.pml.pmlconverter import PML_HTMLizer +from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.opf2 import OPFCreator class PMLInput(InputFormatPlugin): @@ -22,7 +23,7 @@ class PMLInput(InputFormatPlugin): # pmlz is a zip file containing pml files and png images. file_types = set(['pml', 'pmlz']) - def process_pml(self, pml_path, html_path): + def process_pml(self, pml_path, html_path, close_all=False): pclose = False hclose = False @@ -44,7 +45,8 @@ class PMLInput(InputFormatPlugin): ienc = self.options.input_encoding self.log.debug('Converting PML to HTML...') - html = pml_to_html(pml_stream.read().decode(ienc)) + hizer = PML_HTMLizer(close_all) + html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path) html_stream.write('<html><head><title /></head><body>%s</body></html>' % html.encode('utf-8', 'replace')) if pclose: @@ -52,11 +54,14 @@ class PMLInput(InputFormatPlugin): if hclose: html_stream.close() + return hizer.get_toc() + def convert(self, stream, options, file_ext, log, accelerators): self.options = options self.log = log pages, images = [], [] + toc = TOC() if file_ext == 'pmlz': log.debug('De-compressing content to temporary directory...') @@ -71,7 +76,8 @@ class PMLInput(InputFormatPlugin): pages.append(html_name) log.debug('Processing PML item %s...' 
% pml) - self.process_pml(pml, html_path) + ttoc = self.process_pml(pml, html_path) + toc += ttoc imgs = glob.glob(os.path.join(tdir, '*.png')) if len(imgs) > 0: @@ -84,7 +90,7 @@ class PMLInput(InputFormatPlugin): shutil.move(img, pimg_path) else: - self.process_pml(stream, 'index.html') + toc = self.process_pml(stream, 'index.html') pages.append('index.html') images = [] @@ -103,7 +109,9 @@ class PMLInput(InputFormatPlugin): log.debug('Generating manifest...') opf.create_manifest(manifest_items) opf.create_spine(pages) + opf.set_toc(toc) with open('metadata.opf', 'wb') as opffile: - opf.render(opffile) + with open('toc.ncx', 'wb') as tocfile: + opf.render(opffile, tocfile, 'toc.ncx') return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index cb8ae15298..62227c94ea 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -12,6 +12,7 @@ import re import StringIO from calibre import my_unichr, prepare_string_for_xml +from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.pdb.ereader import image_name class PML_HTMLizer(object): @@ -118,6 +119,8 @@ class PML_HTMLizer(object): def __init__(self, close_all): self.close_all = close_all self.state = {} + self.toc = TOC() + self.file_name = '' def prepare_pml(self, pml): # Remove comments @@ -290,11 +293,14 @@ class PML_HTMLizer(object): return value.strip() - def parse_pml(self, pml): + def parse_pml(self, pml, file_name=''): pml = self.prepare_pml(pml) output = [] self.state = {} + self.toc = TOC() + self.file_name = file_name + for s in self.STATES: self.state[s] = [False, '']; @@ -350,8 +356,10 @@ class PML_HTMLizer(object): text = '<br /><br style="page-break-after: always;" />' elif c == 'C': # This should be made to create a TOC entry - line.read(1) - self.code_value(line) + l = line.read(1) + id = 'pml_toc-%s' % len(self.toc) + self.toc.add_item(self.file_name, id, 
self.code_value(line)) + text = '<span id="%s"></span>' % id elif c == 'n': pass elif c == 'F': @@ -384,6 +392,9 @@ class PML_HTMLizer(object): return u'\n'.join(output) + def get_toc(self): + return self.toc + def pml_to_html(pml, close_all=False): ''' From bee0c326e1af6186343878845e39ab20f44e9230 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 22 Nov 2009 12:04:29 -0500 Subject: [PATCH 070/120] ... --- src/calibre/ebooks/pml/pmlconverter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 62227c94ea..a8a7e9922b 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -355,8 +355,7 @@ class PML_HTMLizer(object): empty = False text = '<br /><br style="page-break-after: always;" />' elif c == 'C': - # This should be made to create a TOC entry - l = line.read(1) + line.read(1) id = 'pml_toc-%s' % len(self.toc) self.toc.add_item(self.file_name, id, self.code_value(line)) text = '<span id="%s"></span>' % id From 8d364272ffed49f0a068f39aa9382f12b2e429a4 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 25 Nov 2009 07:49:06 -0500 Subject: [PATCH 071/120] Improve PML conversion. 
--- src/calibre/ebooks/pml/pmlconverter.py | 42 ++++++++++++++++---------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index a8a7e9922b..5ef218e962 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -66,8 +66,7 @@ class PML_HTMLizer(object): 'u': ('<span style="text-decoration : underline;">', '</span>'), 'd': ('<span style="text-decoration: line-through;">', '</span>'), 'b': ('<span style="font-weight: bold;">', '</span>'), - 's': ('<span style="font-size: 75%">', '</span>'), - 'l': ('<span style="font-size: 125%">', '</span>'), + 'l': ('<span style="font-size: 150%">', '</span>'), 'FS': ('<div id="%s">', '</div>'), } @@ -91,7 +90,6 @@ class PML_HTMLizer(object): 'o': 'd', 'b': 'b', 'B': 'b', - 's': 's', 'l': 'l', 'Fn': 'a', 'Sd': 'a', @@ -108,7 +106,6 @@ class PML_HTMLizer(object): ] SPAN_STATES = [ - 's', 'l', 'i', 'u', @@ -144,6 +141,23 @@ class PML_HTMLizer(object): return line + def cleanup_html(self, html): + old = html + html = self.cleanup_html_remove_redundant(html) + while html != old: + old = html + html = self.cleanup_html_remove_redundant(html) + return html + + def cleanup_html_remove_redundant(self, html): + for key in self.STATES_TAGS.keys(): + open, close = self.STATES_TAGS[key] + if key in self.STATES_VALUE_REQ: + html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) + else: + html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html) + return html + def start_line(self): start = u'' @@ -213,17 +227,19 @@ class PML_HTMLizer(object): if code in self.DIV_STATES: ds = self.DIV_STATES[:] + ss = self.SPAN_STATES[:] elif code in self.SPAN_STATES: ds = self.SPAN_STATES[:] + ss = [] if self.state[code][0]: # Close all. - for c in ds: + for c in ss+ds: if self.state[c][0]: text += self.STATES_TAGS[c][1] # Reopen the based on state. 
del ds[ds.index(code)] - for c in ds: + for c in ds+ss: if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] @@ -333,16 +349,8 @@ class PML_HTMLizer(object): text = self.process_code_simple('%s%s' % (c, l)) elif c == 'q': text = self.process_code_link(line) - elif c in 'crtTiIuobB': + elif c in 'crtTiIuobBl': text = self.process_code_div_span(c, line) - elif c in 'sl': - close = u'' - if c == 's' and self.state['l']: - close = self.process_code_div_span('l', line) - if c == 'l' and self.state['s']: - close = self.process_code_div_span('s', line) - text = self.process_code_div_span(c, line) - text = close+text elif c == 'm': empty = False src = self.code_value(line) @@ -389,7 +397,9 @@ class PML_HTMLizer(object): output.append(u''.join(parsed)) line.close() - return u'\n'.join(output) + output = self.cleanup_html(u'\n'.join(output)) + + return output def get_toc(self): return self.toc From c254f63a0728d90d3aad80d0d7efa16d1c4912c2 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 27 Nov 2009 20:25:57 -0500 Subject: [PATCH 072/120] PML parser fixes. 
--- src/calibre/ebooks/pml/input.py | 2 +- src/calibre/ebooks/pml/pmlconverter.py | 118 ++++++++++++------------- 2 files changed, 56 insertions(+), 64 deletions(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index eac2e99e05..ad37494ff7 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -45,7 +45,7 @@ class PMLInput(InputFormatPlugin): ienc = self.options.input_encoding self.log.debug('Converting PML to HTML...') - hizer = PML_HTMLizer(close_all) + hizer = PML_HTMLizer() html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path) html_stream.write('<html><head><title /></head><body>%s</body></html>' % html.encode('utf-8', 'replace')) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 5ef218e962..f4a959b3fc 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -62,11 +62,11 @@ class PML_HTMLizer(object): 'r': ('<div style="text-align: right;">', '</div>'), 't': ('<div style="margin-left: 5%;">', '</div>'), 'T': ('<div style="margin-left: %s;">', '</div>'), - 'i': ('<span style="font-style : italic;">', '</span>'), - 'u': ('<span style="text-decoration : underline;">', '</span>'), + 'i': ('<span style="font-style: italic;">', '</span>'), + 'u': ('<span style="text-decoration: underline;">', '</span>'), 'd': ('<span style="text-decoration: line-through;">', '</span>'), 'b': ('<span style="font-weight: bold;">', '</span>'), - 'l': ('<span style="font-size: 150%">', '</span>'), + 'l': ('<span style="font-size: 150%;">', '</span>'), 'FS': ('<div id="%s">', '</div>'), } @@ -113,8 +113,7 @@ class PML_HTMLizer(object): 'b', ] - def __init__(self, close_all): - self.close_all = close_all + def __init__(self): self.state = {} self.toc = TOC() self.file_name = '' @@ -156,6 +155,7 @@ class PML_HTMLizer(object): html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) else: html = re.sub(r'(?u)%s\s*%s' % 
(open, close), '', html) + html = re.sub(r'<p>\s*</p>', '', html) return html def start_line(self): @@ -173,11 +173,22 @@ class PML_HTMLizer(object): def end_line(self): end = u'' + div = [] + span = [] + other = [] + for key, val in self.state.items(): if val[0]: if key == 'T': self.state['T'][0] = False - end += self.STATES_TAGS[key][1] + elif key in self.DIV_STATES: + div.append(key) + elif key in self.SPAN_STATES: + span.append(key) + else: + other.append(key) + for key in span+div+other: + end += self.STATES_TAGS[key][1] return u'%s</p>' % end @@ -214,12 +225,6 @@ class PML_HTMLizer(object): return text def process_code_div_span(self, code, stream): - if self.close_all: - return self.process_code_div_span_call(code, stream) - else: - return self.process_code_div_span_ind(code, stream) - - def process_code_div_span_ind(self, code, stream): text = u'' ds = [] @@ -246,47 +251,24 @@ class PML_HTMLizer(object): else: text += self.STATES_TAGS[c][0] else: - if code in self.STATES_VALUE_REQ: - val = self.code_value(stream) - text = self.STATES_TAGS[code][0] % val - self.state[code][1] = val - else: - text = self.STATES_TAGS[code][0] - - self.state[code][0] = not self.state[code][0] - - return text - - def process_code_div_span_call(self, code, stream): - text = u'' - divs = self.DIV_STATES[:] - spans = self.SPAN_STATES[:] - - code = self.CODE_STATES[code] - - if self.state[code][0]: - # Close all divs then spans. - for c in spans+divs: + # Close all spans if code is a div + for c in ss: if self.state[c][0]: text += self.STATES_TAGS[c][1] - # Reopen the based on state. 
Open divs then spans - if code in self.DIV_STATES: - del divs[divs.index(code)] - if code in self.SPAN_STATES: - del spans[spans.index(code)] - for c in divs+spans: + # Process the code + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text += self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text += self.STATES_TAGS[code][0] + # Re-open all spans if code was a div based on state + for c in ss: if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: text += self.STATES_TAGS[c][0] - else: - if code in self.STATES_VALUE_REQ: - val = self.code_value(stream) - text = self.STATES_TAGS[code][0] % val - self.state[code][1] = val - else: - text = self.STATES_TAGS[code][0] self.state[code][0] = not self.state[code][0] @@ -294,19 +276,32 @@ class PML_HTMLizer(object): def code_value(self, stream): value = u'' - open = False + # state 0 is before = + # state 1 is before the first " + # state 2 is before the second " + state = 0 + loc = stream.tell() c = stream.read(1) while c != '': - if open and c != '"': - value += c - if c == '"': - if not open: - open = True - else: + if state == 0: + if c == '=': + state = 1 + elif state == 1: + if c == '"': + state = 2 + elif state == 2: + if c == '"': + state = 3 break + else: + value += c c = stream.read(1) + if state != 3: + stream.seek(loc) + value = u'' + return value.strip() def parse_pml(self, pml, file_name=''): @@ -321,13 +316,15 @@ class PML_HTMLizer(object): self.state[s] = [False, '']; for line in pml.splitlines(): - if not line: - continue parsed = [] empty = True + line = self.prepare_line(line) + if not line: + continue + # Must use StringIO, cStringIO does not support unicode - line = StringIO.StringIO(self.prepare_line(line)) + line = StringIO.StringIO(line) parsed.append(self.start_line()) c = line.read(1) @@ -405,13 +402,8 @@ class PML_HTMLizer(object): return self.toc -def pml_to_html(pml, close_all=False): - 
''' - close_all will close div all div and span tags when one is closed and then - re-open the appropriate ones. - ''' - - hizer = PML_HTMLizer(close_all) +def pml_to_html(pml): + hizer = PML_HTMLizer() return hizer.parse_pml(pml) def footnote_sidebar_to_html(id, pml): From fe9f40e4a63b77d654f2742b32b7443445d0d42f Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 07:56:35 -0500 Subject: [PATCH 073/120] k and T code patches from WayneD. --- src/calibre/ebooks/pml/pmlconverter.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index f4a959b3fc..ebb451a14b 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -36,6 +36,7 @@ class PML_HTMLizer(object): 't', 's', 'l', + 'k', 'T', 'Fn', 'Sd', @@ -67,6 +68,7 @@ class PML_HTMLizer(object): 'd': ('<span style="text-decoration: line-through;">', '</span>'), 'b': ('<span style="font-weight: bold;">', '</span>'), 'l': ('<span style="font-size: 150%;">', '</span>'), + 'k': ('<span style="font-size: 75%;">', '</span>'), 'FS': ('<div id="%s">', '</div>'), } @@ -91,6 +93,7 @@ class PML_HTMLizer(object): 'b': 'b', 'B': 'b', 'l': 'l', + 'k': 'k', 'Fn': 'a', 'Sd': 'a', 'FN': 'FS', @@ -107,6 +110,7 @@ class PML_HTMLizer(object): SPAN_STATES = [ 'l', + 'k', 'i', 'u', 'd', @@ -125,11 +129,11 @@ class PML_HTMLizer(object): pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) - pml = prepare_string_for_xml(pml) - pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % 
match.group('num'), pml) pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml) + pml = prepare_string_for_xml(pml) + return pml def prepare_line(self, line): @@ -181,7 +185,7 @@ class PML_HTMLizer(object): if val[0]: if key == 'T': self.state['T'][0] = False - elif key in self.DIV_STATES: + if key in self.DIV_STATES: div.append(key) elif key in self.SPAN_STATES: span.append(key) @@ -238,6 +242,11 @@ class PML_HTMLizer(object): ss = [] if self.state[code][0]: + # Ignore multilple T's on the same line. They do not have a closing + # code. They get closed at the end of the line. + if code == 'T': + self.code_value(stream) + return text # Close all. for c in ss+ds: if self.state[c][0]: @@ -346,7 +355,7 @@ class PML_HTMLizer(object): text = self.process_code_simple('%s%s' % (c, l)) elif c == 'q': text = self.process_code_link(line) - elif c in 'crtTiIuobBl': + elif c in 'crtTiIuobBlk': text = self.process_code_div_span(c, line) elif c == 'm': empty = False @@ -384,7 +393,10 @@ class PML_HTMLizer(object): else: if c != ' ': empty = False - text = c + if self.state['k'][0]: + text = c.upper() + else: + text = c parsed.append(text) c = line.read(1) From 94a11a8c56fd02fa065c3e0800df20d37aaa6cfa Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 08:08:06 -0500 Subject: [PATCH 074/120] Fix bug when handling links. 
--- src/calibre/ebooks/pml/pmlconverter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index ebb451a14b..d085b0dc2d 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -214,13 +214,12 @@ class PML_HTMLizer(object): def process_code_link(self, stream, pre=''): text = u'' - href = self.code_value(stream) - if pre: - href = '#%s-%s' % (pre, href) - if self.state['a'][0]: text = self.STATES_TAGS['a'][1] else: + href = self.code_value(stream) + if pre: + href = '#%s-%s' % (pre, href) text = self.STATES_TAGS['a'][0] % href self.state['a'][1] = href From a4847d88d9ffb8562b6f8cf601caab73968e6f99 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 08:32:08 -0500 Subject: [PATCH 075/120] PML input: Mark <a> as block level element. --- src/calibre/ebooks/pml/pmlconverter.py | 27 +++++++------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index d085b0dc2d..2416be596a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -101,6 +101,7 @@ class PML_HTMLizer(object): } DIV_STATES = [ + 'a', 'c', 'r', 't', @@ -211,23 +212,7 @@ class PML_HTMLizer(object): return text - def process_code_link(self, stream, pre=''): - text = u'' - - if self.state['a'][0]: - text = self.STATES_TAGS['a'][1] - else: - href = self.code_value(stream) - if pre: - href = '#%s-%s' % (pre, href) - text = self.STATES_TAGS['a'][0] % href - self.state['a'][1] = href - - self.state['a'][0] = not self.state['a'][0] - - return text - - def process_code_div_span(self, code, stream): + def process_code_div_span(self, code, stream, pre=''): text = u'' ds = [] @@ -266,6 +251,8 @@ class PML_HTMLizer(object): # Process the code if code in self.STATES_VALUE_REQ: val = 
self.code_value(stream) + if pre: + val = '#%s-%s' % (pre, val) text += self.STATES_TAGS[code][0] % val self.state[code][1] = val else: @@ -347,13 +334,13 @@ class PML_HTMLizer(object): elif c in 'XS': l = line.read(1) if '%s%s' % (c, l) == 'Sd': - text = self.process_code_link(line, 'fns') + text = self.process_code_div_span('Sd', line, 'fns') elif '%s%s' % (c, l) == 'SB': text = self.process_code_div_span('SB', line) else: text = self.process_code_simple('%s%s' % (c, l)) elif c == 'q': - text = self.process_code_link(line) + text = self.process_code_div_span(c, line) elif c in 'crtTiIuobBlk': text = self.process_code_div_span(c, line) elif c == 'm': @@ -377,7 +364,7 @@ class PML_HTMLizer(object): elif c == 'F': l = line.read(1) if '%s%s' % (c, l) == 'Fn': - text = self.process_code_link(line, 'fns') + text = self.process_code_div_span('Fn', line, 'fns') elif '%s%s' % (c, l) == 'FN': text = self.process_code_div_span('FN', line) elif c == 'w': From fc93d954e5e6bbd8d01401a7ef22d4eef238f56a Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 09:41:23 -0500 Subject: [PATCH 076/120] PML input: cleanup and refactor a bit. Also fix an error when handling a tags. 
--- src/calibre/ebooks/pml/pmlconverter.py | 107 +++++++++++++++++++------ 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 2416be596a..8c34cc8da9 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -100,8 +100,17 @@ class PML_HTMLizer(object): 'SB': 'FS', } - DIV_STATES = [ + BLOCK_STATES = [ 'a', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + ] + + DIV_STATES = [ 'c', 'r', 't', @@ -197,22 +206,10 @@ class PML_HTMLizer(object): return u'%s</p>' % end - def process_code_simple(self, code): - if code not in self.CODE_STATES.keys(): - return u'' - - text = u'' - - if self.state[self.CODE_STATES[code]][0]: - text = self.STATES_TAGS[self.CODE_STATES[code]][1] - else: - text = self.STATES_TAGS[self.CODE_STATES[code]][0] - - self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] - - return text - - def process_code_div_span(self, code, stream, pre=''): + def process_code(self, code, stream): + ''' + Used for processing div and span elements. + ''' text = u'' ds = [] @@ -225,6 +222,10 @@ class PML_HTMLizer(object): ds = self.SPAN_STATES[:] ss = [] + # Close code. + # Close all tags starting with the inline then close block. Remove the + # Tag that is closed from the list and reopen them all starting with + # block followed by inline. if self.state[code][0]: # Ignore multilple T's on the same line. They do not have a closing # code. They get closed at the end of the line. @@ -243,16 +244,17 @@ class PML_HTMLizer(object): text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: text += self.STATES_TAGS[c][0] + # Open code. + # If the tag to open is a block we close all inline tags, open the block + # then re-open the inline tags. else: - # Close all spans if code is a div + # Close all spans if code is a div. 
for c in ss: if self.state[c][0]: text += self.STATES_TAGS[c][1] # Process the code if code in self.STATES_VALUE_REQ: val = self.code_value(stream) - if pre: - val = '#%s-%s' % (pre, val) text += self.STATES_TAGS[code][0] % val self.state[code][1] = val else: @@ -269,6 +271,59 @@ class PML_HTMLizer(object): return text + def process_code_block(self, code, stream, pre=''): + text = u'' + + code = self.CODE_STATES[code] + + # Close all spans + for c in self.SPAN_STATES: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + + # Process the code + if self.state[code][0]: + # Close tag + text += self.STATES_TAGS[code][1] + else: + # Open tag + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + if pre: + val = '#%s-%s' % (pre, val) + text += self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text += self.STATES_TAGS[code][0] + + # Re-open all spans if code was a div based on state + for c in self.SPAN_STATES: + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[code][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + + self.state[code][0] = not self.state[code][0] + + return text + + + def process_code_simple(self, code): + if code not in self.CODE_STATES.keys(): + return u'' + + text = u'' + + if self.state[self.CODE_STATES[code]][0]: + text = self.STATES_TAGS[self.CODE_STATES[code]][1] + else: + text = self.STATES_TAGS[self.CODE_STATES[code]][0] + + self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] + + return text + def code_value(self, stream): value = u'' # state 0 is before = @@ -334,15 +389,15 @@ class PML_HTMLizer(object): elif c in 'XS': l = line.read(1) if '%s%s' % (c, l) == 'Sd': - text = self.process_code_div_span('Sd', line, 'fns') + text = self.process_code_block('Sd', line, 'fns') elif '%s%s' % (c, l) == 'SB': - text = self.process_code_div_span('SB', line) + text = self.process_code('SB', line) else: text = self.process_code_simple('%s%s' % 
(c, l)) elif c == 'q': - text = self.process_code_div_span(c, line) + text = self.process_code_block(c, line) elif c in 'crtTiIuobBlk': - text = self.process_code_div_span(c, line) + text = self.process_code(c, line) elif c == 'm': empty = False src = self.code_value(line) @@ -364,9 +419,9 @@ class PML_HTMLizer(object): elif c == 'F': l = line.read(1) if '%s%s' % (c, l) == 'Fn': - text = self.process_code_div_span('Fn', line, 'fns') + text = self.process_code_block('Fn', line, 'fns') elif '%s%s' % (c, l) == 'FN': - text = self.process_code_div_span('FN', line) + text = self.process_code('FN', line) elif c == 'w': empty = False text = '<hr width="%s" />' % self.code_value(line) From 32895741531b6350213cf3dcb1c1112aaf4e9952 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 14:13:54 -0500 Subject: [PATCH 077/120] PML input: don't turn HTML entities in the PML into the character. --- src/calibre/ebooks/pml/pmlconverter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 8c34cc8da9..7707325131 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -139,6 +139,10 @@ class PML_HTMLizer(object): pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + # Convert &'s into entities so &amp; in the text doesn't get turned into + # &.
It will display as &amp; + pml = pml.replace('&', '&amp;') + + pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml) pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml) From 27935c4a71636393b0a4d454b462bd0fdf8a7508 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 14:57:47 -0500 Subject: [PATCH 078/120] PML Input refactoring. --- src/calibre/ebooks/pml/pmlconverter.py | 70 +++++++++++++------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 7707325131..874b39223a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -50,7 +50,7 @@ class PML_HTMLizer(object): ] STATES_TAGS = { - 'h1': ('<h1 style="page-break-after: always;">', '</h1>'), + 'h1': ('<h1 style="page-break-before: always;">', '</h1>'), 'h2': ('<h2>', '</h2>'), 'h3': ('<h3>', '</h3>'), 'h4': ('<h4>', '</h4>'), @@ -108,6 +108,8 @@ class PML_HTMLizer(object): 'h4', 'h5', 'h6', + 'sb', + 'sp', ] DIV_STATES = [ @@ -135,6 +137,13 @@ class PML_HTMLizer(object): def prepare_pml(self, pml): # Remove comments pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml) + + # Remove extra white spaces.
+ pml = re.sub(r'(?mus)[ ]{2,}', ' ', pml) + pml = re.sub(r'(?mus)^[ ]*(?=.)', '', pml) + pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml) + pml = re.sub(r'(?mus)^[ ]*$', '', pml) + # Footnotes and Sidebars pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) @@ -150,14 +159,6 @@ class PML_HTMLizer(object): return pml - def prepare_line(self, line): - line = re.sub(r'[ ]{2,}', ' ', line) - line = re.sub(r'^[ ]*(?=.)', '', line) - line = re.sub(r'(?<=.)[ ]*$', '', line) - line = re.sub(r'^[ ]*$', '', line) - - return line - def cleanup_html(self, html): old = html html = self.cleanup_html_remove_redundant(html) @@ -217,7 +218,9 @@ class PML_HTMLizer(object): text = u'' ds = [] - code = self.CODE_STATES[code] + code = self.CODE_STATES.get(code, None) + if not code: + return text if code in self.DIV_STATES: ds = self.DIV_STATES[:] @@ -278,7 +281,9 @@ class PML_HTMLizer(object): def process_code_block(self, code, stream, pre=''): text = u'' - code = self.CODE_STATES[code] + code = self.CODE_STATES.get(code, None) + if not code: + return text # Close all spans for c in self.SPAN_STATES: @@ -312,27 +317,12 @@ class PML_HTMLizer(object): return text - - def process_code_simple(self, code): - if code not in self.CODE_STATES.keys(): - return u'' - - text = u'' - - if self.state[self.CODE_STATES[code]][0]: - text = self.STATES_TAGS[self.CODE_STATES[code]][1] - else: - text = self.STATES_TAGS[self.CODE_STATES[code]][0] - - self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] - - return text - def code_value(self, stream): value = u'' # state 0 is before = # state 1 is before the first " # state 2 is 
before the second " + # state 3 is after the second " state = 0 loc = stream.tell() @@ -341,6 +331,13 @@ class PML_HTMLizer(object): if state == 0: if c == '=': state = 1 + elif c != ' ': + # A code that requires an argument should have = after the + # code but sometimes has spaces. If it has anything other + # than a space or = after the code then we can assume the + # markup is invalid. We will stop looking for the value + # and continue to hopefully not lose any data. + break; elif state == 1: if c == '"': state = 2 @@ -353,6 +350,8 @@ class PML_HTMLizer(object): c = stream.read(1) if state != 3: + # Unable to complete the sequence to reterieve the value. Reset + # the stream to the location it started. stream.seek(loc) value = u'' @@ -370,13 +369,12 @@ class PML_HTMLizer(object): self.state[s] = [False, '']; for line in pml.splitlines(): - parsed = [] - empty = True - - line = self.prepare_line(line) if not line: continue + parsed = [] + empty = True + # Must use StringIO, cStringIO does not support unicode line = StringIO.StringIO(line) parsed.append(self.start_line()) @@ -389,15 +387,15 @@ class PML_HTMLizer(object): c = line.read(1) if c == 'x': - text = self.process_code_simple(c) + text = self.process_code_block(c, line) elif c in 'XS': l = line.read(1) - if '%s%s' % (c, l) == 'Sd': - text = self.process_code_block('Sd', line, 'fns') - elif '%s%s' % (c, l) == 'SB': + if '%s%s' % (c, l) == 'SB': text = self.process_code('SB', line) + elif '%s%s' % (c, l) == 'Sd': + text = self.process_code_block('Sd', line, 'fns') else: - text = self.process_code_simple('%s%s' % (c, l)) + text = self.process_code_block('%s%s' % (c, l), line) elif c == 'q': text = self.process_code_block(c, line) elif c in 'crtTiIuobBlk': From 71c4beccfc230292341c291f57497c1482622db5 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 28 Nov 2009 15:41:24 -0500 Subject: [PATCH 079/120] PML Input: more refactoring. 
--- src/calibre/ebooks/pml/pmlconverter.py | 130 +++++++++++++++---------- 1 file changed, 81 insertions(+), 49 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 874b39223a..e6c352e2b5 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -211,52 +211,68 @@ class PML_HTMLizer(object): return u'%s</p>' % end - def process_code(self, code, stream): - ''' - Used for processing div and span elements. - ''' + def process_code(self, code, stream, pre=''): text = u'' - ds = [] code = self.CODE_STATES.get(code, None) if not code: return text if code in self.DIV_STATES: - ds = self.DIV_STATES[:] - ss = self.SPAN_STATES[:] - elif code in self.SPAN_STATES: - ds = self.SPAN_STATES[:] - ss = [] - - # Close code. - # Close all tags starting with the inline then close block. Remove the - # Tag that is closed from the list and reopen them all starting with - # block followed by inline. - if self.state[code][0]: # Ignore multilple T's on the same line. They do not have a closing # code. They get closed at the end of the line. - if code == 'T': + if code == 'T' and self.state['T'][0]: self.code_value(stream) return text + text = self.process_code_div(code, stream) + elif code in self.SPAN_STATES: + text = self.process_code_span(code, stream) + elif code in self.BLOCK_STATES: + text = self.process_code_block(code, stream, pre) + else: + text = self.process_code_simple(code) + + self.state[code][0] = not self.state[code][0] + + return text + + def process_code_simple(self, code): + text = u'' + + if self.state[code][0]: + text = self.STATES_TAGS[code][1] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text += self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + return text + + def process_code_div(self, code, stream): + text = u'' + + # Close code. + if self.state[code][0]: # Close all. 
- for c in ss+ds: + for c in self.SPAN_STATES+self.DIV_STATES: if self.state[c][0]: text += self.STATES_TAGS[c][1] # Reopen the based on state. - del ds[ds.index(code)] - for c in ds+ss: + for c in self.DIV_STATES+self.SPAN_STATES: + if code == c: + continue if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: text += self.STATES_TAGS[c][0] # Open code. - # If the tag to open is a block we close all inline tags, open the block - # then re-open the inline tags. else: - # Close all spans if code is a div. - for c in ss: + # Close all spans. + for c in self.SPAN_STATES: if self.state[c][0]: text += self.STATES_TAGS[c][1] # Process the code @@ -266,25 +282,49 @@ class PML_HTMLizer(object): self.state[code][1] = val else: text += self.STATES_TAGS[code][0] - # Re-open all spans if code was a div based on state - for c in ss: + # Re-open all spans based on state + for c in self.SPAN_STATES: if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: text += self.STATES_TAGS[c][0] - self.state[code][0] = not self.state[code][0] + return text + + def process_code_span(self, code, stream): + text = u'' + + # Close code. + if self.state[code][0]: + # Close all spans + for c in self.SPAN_STATES: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Re-open the spans based on state except for code which will be + # left closed. + for c in self.SPAN_STATES: + if code == c: + continue + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[code][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + # Open code. 
+ else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text += self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text += self.STATES_TAGS[code][0] return text def process_code_block(self, code, stream, pre=''): text = u'' - code = self.CODE_STATES.get(code, None) - if not code: - return text - # Close all spans for c in self.SPAN_STATES: if self.state[c][0]: @@ -313,8 +353,6 @@ class PML_HTMLizer(object): else: text += self.STATES_TAGS[c][0] - self.state[code][0] = not self.state[code][0] - return text def code_value(self, stream): @@ -386,20 +424,20 @@ class PML_HTMLizer(object): if c == '\\': c = line.read(1) - if c == 'x': - text = self.process_code_block(c, line) - elif c in 'XS': + if c in 'xqcrtTiIuobBlk': + text = self.process_code(c, line) + elif c in 'FSX': l = line.read(1) - if '%s%s' % (c, l) == 'SB': + if '%s%s' % (c, l) == 'Fn': + text = self.process_code('Fn', line, 'fns') + elif '%s%s' % (c, l) == 'FN': + text = self.process_code('FN', line) + elif '%s%s' % (c, l) == 'SB': text = self.process_code('SB', line) elif '%s%s' % (c, l) == 'Sd': - text = self.process_code_block('Sd', line, 'fns') + text = self.process_code('Sd', line, 'fns') else: - text = self.process_code_block('%s%s' % (c, l), line) - elif c == 'q': - text = self.process_code_block(c, line) - elif c in 'crtTiIuobBlk': - text = self.process_code(c, line) + text = self.process_code('%s%s' % (c, l), line) elif c == 'm': empty = False src = self.code_value(line) @@ -418,12 +456,6 @@ class PML_HTMLizer(object): text = '<span id="%s"></span>' % id elif c == 'n': pass - elif c == 'F': - l = line.read(1) - if '%s%s' % (c, l) == 'Fn': - text = self.process_code_block('Fn', line, 'fns') - elif '%s%s' % (c, l) == 'FN': - text = self.process_code('FN', line) elif c == 'w': empty = False text = '<hr width="%s" />' % self.code_value(line) From 63e2876f76ddfd5ee7f55c4d4d6c45873d0ae556 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: 
Sat, 28 Nov 2009 17:34:30 -0500 Subject: [PATCH 080/120] PML Input: break if opening quote is not found when getting tag value. --- src/calibre/ebooks/pml/pmlconverter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index e6c352e2b5..cd5bafa260 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -375,10 +375,14 @@ class PML_HTMLizer(object): # than a space or = after the code then we can assume the # markup is invalid. We will stop looking for the value # and continue to hopefully not lose any data. - break; + break elif state == 1: if c == '"': state = 2 + elif c != ' ': + # " should always follow = but we will allow for blank + # space after the =. + break elif state == 2: if c == '"': state = 3 From 65d59ebc935f3aa25112e5e214c674ed4d7ebe46 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 1 Dec 2009 06:50:03 -0500 Subject: [PATCH 081/120] Correct import. 
--- src/calibre/gui2/convert/gui_conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/convert/gui_conversion.py b/src/calibre/gui2/convert/gui_conversion.py index 06ade752d1..c740fb5c1c 100644 --- a/src/calibre/gui2/convert/gui_conversion.py +++ b/src/calibre/gui2/convert/gui_conversion.py @@ -4,9 +4,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -from calibre.ebooks.conversion.plumber import Plumber, DummyReporter +from calibre.ebooks.conversion.plumber import Plumber from calibre.utils.logging import Log -from calibre.customize.conversion import OptionRecommendation +from calibre.customize.conversion import OptionRecommendation, DummyReporter def gui_convert(input, output, recommendations, notification=DummyReporter(), abort_after_input_dump=False): From 990f4f3bc46fe457a75a65a2646fa08a3b4efe70 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 2 Dec 2009 19:18:14 -0500 Subject: [PATCH 082/120] PML Input: image_name is unnecessary. 
--- src/calibre/ebooks/pml/pmlconverter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index e495c24042..c120f2faf9 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -13,7 +13,6 @@ import StringIO from calibre import my_unichr, prepare_string_for_xml from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.pdb.ereader import image_name class PML_HTMLizer(object): @@ -445,7 +444,7 @@ class PML_HTMLizer(object): elif c == 'm': empty = False src = self.code_value(line) - text = '<img src="images/%s" />' % image_name(src).strip('\x00') + text = '<img src="images/%s" />' % src elif c == 'Q': empty = False id = self.code_value(line) From 44ba14e77be9a9d17e80336bd0d6bf1e7f810d77 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 2 Dec 2009 20:05:54 -0500 Subject: [PATCH 083/120] PML Input: Allow for images to be in top level, bookname_img, or images directory for both PML and PMLZ. --- src/calibre/ebooks/pml/input.py | 44 +++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index ad37494ff7..b18630c044 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -1,3 +1,4 @@ +import os.path # -*- coding: utf-8 -*- __license__ = 'GPL v3' @@ -56,6 +57,32 @@ class PMLInput(InputFormatPlugin): return hizer.get_toc() + def get_images(self, stream, tdir, top_level=False): + images = [] + imgs = [] + + if top_level: + imgs = glob.glob(os.path.join(tdir, '*.png')) + # Images not in top level try bookname_img directory because + # that's where Dropbook likes to see them. 
+ if not imgs: + if hasattr(stream, 'name'): + imgs = glob.glob(os.path.join(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img'), '*.png')) + # No images in Dropbook location try generic images directory + if not imgs: + imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png')) + if imgs: + os.makedirs(os.path.join(os.getcwd(), 'images')) + for img in imgs: + pimg_name = os.path.basename(img) + pimg_path = os.path.join(os.getcwd(), 'images', pimg_name) + + images.append('images/' + pimg_name) + + shutil.copy(img, pimg_path) + + return images + def convert(self, stream, options, file_ext, log, accelerators): self.options = options @@ -78,22 +105,13 @@ class PMLInput(InputFormatPlugin): log.debug('Processing PML item %s...' % pml) ttoc = self.process_pml(pml, html_path) toc += ttoc - - imgs = glob.glob(os.path.join(tdir, '*.png')) - if len(imgs) > 0: - os.makedirs(os.path.join(os.getcwd(), 'images')) - for img in imgs: - pimg_name = os.path.basename(img) - pimg_path = os.path.join(os.getcwd(), 'images', pimg_name) - - images.append('images/' + pimg_name) - - shutil.move(img, pimg_path) + images = self.get_images(stream, tdir, True) else: toc = self.process_pml(stream, 'index.html') - pages.append('index.html') - images = [] + + if hasattr(stream, 'name'): + images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name))) # We want pages to be orded alphabetically. pages.sort() From d19848184dccffcb1d3bff6a43a6743fad30fbe5 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 07:18:37 -0500 Subject: [PATCH 084/120] PML Output: Only create \a and \U tags for supported characters. 
--- src/calibre/ebooks/pml/__init__.py | 60 ++++++++++++++++++++++++++++++ src/calibre/ebooks/pml/pmlml.py | 6 ++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pml/__init__.py b/src/calibre/ebooks/pml/__init__.py index e69de29bb2..9bda82bafb 100644 --- a/src/calibre/ebooks/pml/__init__.py +++ b/src/calibre/ebooks/pml/__init__.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +# Uncommon Characters supported by PML. \\a tag codes +A_CHARS = range(160, 256) + range(130, 136) + range(138, 141) + \ + range(145, 152) + range(153, 157) + [159] + +# Extended Unicode characters supported by PML +Latin_ExtendedA = range(0x0100, 0x0104) + [0x0105, 0x0107, 0x010C, 0x010D, + 0x0112, 0x0113, 0x0115, 0x0117, 0x0119, 0x011B, 0x011D, 0x011F, 0x012A, + 0x012B, 0x012D, 0x012F, 0x0131, 0x0141, 0x0142, 0x0144, 0x0148] + \ + range(0x014B, 0x014E) + [0x014F, 0x0151, 0x0155] + range(0x0159, 0x015C) + \ + [0x015F, 0x0163, 0x0169, 0x016B, 0x016D, 0x0177, 0x017A, 0x017D, 0x017E] +Latin_ExtendedB = [0x01BF, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01E1, 0x01E3, + 0x01E7, 0x01EB, 0x01F0, 0x0207, 0x021D, 0x0227, 0x022F, 0x0233] +IPA_Extensions = [0x0251, 0x0251, 0x0254, 0x0259, 0x025C, 0x0265, 0x026A, + 0x0272, 0x0283, 0x0289, 0x028A, 0x028C, 0x028F, 0x0292, 0x0294, 0x029C] +Spacing_Modifier_Letters = [0x02BE, 0x02BF, 0x02C7, 0x02C8, 0x02CC, 0x02D0, + 0x02D8, 0x02D9] +Greek_and_Coptic = range(0x0391, 0x03A2) + range(0x03A3, 0x03AA) + \ + range(0x03B1, 0x03CA) + [0x03D1, 0x03DD] +Hebrew = range(0x05D0, 0x05EB) +Latin_Extended_Additional = [0x1E0B, 0x1E0D, 0x1E17, 0x1E22, 0x1E24, 0x1E25, + 0x1E2B, 0x1E33, 0x1E37, 0x1E41, 0x1E43, 0x1E45, 0x1E47, 0x1E53] + \ + range(0x1E59, 0x1E5C) + [0x1E61, 0x1E63, 0x1E6B, 0x1E6D, 0x1E6F, 0x1E91, + 0x1E93, 0x1E96, 0x1EA1, 0x1ECD, 0x1EF9] +General_Punctuation = [0x2011, 0x2038, 0x203D, 0x2042] +Arrows = [0x2190, 
0x2192] +Mathematical_Operators = [0x2202, 0x221A, 0x221E, 0x2225, 0x222B, 0x2260, + 0x2294, 0x2295, 0x22EE] +Enclosed_Alphanumerics = [0x24CA] +Miscellaneous_Symbols = range(0x261C, 0x2641) + range(0x2642, 0x2648) + \ + range(0x2660, 0x2664) + range(0x266D, 0x2670) +Dingbats = [0x2713, 0x2720] +Private_Use_Area = range(0xE000, 0xE01D) + range(0xE01E, 0xE029) + \ + range(0xE02A, 0xE052) +Alphabetic_Presentation_Forms = [0xFB02, 0xFB2A, 0xFB2B] + +# \\U tag codes. +U_CHARS = Latin_ExtendedA + Latin_ExtendedB + IPA_Extensions + \ + Spacing_Modifier_Letters + Greek_and_Coptic + Hebrew + \ + Latin_Extended_Additional + General_Punctuation + Arrows + \ + Mathematical_Operators + Enclosed_Alphanumerics + Miscellaneous_Symbols + \ + Dingbats + Private_Use_Area + Alphabetic_Presentation_Forms + +def unipmlcode(char): + try: + val = ord(char.encode('cp1252')) + if val in A_CHARS: + return '\\a%i' % val + except: + pass + val = ord(char) + if val in U_CHARS: + return '\\U%04x'.upper() % val + else: + return '?' 
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index aa608496c7..b40870c0b5 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -13,6 +13,7 @@ import re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.pdb.ereader import image_name +from calibre.ebooks.pml import unipmlcode from calibre import entity_to_unicode TAG_MAP = { @@ -163,8 +164,9 @@ class PMLMLizer(object): mo = re.search('(%s)' % entity[1:-1], text) text = text.replace(entity, entity_to_unicode(mo)) - # Turn all unicode characters into their PML hex equivelent - text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text) + # Turn all characters that cannot be represented by themself into their + # PML code equivelent + text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text) # Remove excess spaces at beginning and end of lines text = re.sub('(?m)^[ ]+', '', text) From 113b4c8d090db091938ea08d0cabe30d07beac36 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 07:48:19 -0500 Subject: [PATCH 085/120] PML Meatadata: Read cover. 
--- src/calibre/ebooks/metadata/pml.py | 23 +++++++++++++++++++++++ src/calibre/ebooks/pml/input.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/pml.py b/src/calibre/ebooks/metadata/pml.py index 57ca29172a..dff791cb0b 100644 --- a/src/calibre/ebooks/metadata/pml.py +++ b/src/calibre/ebooks/metadata/pml.py @@ -28,8 +28,12 @@ def get_metadata(stream, extract_cover=True): for p in pmls: with open(p, 'r+b') as p_stream: pml += p_stream.read() + if extract_cover: + mi.cover_data = get_cover(os.path.splitext(os.path.basename(stream.name))[0], tdir, True) else: pml = stream.read() + if extract_cover: + mi.cover_data = get_cover(os.path.splitext(os.path.basename(stream.name))[0], os.path.abspath(os.path.dirname(stream.name))) for comment in re.findall(r'(?mus)\\v.*?\\v', pml): m = re.search(r'TITLE="(.*?)"', comment) @@ -51,3 +55,22 @@ def get_metadata(stream, extract_cover=True): mi.isbn = m.group(1).strip().decode('cp1252', 'replace') return mi + +def get_cover(name, tdir, top_level=False): + cover_path = [] + cover_data = None + + if top_level: + cover_path = glob.glob(os.path.join(tdir, 'cover.png')) + # Images not in top level try bookname_img directory because + # that's where Dropbook likes to see them. + if not cover_path: + cover_path = glob.glob(os.path.join(tdir, name + '_img', 'cover.png')) + # No images in Dropbook location try generic images directory + if not cover_path: + cover_path = glob.glob(os.path.join(os.path.join(tdir, 'images'), 'cover.png')) + if cover_path: + with open(cover_path[0], 'r+b') as cstream: + cover_data = cstream.read() + + return ('png', cover_data) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index b18630c044..c88a4f947d 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -67,7 +67,7 @@ class PMLInput(InputFormatPlugin): # that's where Dropbook likes to see them. 
if not imgs: if hasattr(stream, 'name'): - imgs = glob.glob(os.path.join(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img'), '*.png')) + imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png')) # No images in Dropbook location try generic images directory if not imgs: imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png')) From 0a384ba93bdadec5de9e491b0361750c02b19728 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 18:11:05 -0500 Subject: [PATCH 086/120] PML Metadata: Improve cover extraction. --- src/calibre/ebooks/metadata/pml.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/metadata/pml.py b/src/calibre/ebooks/metadata/pml.py index dff791cb0b..499cf75d2c 100644 --- a/src/calibre/ebooks/metadata/pml.py +++ b/src/calibre/ebooks/metadata/pml.py @@ -1,9 +1,13 @@ -'''Read meta information from TXT files''' - -from __future__ import with_statement +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Read meta information from TXT files +''' + import os import glob @@ -57,20 +61,15 @@ def get_metadata(stream, extract_cover=True): return mi def get_cover(name, tdir, top_level=False): - cover_path = [] + cover_path = '' cover_data = None if top_level: - cover_path = glob.glob(os.path.join(tdir, 'cover.png')) - # Images not in top level try bookname_img directory because - # that's where Dropbook likes to see them. 
+ cover_path = os.path.join(tdir, 'cover.png') if os.path.exists(os.path.join(tdir, 'cover.png')) else '' if not cover_path: - cover_path = glob.glob(os.path.join(tdir, name + '_img', 'cover.png')) - # No images in Dropbook location try generic images directory - if not cover_path: - cover_path = glob.glob(os.path.join(os.path.join(tdir, 'images'), 'cover.png')) + cover_path = os.path.join(tdir, name + '_img', 'cover.png') if os.path.exists(os.path.join(tdir, name + '_img', 'cover.png')) else os.path.join(os.path.join(tdir, 'images'), 'cover.png') if os.path.exists(os.path.join(os.path.join(tdir, 'images'), 'cover.png')) else '' if cover_path: - with open(cover_path[0], 'r+b') as cstream: + with open(cover_path, 'r+b') as cstream: cover_data = cstream.read() return ('png', cover_data) From 5574f36c75dab3ef40e5501682b3a9b7bdfe934f Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 18:48:19 -0500 Subject: [PATCH 087/120] PML Input: Set cover properly if it is avaliable. --- src/calibre/ebooks/pml/input.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index c88a4f947d..45f54f192f 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -123,6 +123,8 @@ class PMLInput(InputFormatPlugin): from calibre.ebooks.metadata.meta import get_metadata log.debug('Reading metadata from input file...') mi = get_metadata(stream, 'pml') + if 'images/cover.png' in images: + mi.cover = 'images/cover.png' opf = OPFCreator(os.getcwd(), mi) log.debug('Generating manifest...') opf.create_manifest(manifest_items) From 56fd57605540edfa2208faa413037ccc74b29baa Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 20:06:46 -0500 Subject: [PATCH 088/120] eReader Metadata: Extract cover. 
--- src/calibre/ebooks/metadata/ereader.py | 19 +++++++++++-- src/calibre/ebooks/pdb/ereader/reader132.py | 31 +++++++++++++-------- src/calibre/ebooks/pml/input.py | 14 +++++----- 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/calibre/ebooks/metadata/ereader.py b/src/calibre/ebooks/metadata/ereader.py index 42f575188c..036baff2aa 100644 --- a/src/calibre/ebooks/metadata/ereader.py +++ b/src/calibre/ebooks/metadata/ereader.py @@ -16,6 +16,18 @@ from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.header import PdbHeaderReader +def get_cover(pheader, eheader): + cover_data = None + + for i in range(eheader.image_count): + raw = pheader.section_data(eheader.image_data_offset + i) + + if raw[4:4 + 32].strip('\x00') == 'cover.png': + cover_data = raw[62:] + break + + return ('png', cover_data) + def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object @@ -29,7 +41,7 @@ def get_metadata(stream, extract_cover=True): if len(pheader.section_data(0)) == 132: hr = HeaderRecord(pheader.section_data(0)) - if hr.version in (2, 10) and hr.has_metadata == 1: + if hr.compression in (2, 10) and hr.has_metadata == 1: try: mdata = pheader.section_data(hr.metadata_offset) @@ -41,6 +53,9 @@ def get_metadata(stream, extract_cover=True): except: pass + if extract_cover: + mi.cover_data = get_cover(pheader, hr) + if not mi.title: mi.title = pheader.title if pheader.title else _('Unknown') @@ -56,7 +71,7 @@ def set_metadata(stream, mi): sections = [pheader.section_data(x) for x in range(0, pheader.section_count())] hr = HeaderRecord(sections[0]) - if hr.version not in (2, 10): + if hr.compression not in (2, 10): return # Create a metadata record for the file if one does not alreay exist diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index 49fdfb8980..adb77d478f 100644 --- 
a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -29,12 +29,19 @@ class HeaderRecord(object): ''' def __init__(self, raw): - self.version, = struct.unpack('>H', raw[0:2]) + self.compression, = struct.unpack('>H', raw[0:2]) self.non_text_offset, = struct.unpack('>H', raw[12:14]) + self.chapter_count, = struct.unpack('>H', raw[14:16]) + self.image_count, = struct.unpack('>H', raw[20:22]) + self.link_count, = struct.unpack('>H', raw[22:24]) self.has_metadata, = struct.unpack('>H', raw[24:26]) - self.footnote_rec, = struct.unpack('>H', raw[28:30]) - self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.footnote_count, = struct.unpack('>H', raw[28:30]) + self.sidebar_count, = struct.unpack('>H', raw[30:32]) + self.chapter_offset, = struct.unpack('>H', raw[32:34]) + self.small_font_page_offset, = struct.unpack('>H', raw[36:38]) + self.large_font_page_offset, = struct.unpack('>H', raw[38:40]) self.image_data_offset, = struct.unpack('>H', raw[40:42]) + self.link_offset, = struct.unpack('>H', raw[42:44]) self.metadata_offset, = struct.unpack('>H', raw[44:46]) self.footnote_offset, = struct.unpack('>H', raw[48:50]) self.sidebar_offset, = struct.unpack('>H', raw[50:52]) @@ -58,11 +65,11 @@ class Reader132(FormatReader): self.header_record = HeaderRecord(self.section_data(0)) - if self.header_record.version not in (2, 10): - if self.header_record.version in (260, 272): + if self.header_record.compression not in (2, 10): + if self.header_record.compression in (260, 272): raise DRMError('eReader DRM is not supported.') else: - raise EreaderError('Unknown book version %i.' % self.header_record.version) + raise EreaderError('Unknown book compression %i.' 
% self.header_record.compression) from calibre.ebooks.metadata.pdb import get_metadata self.mi = get_metadata(stream, False) @@ -71,9 +78,9 @@ class Reader132(FormatReader): return self.sections[number] def decompress_text(self, number): - if self.header_record.version == 2: + if self.header_record.compression == 2: return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') - if self.header_record.version == 10: + if self.header_record.compression == 10: return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') def get_image(self, number): @@ -115,19 +122,19 @@ class Reader132(FormatReader): pml += self.get_text_page(i) html += pml_to_html(pml) - if self.header_record.footnote_rec > 0: + if self.header_record.footnote_count > 0: html += '<br /><h1>%s</h1>' % _('Footnotes') footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) - for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): + for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)): self.log.debug('Extracting footnote page %i' % i) html += '<dl>' html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) html += '</dl>' - if self.header_record.sidebar_rec > 0: + if self.header_record.sidebar_count > 0: html += '<br /><h1>%s</h1>' % _('Sidebar') sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) - for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): + for sid, i in 
enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)): self.log.debug('Extracting sidebar page %i' % i) html += '<dl>' html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index 45f54f192f..5453665a55 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -27,20 +27,20 @@ class PMLInput(InputFormatPlugin): def process_pml(self, pml_path, html_path, close_all=False): pclose = False hclose = False - + if not hasattr(pml_path, 'read'): pml_stream = open(pml_path, 'rb') pclose = True else: pml_stream = pml_path pml_stream.seek(0) - + if not hasattr(html_path, 'write'): html_stream = open(html_path, 'wb') hclose = True else: html_stream = html_path - + ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252' if self.options.input_encoding: ienc = self.options.input_encoding @@ -95,12 +95,12 @@ class PMLInput(InputFormatPlugin): with TemporaryDirectory('_unpmlz') as tdir: zf = ZipFile(stream) zf.extractall(tdir) - + pmls = glob.glob(os.path.join(tdir, '*.pml')) for pml in pmls: html_name = os.path.splitext(os.path.basename(pml))[0]+'.html' html_path = os.path.join(os.getcwd(), html_name) - + pages.append(html_name) log.debug('Processing PML item %s...' 
% pml) ttoc = self.process_pml(pml, html_path) @@ -119,7 +119,7 @@ class PMLInput(InputFormatPlugin): manifest_items = [] for item in pages+images: manifest_items.append((item, None)) - + from calibre.ebooks.metadata.meta import get_metadata log.debug('Reading metadata from input file...') mi = get_metadata(stream, 'pml') @@ -133,5 +133,5 @@ class PMLInput(InputFormatPlugin): with open('metadata.opf', 'wb') as opffile: with open('toc.ncx', 'wb') as tocfile: opf.render(opffile, tocfile, 'toc.ncx') - + return os.path.join(os.getcwd(), 'metadata.opf') From 1e6651393e9216b2eb9faea28e0f87d2e0ec60bf Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 20:10:07 -0500 Subject: [PATCH 089/120] eReader Input: Use included cover when avaliable. --- src/calibre/ebooks/pdb/ereader/reader132.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index adb77d478f..ffe414dd47 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -164,12 +164,15 @@ class Reader132(FormatReader): def create_opf(self, output_dir, images): with CurrentDir(output_dir): + if 'cover.png' in images: + self.mi.cover = os.path.join('images', 'cover.png') + opf = OPFCreator(output_dir, self.mi) manifest = [('index.html', None)] for i in images: - manifest.append((os.path.join('images/', i), None)) + manifest.append((os.path.join('images', i), None)) opf.create_manifest(manifest) opf.create_spine(['index.html']) From d5d8202c2a7a56a64afb582192c372b038a1dcb8 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 3 Dec 2009 20:38:33 -0500 Subject: [PATCH 090/120] PML2PMLZ input plugin. 
--- src/calibre/customize/builtins.py | 38 ++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 02bc0b621f..7dfba3a899 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1,3 +1,4 @@ +import os.path __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' @@ -49,6 +50,41 @@ every time you add an HTML file to the library.\ 'include: cp1252, latin1, iso-8859-1 and utf-8.') +class PML2PMLZ(FileTypePlugin): + name = 'PML to ZIP' + author = 'John Schember' + description = textwrap.dedent(_('''\ +Create a PMLZ archive containging the PML file \ +and all images in the directory pmlname_img or images \ +file containing all linked files. This plugin is run \ +every time you add an PML file to the library.\ +''')) + version = numeric_version + file_types = set(['pml']) + supported_platforms = ['windows', 'osx', 'linux'] + on_import = True + + def run(self, pmlfile): + import zipfile + from calibre.ptempfile import PersistentTemporaryFile + + name = os.path.join(tdir, '_plugin_pml2pmlz.pmlz') + pmlz = zipfile.ZipFile(name, 'w') + pmlz.write(pmlfile) + + pml_img = os.path.basename(pmlfile)[0] + '_img' + img_dir = pml_img if os.path.exists(pml_img) else 'images' if os.path.exists(images) else '' + if img_dir: + for image in glob.glob(os.path.join(img_dir, '*.png')): + pmlz.write(image) + pmlz.close() + + return name + + def customization_help(self, gui=False): + return _('Character encoding for the input PML files. 
Should ways be: cp1252.') + + class ComicMetadataReader(MetadataReaderPlugin): name = 'Read comic metadata' @@ -387,7 +423,7 @@ from calibre.devices.nuut2.driver import NUUT2 from calibre.devices.iriver.driver import IRIVER_STORY from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon -plugins = [HTML2ZIP, GoogleBooks, ISBNDB, Amazon] +plugins = [HTML2ZIP, PML2PMLZ, GoogleBooks, ISBNDB, Amazon] plugins += [ ComicInput, EPUBInput, From 9cfd0b9a4fbba99f1fcf2c2706bc9482bf01f662 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 4 Dec 2009 06:13:17 -0500 Subject: [PATCH 091/120] PMLTOPMLZ: Fix name. --- src/calibre/customize/builtins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 7dfba3a899..948d4c9b7d 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -51,7 +51,7 @@ every time you add an HTML file to the library.\ class PML2PMLZ(FileTypePlugin): - name = 'PML to ZIP' + name = 'PML to PMLZ' author = 'John Schember' description = textwrap.dedent(_('''\ Create a PMLZ archive containging the PML file \ From 74d613eb375d0d0051f361562e3ddfc5b2ab88dd Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 4 Dec 2009 18:38:20 -0500 Subject: [PATCH 092/120] Fix PML2PMLZ FileTypePlugin. --- src/calibre/customize/builtins.py | 33 +++++++++++++------------------ 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a8ad352b9b..e2f1055610 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -54,10 +54,10 @@ class PML2PMLZ(FileTypePlugin): name = 'PML to PMLZ' author = 'John Schember' description = textwrap.dedent(_('''\ - Create a PMLZ archive containing the PML file \ - and all images in the directory pmlname_img or images \ - file containing all linked files. 
This plugin is run \ - every time you add an PML file to the library.\ +Create a PMLZ archive containing the PML file \ +and all images in the directory pmlname_img or images \ +file containing all linked files. This plugin is run \ +every time you add an PML file to the library.\ ''')) version = numeric_version file_types = set(['pml']) @@ -66,25 +66,20 @@ class PML2PMLZ(FileTypePlugin): def run(self, pmlfile): import zipfile - from calibre.ptempfile import TemporaryDirectory - with TemporaryDirectory('_plugin_pml2pmlz') as tdir: - name = os.path.join(tdir, '_plugin_pml2pmlz.pmlz') - pmlz = zipfile.ZipFile(name, 'w') - pmlz.write(pmlfile) + of = self.temporary_file('_plugin_pml2pmlz.pmlz') + pmlz = zipfile.ZipFile(of.name, 'w') + pmlz.write(pmlfile, os.path.basename(pmlfile)) - pml_img = os.path.basename(pmlfile)[0] + '_img' - img_dir = pml_img if os.path.exists(pml_img) else 'images' if \ + pml_img = os.path.basename(pmlfile)[0] + '_img' + img_dir = pml_img if os.path.exists(pml_img) else 'images' if \ os.path.exists('images') else '' - if img_dir: - for image in glob.glob(os.path.join(img_dir, '*.png')): - pmlz.write(image) - pmlz.close() + if img_dir: + for image in glob.glob(os.path.join(img_dir, '*.png')): + pmlz.write(image, os.path.join('images', (os.path.basename(image)))) + pmlz.close() - return name - - def customization_help(self, gui=False): - return _('Character encoding for the input PML files. Should ways be: cp1252.') + return of.name class ComicMetadataReader(MetadataReaderPlugin): From 6634f3f13f80495c2884c257270bc81f501b95d6 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 4 Dec 2009 18:40:02 -0500 Subject: [PATCH 093/120] Fix PML2PMLZ FileTypePlugin description. 
--- src/calibre/customize/builtins.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e2f1055610..bd9e52ae93 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -55,9 +55,9 @@ class PML2PMLZ(FileTypePlugin): author = 'John Schember' description = textwrap.dedent(_('''\ Create a PMLZ archive containing the PML file \ -and all images in the directory pmlname_img or images \ -file containing all linked files. This plugin is run \ -every time you add an PML file to the library.\ +and all images in the directory pmlname_img or \ +images. This plugin is run every time you add \ +a PML file to the library. \ ''')) version = numeric_version file_types = set(['pml']) From d03dc39fcba71ced4d4fb36c84c16be16865327f Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 5 Dec 2009 08:09:48 -0500 Subject: [PATCH 094/120] FB2 Output: Use h1 tags to create section titles used for TOC. 
--- src/calibre/ebooks/fb2/fb2ml.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 41b93f6d6b..31b0d8f0a2 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -46,6 +46,10 @@ TAG_LINKS = [ 'a', ] +TAG_TITLE = [ + 'h1', +] + STYLES = [ ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), ('font-style', {'italic' : 'emphasis'}), @@ -196,7 +200,6 @@ class FB2MLizer(object): return [u''] tag = barename(elem.tag) - tag_count = 0 if tag in TAG_IMAGES: if elem.attrib.get('src', None): @@ -218,7 +221,6 @@ class FB2MLizer(object): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) href = self.link_hrefs[href] fb2_text.append('<a xlink:href="#%s">' % href) - tag_count += 1 tag_stack.append('a') # Anchor ids @@ -226,11 +228,20 @@ class FB2MLizer(object): if id_name: fb2_text.append(self.get_anchor(page, id_name)) + if tag in TAG_TITLE: + if 'p' in tag_stack: + ctag = [] + ctag.append(tag_stack.pop()) + while ctag[-1] != 'p': + ctag.append(tag_stack.pop()) + fb2_text += self.close_tags(ctag) + fb2_text.append('</section><section><title><p>') + tag_stack.append('title') + tag_stack.append('p') + fb2_tag = TAG_MAP.get(tag, None) if fb2_tag: - if fb2_tag not in tag_stack: - tag_count += 1 - else: + if fb2_tag in tag_stack: tag_stack.reverse() tag_stack.remove(fb2_tag) tag_stack.reverse() @@ -242,7 +253,6 @@ class FB2MLizer(object): for s in STYLES: style_tag = s[1].get(style[s[0]], None) if style_tag: - tag_count += 1 fb2_text.append('<%s>' % style_tag) tag_stack.append(style_tag) @@ -260,7 +270,7 @@ class FB2MLizer(object): fb2_text += self.dump_text(item, stylizer, page, tag_stack) close_tag_list = [] - for i in range(0, tag_count): + for i in range(0, len(tag_stack)): close_tag_list.insert(0, tag_stack.pop()) fb2_text += self.close_tags(close_tag_list) From 3263a8c3bacf757db2e74ca46b687cdf89267b78 Mon 
Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 5 Dec 2009 15:41:50 -0500 Subject: [PATCH 095/120] PML Input: Make footnotes and sidebars display better and add return link. --- src/calibre/ebooks/pdb/ereader/reader132.py | 10 +- src/calibre/ebooks/pml/pmlconverter.py | 146 ++++++++++++++------ 2 files changed, 110 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index ffe414dd47..d2a1c006e3 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -103,7 +103,7 @@ class Reader132(FormatReader): return self.decompress_text(number) def extract_content(self, output_dir): - from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html + from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html from calibre.ebooks.pml.pmlconverter import pml_to_html output_dir = os.path.abspath(output_dir) @@ -127,18 +127,14 @@ class Reader132(FormatReader): footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)): self.log.debug('Extracting footnote page %i' % i) - html += '<dl>' - html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - html += '</dl>' + html += footnote_to_html(footnoteids[fid], self.decompress_text(i)) if self.header_record.sidebar_count > 0: html += '<br /><h1>%s</h1>' % _('Sidebar') sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)): self.log.debug('Extracting sidebar page %i' % 
i) - html += '<dl>' - html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) - html += '</dl>' + html += sidebar_to_html(sidebarids[sid], self.decompress_text(i)) html += '</body></html>' diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index c120f2faf9..2d85a6b251 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -30,6 +30,7 @@ class PML_HTMLizer(object): 'h5', 'h6', 'a', + 'ra', 'c', 'r', 't', @@ -37,15 +38,24 @@ class PML_HTMLizer(object): 'l', 'k', 'T', - 'Fn', - 'Sd', - 'FS' + 'FN', + 'SB', ] STATES_VALUE_REQ = [ 'a', 'T', - 'FS' + ] + + STATES_VALUE_REQ_2 = [ + 'ra', + 'FN', + 'SB', + ] + + STATES_CLOSE_VALUE_REQ = [ + 'FN', + 'SB', ] STATES_TAGS = { @@ -57,7 +67,8 @@ class PML_HTMLizer(object): 'h6': ('<h6>', '</h6>'), 'sp': ('<sup>', '</sup>'), 'sb': ('<sub>', '</sub>'), - 'a': ('<a href="%s">', '</a>'), + 'a': ('<a href="#%s">', '</a>'), + 'ra': ('<span id="r%s"></span><a href="#%s">', '</a>'), 'c': ('<div style="text-align: center; margin: auto;">', '</div>'), 'r': ('<div style="text-align: right;">', '</div>'), 't': ('<div style="margin-left: 5%;">', '</div>'), @@ -68,7 +79,8 @@ class PML_HTMLizer(object): 'b': ('<span style="font-weight: bold;">', '</span>'), 'l': ('<span style="font-size: 150%;">', '</span>'), 'k': ('<span style="font-size: 75%;">', '</span>'), - 'FS': ('<div id="%s">', '</div>'), + 'FN': ('<br /><br style="page-break-after: always;" /><div id="fn-%s"><dl><dt>%s</dt><dd><p>', '</p></dd></dl><small><a href="#rfn-%s">return</a></small></div>'), + 'SB': ('<br /><br style="page-break-after: always;" /><div id="sb-%s"><dl><dt><dt>%s</dt><dd><p>', '</p></dd></dl><small><a href="#rsb-%s">return</a></small></div>'), } CODE_STATES = { @@ -93,14 +105,20 @@ class PML_HTMLizer(object): 'B': 'b', 'l': 'l', 'k': 'k', - 'Fn': 'a', - 'Sd': 'a', - 'FN': 'FS', - 'SB': 'FS', + 'Fn': 'ra', + 'Sd': 'ra', + 'FN': 'FN', + 'SB': 'SB', } + 
LINK_STATES = [ + 'a', + 'ra', + ] + BLOCK_STATES = [ 'a', + 'ra', 'h1', 'h2', 'h3', @@ -116,7 +134,8 @@ class PML_HTMLizer(object): 'r', 't', 'T', - 'FS', + 'FN', + 'SB', ] SPAN_STATES = [ @@ -144,8 +163,8 @@ class PML_HTMLizer(object): pml = re.sub(r'(?mus)^[ ]*$', '', pml) # Footnotes and Sidebars - pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) - pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) # Convert &'s into entities so & in the text doesn't get turned into # &. 
It will display as & @@ -181,10 +200,12 @@ class PML_HTMLizer(object): for key, val in self.state.items(): if val[0]: - if key not in self.STATES_VALUE_REQ: - start += self.STATES_TAGS[key][0] - else: + if key in self.STATES_VALUE_REQ: start += self.STATES_TAGS[key][0] % val[1] + elif key in self.STATES_VALUE_REQ_2: + start += self.STATES_TAGS[key][0] % (val[1], val[1]) + else: + start += self.STATES_TAGS[key][0] return u'<p>%s' % start @@ -206,7 +227,10 @@ class PML_HTMLizer(object): else: other.append(key) for key in span+div+other: - end += self.STATES_TAGS[key][1] + if key in self.STATES_CLOSE_VALUE_REQ: + end += self.STATES_TAGS[key][1] % self.state[key][1] + else: + end += self.STATES_TAGS[key][1] return u'%s</p>' % end @@ -239,11 +263,17 @@ class PML_HTMLizer(object): text = u'' if self.state[code][0]: - text = self.STATES_TAGS[code][1] + if code in self.STATES_CLOSE_VALUE_REQ: + text = self.STATES_TAGS[code][1] % self.state[code][1] + else: + text = self.STATES_TAGS[code][1] else: - if code in self.STATES_VALUE_REQ: + if code in self.STATES_VALUE_REQ or code in self.STATES_VALUE_REQ_2: val = self.code_value(stream) - text += self.STATES_TAGS[code][0] % val + if code in self.STATES_VALUE_REQ: + text = self.STATES_TAGS[code][0] % val + else: + text = self.STATES_TAGS[code][0] % (val, val) self.state[code][1] = val else: text = self.STATES_TAGS[code][0] @@ -258,7 +288,10 @@ class PML_HTMLizer(object): # Close all. for c in self.SPAN_STATES+self.DIV_STATES: if self.state[c][0]: - text += self.STATES_TAGS[c][1] + if c in self.STATES_CLOSE_VALUE_REQ: + text += self.STATES_TAGS[c][1] % self.state[c][1] + else: + text += self.STATES_TAGS[c][1] # Reopen the based on state. 
for c in self.DIV_STATES+self.SPAN_STATES: if code == c: @@ -266,6 +299,8 @@ class PML_HTMLizer(object): if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + elif c in self.STATES_VALUE_REQ_2: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % (self.state[c][1], self.state[c][1]) else: text += self.STATES_TAGS[c][0] # Open code. @@ -273,11 +308,17 @@ class PML_HTMLizer(object): # Close all spans. for c in self.SPAN_STATES: if self.state[c][0]: - text += self.STATES_TAGS[c][1] + if c in self.STATES_CLOSE_VALUE_REQ: + text += self.STATES_TAGS[c][1] % self.state[c][1] + else: + text += self.STATES_TAGS[c][1] # Process the code - if code in self.STATES_VALUE_REQ: + if code in self.STATES_VALUE_REQ or code in self.STATES_VALUE_REQ_2: val = self.code_value(stream) - text += self.STATES_TAGS[code][0] % val + if code in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[code][0] % val + else: + text += self.STATES_TAGS[code][0] % (val, val) self.state[code][1] = val else: text += self.STATES_TAGS[code][0] @@ -286,6 +327,8 @@ class PML_HTMLizer(object): if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + elif c in self.STATES_VALUE_REQ_2: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % (self.state[c][1], self.state[c][1]) else: text += self.STATES_TAGS[c][0] @@ -299,7 +342,10 @@ class PML_HTMLizer(object): # Close all spans for c in self.SPAN_STATES: if self.state[c][0]: - text += self.STATES_TAGS[c][1] + if c in self.STATES_CLOSE_VALUE_REQ: + text += self.STATES_TAGS[c][1] % self.state[c][1] + else: + text += self.STATES_TAGS[c][1] # Re-open the spans based on state except for code which will be # left closed. 
for c in self.SPAN_STATES: @@ -308,13 +354,18 @@ class PML_HTMLizer(object): if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[code][0] % self.state[c][1] + elif c in self.STATES_VALUE_REQ_2: + text += self.STATES_TAGS[code][0] % (self.state[c][1], self.state[c][1]) else: text += self.STATES_TAGS[c][0] # Open code. else: - if code in self.STATES_VALUE_REQ: + if code in self.STATES_VALUE_REQ or code in self.STATES_VALUE_REQ_2: val = self.code_value(stream) - text += self.STATES_TAGS[code][0] % val + if code in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[code][0] % val + else: + text += self.STATES_TAGS[code][0] % (val, val) self.state[code][1] = val else: text += self.STATES_TAGS[code][0] @@ -327,19 +378,29 @@ class PML_HTMLizer(object): # Close all spans for c in self.SPAN_STATES: if self.state[c][0]: - text += self.STATES_TAGS[c][1] - + if c in self.STATES_CLOSE_VALUE_REQ: + text += self.STATES_TAGS[c][1] % self.state[c][1] + else: + text += self.STATES_TAGS[c][1] # Process the code if self.state[code][0]: # Close tag - text += self.STATES_TAGS[code][1] + if code in self.STATES_CLOSE_VALUE_REQ: + text += self.STATES_TAGS[code][1] % self.state[code][1] + else: + text += self.STATES_TAGS[code][1] else: # Open tag - if code in self.STATES_VALUE_REQ: + if code in self.STATES_VALUE_REQ or code in self.STATES_VALUE_REQ_2: val = self.code_value(stream) + if code in self.LINK_STATES: + val = val.lstrip('#') if pre: - val = '#%s-%s' % (pre, val) - text += self.STATES_TAGS[code][0] % val + val = '%s-%s' % (pre, val) + if code in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[code][0] % val + else: + text += self.STATES_TAGS[code][0] % (val, val) self.state[code][1] = val else: text += self.STATES_TAGS[code][0] @@ -349,6 +410,8 @@ class PML_HTMLizer(object): if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[code][0] % self.state[c][1] + elif c in self.STATES_VALUE_REQ_2: + text += self.STATES_TAGS[code][0] % 
(self.state[c][1], self.state[c][1]) else: text += self.STATES_TAGS[c][0] @@ -432,13 +495,13 @@ class PML_HTMLizer(object): elif c in 'FSX': l = line.read(1) if '%s%s' % (c, l) == 'Fn': - text = self.process_code('Fn', line, 'fns') + text = self.process_code('Fn', line, 'fn') elif '%s%s' % (c, l) == 'FN': text = self.process_code('FN', line) elif '%s%s' % (c, l) == 'SB': text = self.process_code('SB', line) elif '%s%s' % (c, l) == 'Sd': - text = self.process_code('Sd', line, 'fns') + text = self.process_code('Sd', line, 'sb') else: text = self.process_code('%s%s' % (c, l), line) elif c == 'm': @@ -496,8 +559,13 @@ def pml_to_html(pml): hizer = PML_HTMLizer() return hizer.parse_pml(pml) -def footnote_sidebar_to_html(id, pml): - if id.startswith('\x01'): - id = id[2:] - html = '<div id="fns-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml)) +def footnote_sidebar_to_html(pre_id, id, pml): + id = id.strip('\x01') + html = '<br /><br style="page-break-after: always;" /><div id="%s-%s"><dl><dt>%s</dt><dd><p>%s</p></dd></dl><small><a href="#r%s-%s">return</a></small></div>' % (pre_id, id, id, pml_to_html(pml), pre_id, id) return html + +def footnote_to_html(id, pml): + return footnote_sidebar_to_html('fn', id, pml) + +def sidebar_to_html(id, pml): + return footnote_sidebar_to_html('sb', id, pml) From 216c27bb3c8d45d9d72ff3f5d0d72eb0557a3abe Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 5 Dec 2009 15:45:23 -0500 Subject: [PATCH 096/120] PML Input: Don't use id as title for footnotes and sidebars. 
--- src/calibre/ebooks/pml/pmlconverter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 2d85a6b251..3484be5927 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -45,12 +45,12 @@ class PML_HTMLizer(object): STATES_VALUE_REQ = [ 'a', 'T', + 'FN', + 'SB', ] STATES_VALUE_REQ_2 = [ 'ra', - 'FN', - 'SB', ] STATES_CLOSE_VALUE_REQ = [ @@ -79,8 +79,8 @@ class PML_HTMLizer(object): 'b': ('<span style="font-weight: bold;">', '</span>'), 'l': ('<span style="font-size: 150%;">', '</span>'), 'k': ('<span style="font-size: 75%;">', '</span>'), - 'FN': ('<br /><br style="page-break-after: always;" /><div id="fn-%s"><dl><dt>%s</dt><dd><p>', '</p></dd></dl><small><a href="#rfn-%s">return</a></small></div>'), - 'SB': ('<br /><br style="page-break-after: always;" /><div id="sb-%s"><dl><dt><dt>%s</dt><dd><p>', '</p></dd></dl><small><a href="#rsb-%s">return</a></small></div>'), + 'FN': ('<br /><br style="page-break-after: always;" /><div id="fn-%s"><p>', '</p><<small><a href="#rfn-%s">return</a></small></div>'), + 'SB': ('<br /><br style="page-break-after: always;" /><div id="sb-%s"><p>', '</p><small><a href="#rsb-%s">return</a></small></div>'), } CODE_STATES = { @@ -561,7 +561,7 @@ def pml_to_html(pml): def footnote_sidebar_to_html(pre_id, id, pml): id = id.strip('\x01') - html = '<br /><br style="page-break-after: always;" /><div id="%s-%s"><dl><dt>%s</dt><dd><p>%s</p></dd></dl><small><a href="#r%s-%s">return</a></small></div>' % (pre_id, id, id, pml_to_html(pml), pre_id, id) + html = '<br /><br style="page-break-after: always;" /><div id="%s-%s"><p>%s</p><small><a href="#r%s-%s">return</a></small></div>' % (pre_id, id, pml_to_html(pml), pre_id, id) return html def footnote_to_html(id, pml): From 4a20c9a5829d4ec82655ff801edc8e37f853d11f Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 5 Dec 
2009 21:39:33 -0500 Subject: [PATCH 097/120] PML Output: Remove unnecessary entity to unicode call. --- src/calibre/ebooks/pml/pmlml.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index b40870c0b5..6c217f524c 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -158,12 +158,6 @@ class PMLMLizer(object): text = text.replace(u'\xc2', '') text = text.replace(u'\xa0', ' ') - # Turn all html entities into unicode. This should not be necessary as - # lxml should have already done this but we want to be sure it happens. - for entity in set(re.findall('&.+?;', text)): - mo = re.search('(%s)' % entity[1:-1], text) - text = text.replace(entity, entity_to_unicode(mo)) - # Turn all characters that cannot be represented by themself into their # PML code equivelent text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text) From 0fff29bfb2551592e65ee994a65b99ad7f3a5179 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 5 Dec 2009 21:40:27 -0500 Subject: [PATCH 098/120] PML Output: Remove unnecessary import. --- src/calibre/ebooks/pml/pmlml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 6c217f524c..b23cd40813 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -14,7 +14,6 @@ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pml import unipmlcode -from calibre import entity_to_unicode TAG_MAP = { 'b' : 'B', From 44a78a9f0ff7c675341b5b5f0fd74882edabe288 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 7 Dec 2009 07:21:27 -0500 Subject: [PATCH 099/120] Preliminary Nook driver. 
--- src/calibre/customize/builtins.py | 2 ++ src/calibre/devices/nook/__init__.py | 0 src/calibre/devices/nook/driver.py | 36 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 src/calibre/devices/nook/__init__.py create mode 100644 src/calibre/devices/nook/driver.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index c317decd76..25a5fd0910 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -409,6 +409,7 @@ from calibre.devices.iliad.driver import ILIAD from calibre.devices.irexdr.driver import IREXDR1000 from calibre.devices.jetbook.driver import JETBOOK from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX +from calibre.devices.nook.driver import NOOK from calibre.devices.prs500.driver import PRS500 from calibre.devices.prs505.driver import PRS505 from calibre.devices.prs700.driver import PRS700 @@ -464,6 +465,7 @@ plugins += [ KINDLE, KINDLE2, KINDLE_DX, + NOOK, PRS505, PRS700, PRS500, diff --git a/src/calibre/devices/nook/__init__.py b/src/calibre/devices/nook/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py new file mode 100644 index 0000000000..45031082e1 --- /dev/null +++ b/src/calibre/devices/nook/driver.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john at nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Device driver for Barns and Nobel's Nook +''' + +from calibre.devices.usbms.driver import USBMS + +class NOOK(USBMS): + + name = 'Nook Iliad Device Interface' + description = _('Communicate with the Barns and Noble Nook eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'linux'] + + # Ordered list of supported formats + # Be sure these have an entry in calibre.devices.mime + FORMATS = ['epub', 'pdb', 'pdf'] + + VENDOR_ID = [0x2080] 
+ PRODUCT_ID = [0x001] + BCD = [0x322] + + VENDOR_NAME = 'B&N' + WINDOWS_MAIN_MEM = 'NOOK' + + #OSX_MAIN_MEM = '' + + MAIN_MEMORY_VOLUME_LABEL = 'BN Nook Main Memory' + + EBOOK_DIR_MAIN = 'my documents' + SUPPORTS_SUB_DIRS = True From 25d19004587cd651c524f0469251530b24ce9670 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 7 Dec 2009 07:35:57 -0500 Subject: [PATCH 100/120] NOOK Driver: Support storage card. --- src/calibre/devices/nook/driver.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index 45031082e1..fe73edf84a 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -27,6 +27,7 @@ class NOOK(USBMS): VENDOR_NAME = 'B&N' WINDOWS_MAIN_MEM = 'NOOK' + WINDOWS_CARD_A_MEM = 'NOOK' #OSX_MAIN_MEM = '' @@ -34,3 +35,12 @@ class NOOK(USBMS): EBOOK_DIR_MAIN = 'my documents' SUPPORTS_SUB_DIRS = True + + def windows_sort_drives(self, drives): + main = drives.get('main', None) + card = drives.get('carda', None) + if card and main and card < main: + drives['main'] = card + drives['carda'] = main + + return drives From af4e0b0155c9000212a0e2ead02dd18a6d5ea8e5 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 12 Dec 2009 10:48:48 -0500 Subject: [PATCH 101/120] Fix errors preventing develop --clean-all from working properly. 
--- setup/extensions.py | 6 +++--- setup/installer/__init__.py | 2 +- src/calibre/ebooks/pdb/ereader/reader132.py | 2 +- src/calibre/ebooks/pdb/ereader/reader202.py | 4 ++-- src/calibre/ebooks/pdb/palmdoc/reader.py | 2 +- src/calibre/ebooks/pdb/palmdoc/writer.py | 3 ++- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/setup/extensions.py b/setup/extensions.py index faa1a3d88a..0d465f4a0a 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -375,9 +375,9 @@ class Build(Command): for x in (dest, dest+'.manifest'): if os.path.exists(x): os.remove(x) - shutil.rmtree(self.j(self.d(self.SRC), 'build')) - - + build_dir = self.j(self.d(self.SRC), 'build') + if os.path.exists(build_dir): + shutil.rmtree(build_dir) class BuildPDF2XML(Command): diff --git a/setup/installer/__init__.py b/setup/installer/__init__.py index 99e7586aa3..2acc5d4649 100644 --- a/setup/installer/__init__.py +++ b/setup/installer/__init__.py @@ -126,5 +126,5 @@ class VMInstaller(Command): def clean(self): installer = self.installer() - if os.patyh.exists(installer): + if os.path.exists(installer): os.remove(installer) diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index d2a1c006e3..cce1d40f8c 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -15,7 +15,6 @@ import zlib from calibre import CurrentDir from calibre.ebooks import DRMError -from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.formatreader import FormatReader @@ -79,6 +78,7 @@ class Reader132(FormatReader): def decompress_text(self, number): if self.header_record.compression == 2: + from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') if 
self.header_record.compression == 10: return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py index ce7ad1263a..a674c5bf60 100644 --- a/src/calibre/ebooks/pdb/ereader/reader202.py +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ -12,7 +12,6 @@ import struct from calibre import CurrentDir from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError @@ -55,7 +54,8 @@ class Reader202(FormatReader): return self.sections[number] def decompress_text(self, number): - return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + from calibre.ebooks.compression.palmdoc import decompress_doc + return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding, 'replace') def get_image(self, number): name = None diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 0a57e3f51a..ea7e6bbc2b 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -11,7 +11,6 @@ __docformat__ = 'restructuredtext en' import os import struct -from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted @@ -51,6 +50,7 @@ class Reader(FormatReader): if self.header_record.compression == 1: return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) if 
self.header_record.compression == 2: + from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') return '' diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 91a5eb3d97..3f4a92fbed 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en' import struct -from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.txt.txtml import TXTMLizer @@ -25,6 +24,8 @@ class Writer(FormatWriter): self.log = log def write_content(self, oeb_book, out_stream, metadata=None): + from calibre.ebooks.compression.palmdoc import compress_doc + title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = self._generate_text(oeb_book) From a8cb44249df81a516c03aaf8a8d1da374a49b515 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 12 Dec 2009 19:55:30 -0500 Subject: [PATCH 102/120] Work on FB2ML --- src/calibre/ebooks/fb2/fb2ml.py | 88 ++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 34 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 31b0d8f0a2..d991c0072b 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -10,6 +10,7 @@ Transform OEB content into FB2 markup import cStringIO from base64 import b64encode +import re try: from PIL import Image @@ -30,14 +31,15 @@ TAG_MAP = { 'i' : 'emphasis', 'p' : 'p', 'li' : 'p', - 'br' : 'p', + 'div': 'p', } -TAG_SPACE = [ - 'div', +TAG_FORCE_P = [ 'br', ] +TAG_SPACE = [] + TAG_IMAGES = [ 'img', ] @@ -79,8 +81,14 @@ class 
FB2MLizer(object): output.append(self.fb2mlize_images()) output.append(self.fb2_footer()) output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc()) + output = self.clean_text(output) return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) + def clean_text(self, text): + text = re.sub('<p>[ ]*</p>', '', text) + + return text + def fb2_header(self): author_first = u'' author_middle = u'' @@ -124,7 +132,7 @@ class FB2MLizer(object): return output def get_toc(self): - toc = [u''] + toc = [] if self.opts.inline_toc: self.log.debug('Generating table of contents...') toc.append(u'<p>%s</p>' % _('Table of Contents:')) @@ -136,7 +144,7 @@ class FB2MLizer(object): return ''.join(toc) def get_text(self): - text = [u''] + text = [] for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) @@ -162,7 +170,7 @@ class FB2MLizer(object): return '<a id="%s" />' % aid def fb2mlize_images(self): - images = [u''] + images = [] for item in self.oeb_book.manifest: if item.media_type in OEB_RASTER_IMAGES: try: @@ -190,14 +198,15 @@ class FB2MLizer(object): def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return [u''] + return [] - fb2_text = [u''] style = stylizer.style(elem) - if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return [u''] + return [] + + fb2_text = [] + tags = [] tag = barename(elem.tag) @@ -221,14 +230,32 @@ class FB2MLizer(object): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) href = self.link_hrefs[href] fb2_text.append('<a xlink:href="#%s">' % href) - tag_stack.append('a') + tags.append('a') # Anchor ids id_name = elem.get('id') if id_name: 
fb2_text.append(self.get_anchor(page, id_name)) - if tag in TAG_TITLE: + if tag in TAG_FORCE_P: + if 'p' in tag_stack+tags: + # Close all up to p. Close p. Reopen all closed tags including p. + all_tags = tag_stack+tags + closed_tags = [] + all_tags.reverse() + for t in all_tags: + fb2_text.append('</%s>' % t) + closed_tags.append(t) + if t == 'p': + break + closed_tags.reverse() + for t in closed_tags: + fb2_text.append('<%s>' % t) + else: + fb2_text.append('<p>') + tags.append('p') + + '''if tag in TAG_TITLE: if 'p' in tag_stack: ctag = [] ctag.append(tag_stack.pop()) @@ -237,42 +264,35 @@ class FB2MLizer(object): fb2_text += self.close_tags(ctag) fb2_text.append('</section><section><title><p>') tag_stack.append('title') - tag_stack.append('p') + tag_stack.append('p')''' fb2_tag = TAG_MAP.get(tag, None) - if fb2_tag: - if fb2_tag in tag_stack: - tag_stack.reverse() - tag_stack.remove(fb2_tag) - tag_stack.reverse() - fb2_text.append('</%s>' % fb2_tag) + if fb2_tag and fb2_tag not in tag_stack+tags: fb2_text.append('<%s>' % fb2_tag) - tag_stack.append(fb2_tag) + tags.append(fb2_tag) # Processes style information for s in STYLES: style_tag = s[1].get(style[s[0]], None) - if style_tag: + if style_tag and style_tag not in tag_stack+tags: fb2_text.append('<%s>' % style_tag) - tag_stack.append(style_tag) + tags.append(style_tag) if tag in TAG_SPACE: - if not fb2_text or fb2_text[-1] != ' ': + if not fb2_text or fb2_text[-1] != ' ' or not fb2_text[-1].endswith(' '): fb2_text.append(' ') if hasattr(elem, 'text') and elem.text: - if 'p' not in tag_stack: + if 'p' not in tag_stack+tags: fb2_text.append('<p>%s</p>' % prepare_string_for_xml(elem.text)) else: fb2_text.append(prepare_string_for_xml(elem.text)) for item in elem: - fb2_text += self.dump_text(item, stylizer, page, tag_stack) + fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags) - close_tag_list = [] - for i in range(0, len(tag_stack)): - close_tag_list.insert(0, tag_stack.pop()) - fb2_text += 
self.close_tags(close_tag_list) + tags.reverse() + fb2_text += self.close_tags(tags) if hasattr(elem, 'tail') and elem.tail: if 'p' not in tag_stack: @@ -280,12 +300,12 @@ class FB2MLizer(object): else: fb2_text.append(prepare_string_for_xml(elem.tail)) + #print fb2_text return fb2_text def close_tags(self, tags): - fb2_text = [u''] - for i in range(0, len(tags)): - fb2_tag = tags.pop() - fb2_text.append('</%s>' % fb2_tag) + text = [] + for tag in tags: + text.append('</%s>' % tag) - return fb2_text + return text From ca89801730d9b736ef8b4baa6aa5e9094385739b Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 12 Dec 2009 20:21:43 -0500 Subject: [PATCH 103/120] More FB2 work. --- src/calibre/ebooks/fb2/fb2ml.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index d991c0072b..b454d7c7ab 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -48,10 +48,6 @@ TAG_LINKS = [ 'a', ] -TAG_TITLE = [ - 'h1', -] - STYLES = [ ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), ('font-style', {'italic' : 'emphasis'}), @@ -255,17 +251,6 @@ class FB2MLizer(object): fb2_text.append('<p>') tags.append('p') - '''if tag in TAG_TITLE: - if 'p' in tag_stack: - ctag = [] - ctag.append(tag_stack.pop()) - while ctag[-1] != 'p': - ctag.append(tag_stack.pop()) - fb2_text += self.close_tags(ctag) - fb2_text.append('</section><section><title><p>') - tag_stack.append('title') - tag_stack.append('p')''' - fb2_tag = TAG_MAP.get(tag, None) if fb2_tag and fb2_tag not in tag_stack+tags: fb2_text.append('<%s>' % fb2_tag) From 3216eccce77d792da3ab980cd44f43b23fb2dde5 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 12 Dec 2009 21:03:07 -0500 Subject: [PATCH 104/120] Add Todo to FB2MLizer. 
--- src/calibre/ebooks/fb2/fb2ml.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index b454d7c7ab..1c0e5c10be 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -54,6 +54,13 @@ STYLES = [ ] class FB2MLizer(object): + ''' + Todo: * Ensure all style tags are inside of the p tags. + * Include more FB2 specific tags in the conversion. + * Handle reopening of a tag properly. + * Figure out some way to turn oeb_book.toc items into <section><title> + <p> to allow for readers to generate toc from the document. + ''' def __init__(self, log): self.log = log @@ -285,7 +292,6 @@ class FB2MLizer(object): else: fb2_text.append(prepare_string_for_xml(elem.tail)) - #print fb2_text return fb2_text def close_tags(self, tags): From 2d2ec5fb51feded1e81e8dcf9214c1e075068387 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 12 Dec 2009 22:30:16 -0500 Subject: [PATCH 105/120] Support Ganaxa Ger2 ereader. 
--- src/calibre/customize/builtins.py | 5 +++-- src/calibre/devices/cybookg3/driver.py | 16 ++++++++++++++++ src/calibre/devices/eb600/driver.py | 17 ++++++++++++++++- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 25a5fd0910..1067d72357 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -404,7 +404,7 @@ from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI from calibre.devices.blackberry.driver import BLACKBERRY from calibre.devices.cybookg3.driver import CYBOOKG3, CYBOOK_OPUS from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \ - POCKETBOOK360 + POCKETBOOK360, GER2 from calibre.devices.iliad.driver import ILIAD from calibre.devices.irexdr.driver import IREXDR1000 from calibre.devices.jetbook.driver import JETBOOK @@ -476,7 +476,8 @@ plugins += [ ESLICK, NUUT2, IRIVER_STORY, - POCKETBOOK360 + POCKETBOOK360, + GER2, ] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 6b5e5ff4ed..54f7f93579 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -78,6 +78,14 @@ class CYBOOKG3(USBMS): return zip(paths, cycle([on_card])) + @classmethod + def can_handle(cls, device_info, debug=False): + USBMS.can_handle(device_info, debug) + if islinux: + if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3': + return True + return False + class CYBOOK_OPUS(CYBOOKG3): @@ -103,3 +111,11 @@ class CYBOOK_OPUS(CYBOOKG3): EBOOK_DIR_MAIN = 'eBooks' EBOOK_DIR_CARD_A = 'eBooks' SUPPORTS_SUB_DIRS = True + + @classmethod + def can_handle(cls, device_info, debug=False): + USBMS.can_handle(device_info, debug) + if islinux: + if device_info[3] == 'Bookeen': + return True + return False diff --git 
a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index 1e36775bb2..e0c031f30c 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -21,7 +21,7 @@ class EB600(USBMS): name = 'Netronix EB600 Device Interface' description = _('Communicate with the EB600 eBook reader.') - author = _('Kovid Goyal') + author = 'Kovid Goyal' supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats @@ -97,3 +97,18 @@ class POCKETBOOK360(EB600): OSX_MAIN_MEM = 'Philips Mass Storge Media' OSX_CARD_A_MEM = 'Philips Mass Storge Media' +class GER2(EB600): + + name = 'Ganaxa GeR2 Device Interface' + gui_name = 'Ganaxa GeR2' + supported_platforms = ['windows'] + + FORMATS = ['pdf'] + + VENDOR_ID = [0xbda] + PRODUCT_ID = [0x703] + BCD = [0x132] + + VENDOR_NAME = 'GANAXA' + WINDOWS_MAIN_MEN = 'GER2_________-FD' + WINDOWS_CARD_A_MEM = 'GER2_________-SD' From eb6fd6a3b1e3edc5a3e75847ef1a5fd7e73eefe2 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 13 Dec 2009 09:25:14 -0500 Subject: [PATCH 106/120] Add missing import and supported OS. 
--- src/calibre/devices/cybookg3/driver.py | 1 + src/calibre/devices/eb600/driver.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 54f7f93579..82429cdffa 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -11,6 +11,7 @@ Device driver for Bookeen's Cybook Gen 3 import os from itertools import cycle +from calibre import islinux from calibre.devices.usbms.driver import USBMS import calibre.devices.cybookg3.t2b as t2b diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index e0c031f30c..0963292b2d 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -101,7 +101,7 @@ class GER2(EB600): name = 'Ganaxa GeR2 Device Interface' gui_name = 'Ganaxa GeR2' - supported_platforms = ['windows'] + supported_platforms = ['windows', 'linux'] FORMATS = ['pdf'] From 65e6e04d65e9ab014d7649aa40c954f8fdf94651 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 13 Dec 2009 09:38:29 -0500 Subject: [PATCH 107/120] Add support for the Nokia 770 internet tablet. 
--- src/calibre/customize/builtins.py | 2 ++ src/calibre/devices/cybookg3/driver.py | 2 +- src/calibre/devices/nokia/__init__.py | 0 src/calibre/devices/nokia/driver.py | 35 ++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 src/calibre/devices/nokia/__init__.py create mode 100644 src/calibre/devices/nokia/driver.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 1067d72357..8f21d86b74 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -414,6 +414,7 @@ from calibre.devices.prs500.driver import PRS500 from calibre.devices.prs505.driver import PRS505 from calibre.devices.prs700.driver import PRS700 from calibre.devices.android.driver import ANDROID +from calibre.devices.nokia.driver import N770 from calibre.devices.eslick.driver import ESLICK from calibre.devices.nuut2.driver import NUUT2 from calibre.devices.iriver.driver import IRIVER_STORY @@ -470,6 +471,7 @@ plugins += [ PRS700, PRS500, ANDROID, + N770, CYBOOK_OPUS, COOL_ER, SHINEBOOK, diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 82429cdffa..f299fc30d6 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -20,7 +20,7 @@ class CYBOOKG3(USBMS): name = 'Cybook Gen 3 Device Interface' gui_name = 'Cybook Gen 3' description = _('Communicate with the Cybook Gen 3 eBook reader.') - author = _('John Schember') + author = 'John Schember' supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats diff --git a/src/calibre/devices/nokia/__init__.py b/src/calibre/devices/nokia/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/devices/nokia/driver.py b/src/calibre/devices/nokia/driver.py new file mode 100644 index 0000000000..5f6191f751 --- /dev/null +++ b/src/calibre/devices/nokia/driver.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + 
+__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john at nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Device driver for Nokia's internet tablet devices +''' + +from calibre.devices.usbms.driver import USBMS + +class N770(USBMS): + + name = 'Nokia 770 Device Interface' + gui_name = 'Nokia 770' + description = _('Communicate with the Nokia Nokia 770 internet tablet.') + author = 'John Schember' + supported_platforms = ['windows', 'linux'] + + # Ordered list of supported formats + FORMATS = ['mobi', 'prc', 'epub', 'html', 'zip', 'fb2', 'chm', 'pdb', + 'tcr', 'txt', 'rtf'] + + VENDOR_ID = [0x111] + PRODUCT_ID = [0x1af] + BCD = [0x134] + + VENDOR_NAME = 'NOKIA' + WINDOWS_MAIN_MEM = '770' + + MAIN_MEMORY_VOLUME_LABEL = 'N770 Main Memory' + + EBOOK_DIR_MAIN = 'My Ebooks' + SUPPORTS_SUB_DIRS = True From 288b64529c4caf5472d532fedf3abaf04c86ec35 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 15 Dec 2009 05:55:36 -0500 Subject: [PATCH 108/120] Fix can_handle for Cybook Gen 3 and Opus. --- src/calibre/devices/cybookg3/driver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index f299fc30d6..77deb6efa5 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -85,7 +85,8 @@ class CYBOOKG3(USBMS): if islinux: if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3': return True - return False + return False + return True class CYBOOK_OPUS(CYBOOKG3): @@ -119,4 +120,5 @@ class CYBOOK_OPUS(CYBOOKG3): if islinux: if device_info[3] == 'Bookeen': return True - return False + return False + return True From 35fc570d2481347d27337557e40a6bd7d268e6e7 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 15 Dec 2009 18:11:36 -0500 Subject: [PATCH 109/120] PML Output: Generate \CX Tags as chapter anchors. 
PDB eReader Output: Use \CX tags to generate chapter index. --- src/calibre/ebooks/pdb/ereader/writer.py | 52 +++++++++--------------- src/calibre/ebooks/pml/pmlml.py | 29 ++++++++++--- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 263f6964bf..a379899af5 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -42,8 +42,8 @@ class Writer(FormatWriter): pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') text, text_sizes = self._text(pml) - chapter_index = self._chapter_index(pml) - link_index = self._link_index(pml) + chapter_index = self._index_item(r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', pml) + link_index = self._index_item(r'(?s)\\Q="(?P<text>.+?)"', pml) images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))] @@ -101,38 +101,24 @@ class Writer(FormatWriter): return pml_pages, text_sizes - def _index_item(self, mo): - index = '' - if 'text' in mo.groupdict().keys(): - index += struct.pack('>L', mo.start()) - text = mo.group('text') - # Strip all PML tags from text - text = re.sub(r'\\U[0-9a-z]{4}', '', text) - text = re.sub(r'\\a\d{3}', '', text) - text = re.sub(r'\\.', '', text) - # Add appropriate spacing to denote the various levels of headings - if 'val' in mo.groupdict().keys(): - text = '%s%s' % (' ' * 4 * int(mo.group('val')), text) - index += text - index += '\x00' - return index - - def _chapter_index(self, pml): - chapter_marks = [ - r'(?s)\\x(?P<text>.+?)\\x', - r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', - r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', - ] + def _index_item(self, regex, pml): index = [] - for chapter_mark in chapter_marks: - for mo in re.finditer(chapter_mark, pml): - index.append(self._index_item(mo)) - return index - 
- def _link_index(self, pml): - index = [] - for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml): - index.append(self._index_item(mo)) + for mo in re.finditer(regex, pml): + item = '' + if 'text' in mo.groupdict().keys(): + item += struct.pack('>L', mo.start()) + text = mo.group('text') + # Strip all PML tags from text + text = re.sub(r'\\U[0-9a-z]{4}', '', text) + text = re.sub(r'\\a\d{3}', '', text) + text = re.sub(r'\\.', '', text) + # Add appropriate spacing to denote the various levels of headings + if 'val' in mo.groupdict().keys(): + text = '%s%s' % (' ' * 4 * int(mo.group('val')), text) + item += text + item += '\x00' + if item: + index.append(item) return index def _images(self, manifest, image_hrefs): diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index b23cd40813..ccce95fce6 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -79,6 +79,16 @@ class PMLMLizer(object): self.log.info('Converting XHTML to PML markup...') self.oeb_book = oeb_book self.opts = opts + + # This is used for adding \CX tags chapter markers. This is separate + # from the optional inline toc. 
+ self.toc = {} + for item in oeb_book.toc: + page, mid, id = item.href.partition('#') + if not self.toc.get(page, None): + self.toc[page] = {} + self.toc[page][id] = item.title + return self.pmlmlize_spine() def pmlmlize_spine(self): @@ -107,7 +117,11 @@ class PMLMLizer(object): return output def get_toc(self): - toc = [u''] + ''' + Generation of inline TOC + ''' + + toc = [] if self.opts.inline_toc: self.log.debug('Generating table of contents...') toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:')) @@ -177,14 +191,14 @@ class PMLMLizer(object): def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return [u''] + return [] - text = [u''] + text = [] style = stylizer.style(elem) if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return [u''] + return [] tag = barename(elem.tag) tag_count = 0 @@ -213,6 +227,12 @@ class PMLMLizer(object): else: w += '="50%"' text.append(w) + toc_id = elem.attrib.get('id', None) + if toc_id: + if self.toc.get(page.href, None): + toc_title = self.toc[page.href].get(toc_id, None) + if toc_title: + text.append('\\C1="%s"' % toc_title) # Process style information that needs holds a single tag # Commented out because every page in an OEB book starts with this style @@ -287,4 +307,3 @@ class PMLMLizer(object): if tag != 'block': text.append('\\%s' % tag) return text - From 549d2f00d429aff132d35d2e80d64fbcce74abb2 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 15 Dec 2009 18:28:32 -0500 Subject: [PATCH 110/120] Add N810 driver. Fix device ids for GeR2 and N770. 
--- src/calibre/customize/builtins.py | 3 ++- src/calibre/devices/eb600/driver.py | 4 ++-- src/calibre/devices/nokia/driver.py | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 1ea76a2189..ed942b5a9a 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -413,7 +413,7 @@ from calibre.devices.nook.driver import NOOK from calibre.devices.prs500.driver import PRS500 from calibre.devices.prs505.driver import PRS505, PRS700 from calibre.devices.android.driver import ANDROID -from calibre.devices.nokia.driver import N770 +from calibre.devices.nokia.driver import N770, N810 from calibre.devices.eslick.driver import ESLICK from calibre.devices.nuut2.driver import NUUT2 from calibre.devices.iriver.driver import IRIVER_STORY @@ -470,6 +470,7 @@ plugins += [ PRS500, ANDROID, N770, + N810, CYBOOK_OPUS, COOL_ER, ESLICK, diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index 18e86fb238..07217ac78d 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -104,8 +104,8 @@ class GER2(EB600): FORMATS = ['pdf'] - VENDOR_ID = [0xbda] - PRODUCT_ID = [0x703] + VENDOR_ID = [0x3034] + PRODUCT_ID = [0x1795] BCD = [0x132] VENDOR_NAME = 'GANAXA' diff --git a/src/calibre/devices/nokia/driver.py b/src/calibre/devices/nokia/driver.py index e6944de4d9..7bd1dbb28d 100644 --- a/src/calibre/devices/nokia/driver.py +++ b/src/calibre/devices/nokia/driver.py @@ -22,9 +22,9 @@ class N770(USBMS): FORMATS = ['mobi', 'prc', 'epub', 'html', 'zip', 'fb2', 'chm', 'pdb', 'tcr', 'txt', 'rtf'] - VENDOR_ID = [0x111] - PRODUCT_ID = [0x1af] - BCD = [0x134] + VENDOR_ID = [0x421] + PRODUCT_ID = [0x431] + BCD = [0x308] VENDOR_NAME = 'NOKIA' WINDOWS_MAIN_MEM = '770' @@ -33,3 +33,15 @@ class N770(USBMS): EBOOK_DIR_MAIN = 'My Ebooks' SUPPORTS_SUB_DIRS = True + +class N810(N770): + name = 'Nokia 810 
Device Interface' + gui_name = 'Nokia 810' + description = _('Communicate with the Nokia Nokia 810 internet tablet.') + + PRODUCT_ID = [0x96] + BCD = [0x316] + + WINDOWS_MAIN_MEM = 'N810' + + MAIN_MEMORY_VOLUME_LABEL = 'N810 Main Memory' From b62e3b03b0fb5d2f75d0eb50251c154743b7fecf Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 16 Dec 2009 07:02:12 -0500 Subject: [PATCH 111/120] PML Output: Change \C1 to \C0. --- src/calibre/ebooks/pml/pmlml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index ccce95fce6..c8acf2487e 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -232,7 +232,7 @@ class PMLMLizer(object): if self.toc.get(page.href, None): toc_title = self.toc[page.href].get(toc_id, None) if toc_title: - text.append('\\C1="%s"' % toc_title) + text.append('\\C0="%s"' % toc_title) # Process style information that needs holds a single tag # Commented out because every page in an OEB book starts with this style From 81c8e661ffbf47fbd5e7a3ff59d0ebc1722c2314 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 18 Dec 2009 06:01:45 -0500 Subject: [PATCH 112/120] Update eSlick formats for 2.0 firmware. 
--- src/calibre/devices/eslick/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/eslick/driver.py b/src/calibre/devices/eslick/driver.py index e891baa50b..5bdb1c04d2 100644 --- a/src/calibre/devices/eslick/driver.py +++ b/src/calibre/devices/eslick/driver.py @@ -18,7 +18,7 @@ class ESLICK(USBMS): supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats - FORMATS = ['pdf', 'txt'] + FORMATS = ['epub', 'pdb', 'pdf', 'txt'] VENDOR_ID = [0x04cc] PRODUCT_ID = [0x1a64] From d877cd91a7457a66e88f7a9b6b39b743432f8d92 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 18 Dec 2009 06:06:10 -0500 Subject: [PATCH 113/120] Don't translate author name. --- src/calibre/devices/eslick/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/eslick/driver.py b/src/calibre/devices/eslick/driver.py index 5bdb1c04d2..4854c5c730 100644 --- a/src/calibre/devices/eslick/driver.py +++ b/src/calibre/devices/eslick/driver.py @@ -14,7 +14,7 @@ class ESLICK(USBMS): name = 'ESlick Device Interface' gui_name = 'Foxit ESlick' description = _('Communicate with the ESlick eBook reader.') - author = _('Kovid Goyal') + author = 'Kovid Goyal' supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats From 11427001a3cca9aacbc0fb5950a707d9667ab290 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 18 Dec 2009 19:05:59 -0500 Subject: [PATCH 114/120] USBMS: Move windows sort drives before checks for main. Should fix a bug detecting iriver story. 
--- src/calibre/devices/iriver/driver.py | 2 +- src/calibre/devices/usbms/device.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/iriver/driver.py b/src/calibre/devices/iriver/driver.py index 030fe8f6bb..f8e7d41600 100644 --- a/src/calibre/devices/iriver/driver.py +++ b/src/calibre/devices/iriver/driver.py @@ -13,7 +13,7 @@ class IRIVER_STORY(USBMS): name = 'Iriver Story Device Interface' gui_name = 'Iriver Story' description = _('Communicate with the Iriver Story reader.') - author = _('Kovid Goyal') + author = 'Kovid Goyal' supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 7cd702dd96..5effa0a8c6 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -302,13 +302,13 @@ class Device(DeviceConfig, DevicePlugin): drives['main'] = drives.pop('carda') drives = self.windows_open_callback(drives) + drives = self.windows_sort_drives(drives) if drives.get('main', None) is None: raise DeviceError( _('Unable to detect the %s disk drive. Try rebooting.') % self.__class__.__name__) - drives = self.windows_sort_drives(drives) self._main_prefix = drives.get('main') self._card_a_prefix = drives.get('carda', None) self._card_b_prefix = drives.get('cardb', None) From 9ee3e926c0cf29189194fd4c3edb74c91f5c2309 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 18 Dec 2009 23:35:19 -0500 Subject: [PATCH 115/120] Jetbook driver: add FB2 format. 
--- src/calibre/devices/jetbook/driver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index b1e0b17fb9..a55e76155f 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -20,13 +20,13 @@ from calibre.ebooks.metadata import authors_to_string, string_to_authors class JETBOOK(USBMS): name = 'Ectaco JetBook Device Interface' description = _('Communicate with the JetBook eBook reader.') - author = _('James Ralston') + author = 'James Ralston' supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats # Be sure these have an entry in calibre.devices.mime - FORMATS = ['epub', 'mobi', 'prc', 'txt', 'rtf', 'pdf'] + FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'txt', 'rtf', 'pdf'] VENDOR_ID = [0x0525] PRODUCT_ID = [0xa4a5] From 86afb92057ee61953c1ea9cbb5e4cf7c5668962b Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 19 Dec 2009 15:58:51 -0500 Subject: [PATCH 116/120] Realign to trunk. 
--- session.vim | 7 +++ src/calibre/devices/cybookg3/driver.py | 15 +++-- src/calibre/devices/iriver/driver.py | 2 +- src/calibre/devices/jetbook/driver.py | 25 +++++---- src/calibre/devices/prs500/books.py | 14 +++-- src/calibre/devices/prs500/driver.py | 6 +- src/calibre/devices/prs505/books.py | 15 +++-- src/calibre/devices/prs505/driver.py | 20 +++---- src/calibre/devices/usbms/device.py | 62 ++++++++++++++++----- src/calibre/devices/usbms/driver.py | 14 ++--- src/calibre/gui2/device.py | 60 ++++++++++++-------- src/calibre/gui2/dialogs/config/add_save.ui | 2 +- src/calibre/manual/conversion.rst | 1 + 13 files changed, 152 insertions(+), 91 deletions(-) diff --git a/session.vim b/session.vim index 56705f9528..6b965cff2f 100644 --- a/session.vim +++ b/session.vim @@ -13,4 +13,11 @@ base_dir = os.path.join(src_dir, 'calibre') vipy.session.initialize(project_name='calibre', src_dir=src_dir, project_dir=project_dir, base_dir=base_dir) + +def recipe_title_callback(raw): + return eval(raw.decode('utf-8')) + +vipy.session.add_content_browser('.r', ',r', 'Recipe', + vipy.session.glob_based_iterator(os.path.join(project_dir, 'resources', 'recipes', '*.recipe')), + vipy.session.regexp_based_matcher(r'title\s*=\s*(?P<title>.+)', 'title', recipe_title_callback)) EOFPY diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index e1d8aaa0c7..04e5e7012c 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -47,25 +47,24 @@ class CYBOOKG3(USBMS): DELETE_EXTS = ['.mbp', '.dat', '_6090.t2b'] SUPPORTS_SUB_DIRS = True - def upload_books(self, files, metadatas, ids, on_card=None, - end_session=True): + def upload_books(self, files, names, on_card=None, end_session=True, + metadata=None): path = self._sanity_check(on_card, files) paths = [] - metadatas = iter(metadatas) - ids = iter(ids) + names = iter(names) + metadata = iter(metadata) for i, infile in enumerate(files): - mdata, id = 
metadatas.next(), ids.next() - ext = os.path.splitext(infile)[1] - filepath = self.create_upload_path(path, mdata, ext, id) + mdata, fname = metadata.next(), names.next() + filepath = self.create_upload_path(path, mdata, fname) paths.append(filepath) self.put_file(infile, filepath, replace_file=True) coverdata = None - cover = mdata.cover + cover = mdata.get('cover', None) if cover: coverdata = cover[2] diff --git a/src/calibre/devices/iriver/driver.py b/src/calibre/devices/iriver/driver.py index f8e7d41600..7373996213 100644 --- a/src/calibre/devices/iriver/driver.py +++ b/src/calibre/devices/iriver/driver.py @@ -35,7 +35,7 @@ class IRIVER_STORY(USBMS): SUPPORTS_SUB_DIRS = True - def windows_sort_drives(self, drives): + def windows_open_callback(self, drives): main = drives.get('main', None) card = drives.get('carda', None) if card and main and card < main: diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index a55e76155f..6a09c7c345 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -15,7 +15,7 @@ from itertools import cycle from calibre.devices.usbms.driver import USBMS from calibre.utils.filenames import ascii_filename as sanitize -from calibre.ebooks.metadata import authors_to_string, string_to_authors +from calibre.ebooks.metadata import string_to_authors class JETBOOK(USBMS): name = 'Ectaco JetBook Device Interface' @@ -50,22 +50,23 @@ class JETBOOK(USBMS): r'(?P<authors>.+)#(?P<title>.+)' ) - def upload_books(self, files, metadatas, ids, on_card=None, - end_session=True): - path = self._sanity_check(on_card, files) + def upload_books(self, files, names, on_card=False, end_session=True, + metadata=None): + + base_path = self._sanity_check(on_card, files) paths = [] - metadatas = iter(metadatas) - ids = iter(ids) + names = iter(names) + metadata = iter(metadata) for i, infile in enumerate(files): - mdata, id = metadatas.next(), ids.next() - ext = 
os.path.splitext(infile)[1] - path = self.create_upload_path(path, mdata, ext, id) + mdata, fname = metadata.next(), names.next() + path = os.path.dirname(self.create_upload_path(base_path, mdata, fname)) - author = sanitize(authors_to_string(mdata.authors)).replace(' ', '_') - title = sanitize(mdata.title).replace(' ', '_') - fname = '%s#%s%s' % (author, title, ext) + author = sanitize(mdata.get('authors','Unknown')).replace(' ', '_') + title = sanitize(mdata.get('title', 'Unknown')).replace(' ', '_') + fileext = os.path.splitext(os.path.basename(fname))[1] + fname = '%s#%s%s' % (author, title, fileext) filepath = os.path.join(path, fname) paths.append(filepath) diff --git a/src/calibre/devices/prs500/books.py b/src/calibre/devices/prs500/books.py index 382dcf135d..5eb8d7f011 100644 --- a/src/calibre/devices/prs500/books.py +++ b/src/calibre/devices/prs500/books.py @@ -9,7 +9,6 @@ from base64 import b64decode as decode from base64 import b64encode as encode import re -from calibre.ebooks.metadata import authors_to_string from calibre.devices.interface import BookList as _BookList from calibre.devices import strftime, strptime @@ -263,9 +262,9 @@ class BookList(_BookList): cid = self.max_id()+1 sourceid = str(self[0].sourceid) if len(self) else "1" attrs = { - "title" : info.title, - 'titleSorter' : sortable_title(info.title), - "author" : authors_to_string(info.authors), \ + "title" : info["title"], + 'titleSorter' : sortable_title(info['title']), + "author" : info["authors"] if info['authors'] else 'Unknown', \ "page":"0", "part":"0", "scale":"0", \ "sourceid":sourceid, "id":str(cid), "date":"", \ "mime":mime, "path":name, "size":str(size) @@ -274,7 +273,7 @@ class BookList(_BookList): node.setAttributeNode(self.document.createAttribute(attr)) node.setAttribute(attr, attrs[attr]) try: - w, h, data = info.cover + w, h, data = info["cover"] except TypeError: w, h, data = None, None, None @@ -291,7 +290,10 @@ class BookList(_BookList): book.datetime = ctime 
self.append(book) self.set_next_id(cid+1) - self.set_playlists(book.id, info.tags) + if self.prefix and info.has_key('tags'): # Playlists only supportted in main memory + if info.has_key('tag order'): + self.tag_order.update(info['tag order']) + self.set_playlists(book.id, info['tags']) def playlist_by_title(self, title): diff --git a/src/calibre/devices/prs500/driver.py b/src/calibre/devices/prs500/driver.py index 616a1c387d..8d2c4cc9d4 100644 --- a/src/calibre/devices/prs500/driver.py +++ b/src/calibre/devices/prs500/driver.py @@ -867,14 +867,14 @@ class PRS500(DeviceConfig, DevicePlugin): self.upload_book_list(booklists[1], end_session=False) @safe - def upload_books(self, files, metadatas, ids, on_card=None, - end_session=True): + def upload_books(self, files, names, on_card=False, end_session=True, + metadata=None): card = self.card(end_session=False) prefix = card + '/' + self.CARD_PATH_PREFIX +'/' if on_card else '/Data/media/books/' if on_card and not self._exists(prefix)[0]: self.mkdir(prefix[:-1], False) paths, ctimes = [], [] - names = iter([m.title for m in metatdatas]) + names = iter(names) infiles = [file if hasattr(file, 'read') else open(file, 'rb') for file in files] for f in infiles: f.seek(0, 2) sizes = [f.tell() for f in infiles] diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py index 4b8a952816..6e268e734a 100644 --- a/src/calibre/devices/prs505/books.py +++ b/src/calibre/devices/prs505/books.py @@ -8,7 +8,7 @@ import xml.dom.minidom as dom from base64 import b64decode as decode from base64 import b64encode as encode -from calibre.ebooks.metadata import authors_to_string + from calibre.devices.interface import BookList as _BookList from calibre.devices import strftime as _strftime from calibre.devices import strptime @@ -194,9 +194,9 @@ class BookList(_BookList): except: sourceid = '1' attrs = { - "title" : info.title, - 'titleSorter' : sortable_title(info.title), - "author" : 
authors_to_string(info.authors), + "title" : info["title"], + 'titleSorter' : sortable_title(info['title']), + "author" : info["authors"] if info['authors'] else _('Unknown'), "page":"0", "part":"0", "scale":"0", \ "sourceid":sourceid, "id":str(cid), "date":"", \ "mime":mime, "path":name, "size":str(size) @@ -205,7 +205,7 @@ class BookList(_BookList): node.setAttributeNode(self.document.createAttribute(attr)) node.setAttribute(attr, attrs[attr]) try: - w, h, data = info.cover + w, h, data = info["cover"] except TypeError: w, h, data = None, None, None @@ -221,7 +221,10 @@ class BookList(_BookList): book = Book(node, self.mountpath, [], prefix=self.prefix) book.datetime = ctime self.append(book) - self.set_tags(book, info.tags) + if info.has_key('tags'): + if info.has_key('tag order'): + self.tag_order.update(info['tag order']) + self.set_tags(book, info['tags']) def _delete_book(self, node): nid = node.getAttribute('id') diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index 17de805756..ab61f76b61 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -114,22 +114,20 @@ class PRS505(CLI, Device): self.report_progress(1.0, _('Getting list of books on device...')) return bl - def upload_books(self, files, metadatas, ids, on_card=None, - end_session=True): + def upload_books(self, files, names, on_card=None, end_session=True, + metadata=None): path = self._sanity_check(on_card, files) - paths = [] - metadatas = iter(metadatas) - ids = iter(ids) - + paths, ctimes, sizes = [], [], [] + names = iter(names) + metadata = iter(metadata) for i, infile in enumerate(files): - mdata, id = metadatas.next(), ids.next() - ext = os.path.splitext(infile)[1] - filepath = self.create_upload_path(path, mdata, ext, id) - paths.append(filepath) + mdata, fname = metadata.next(), names.next() + filepath = self.create_upload_path(path, mdata, fname) - self.put_file(infile, filepath, replace_file=True) + 
paths.append(filepath) + self.put_file(infile, paths[-1], replace_file=True) ctimes.append(os.path.getctime(paths[-1])) sizes.append(os.stat(paths[-1]).st_size) diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 5effa0a8c6..33ba104e38 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -23,7 +23,7 @@ from calibre.devices.interface import DevicePlugin from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.usbms.deviceconfig import DeviceConfig from calibre import iswindows, islinux, isosx, __appname__ -from calibre.utils.filenames import shorten_components_to +from calibre.utils.filenames import ascii_filename as sanitize, shorten_components_to class Device(DeviceConfig, DevicePlugin): @@ -295,20 +295,20 @@ class Device(DeviceConfig, DevicePlugin): # This is typically needed when the device has the same # WINDOWS_MAIN_MEM and WINDOWS_CARD_A_MEM in which case - # if the devices is connected without a crad, the above + # if the devices is connected without a card, the above # will incorrectly identify the main mem as carda # See for example the driver for the Nook if 'main' not in drives and 'carda' in drives: drives['main'] = drives.pop('carda') drives = self.windows_open_callback(drives) - drives = self.windows_sort_drives(drives) if drives.get('main', None) is None: raise DeviceError( _('Unable to detect the %s disk drive. 
Try rebooting.') % self.__class__.__name__) + drives = self.windows_sort_drives(drives) self._main_prefix = drives.get('main') self._card_a_prefix = drives.get('carda', None) self._card_b_prefix = drives.get('cardb', None) @@ -739,18 +739,54 @@ class Device(DeviceConfig, DevicePlugin): raise FreeSpaceError(_("There is insufficient free space on the storage card")) return path - def create_upload_path(self, root, mdata, ext, id): - from calibre.library.save_to_disk import config, get_components - opts = config().parse() - components = get_components(opts.template, mdata, id, opts.timefmt, 250) - components = [str(x) for x in components] - components = shorten_components_to(250 - len(root), components) - filepath = '%s%s' % (os.path.join(root, *components), ext) + def create_upload_path(self, path, mdata, fname): + path = os.path.abspath(path) + newpath = path + extra_components = [] + + if self.SUPPORTS_SUB_DIRS and self.settings().use_subdirs: + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + extra_components.append('news') + c = sanitize(mdata.get('title', '')) + if c: + extra_components.append(c) + c = sanitize(mdata.get('timestamp', '')) + if c: + extra_components.append(c) + break + elif tag.startswith('/'): + for c in tag.split('/'): + c = sanitize(c) + if not c: continue + extra_components.append(c) + break + + if not extra_components: + c = sanitize(mdata.get('authors', _('Unknown'))) + if c: + extra_components.append(c) + c = sanitize(mdata.get('title', _('Unknown'))) + if c: + extra_components.append(c) + newpath = os.path.join(newpath, c) + + fname = sanitize(fname) + extra_components.append(fname) + extra_components = [str(x) for x in extra_components] + def remove_trailing_periods(x): + ans = x + while ans.endswith('.'): + ans = ans[:-1] + if not ans: + ans = 'x' + return ans + extra_components = list(map(remove_trailing_periods, extra_components)) + components = shorten_components_to(250 - len(path), 
extra_components) + filepath = os.path.join(path, *components) filedir = os.path.dirname(filepath) - if not self.SUPPORTS_SUB_DIRS or not self.settings().use_subdirs: - filedir = root - filepath = os.path.join(root, os.path.basename(filepath)) if not os.path.exists(filedir): os.makedirs(filedir) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index 1228781579..8d2416511c 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -95,19 +95,19 @@ class USBMS(CLI, Device): return bl - def upload_books(self, files, metadatas, ids, on_card=None, - end_session=True): + def upload_books(self, files, names, on_card=None, end_session=True, + metadata=None): path = self._sanity_check(on_card, files) paths = [] - metadatas = iter(metadatas) - ids = iter(ids) + names = iter(names) + metadata = iter(metadata) for i, infile in enumerate(files): - mdata, id = metadatas.next(), ids.next() - ext = os.path.splitext(infile)[1] - filepath = self.create_upload_path(path, mdata, ext, id) + mdata, fname = metadata.next(), names.next() + filepath = self.create_upload_path(path, mdata, fname) + paths.append(filepath) self.put_file(infile, filepath, replace_file=True) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index c33e279912..4471f285dc 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -223,17 +223,18 @@ class DeviceManager(Thread): return self.create_job(self._sync_booklists, done, args=[booklists], description=_('Send metadata to device')) - def _upload_books(self, files, metadata, ids, on_card=None): + def _upload_books(self, files, names, on_card=None, metadata=None): '''Upload books to device: ''' - return self.device.upload_books(files, metadata, ids, on_card, - end_session=False) + return self.device.upload_books(files, names, on_card, + metadata=metadata, end_session=False) - def upload_books(self, done, files, metadata, ids, on_card=None, titles=None): - 
desc = _('Upload %d books to device')%len(files) + def upload_books(self, done, files, names, on_card=None, titles=None, + metadata=None): + desc = _('Upload %d books to device')%len(names) if titles: desc += u':' + u', '.join(titles) - return self.create_job(self._upload_books, done, args=[files, metadata, ids], - kwargs={'on_card':on_card}, description=desc) + return self.create_job(self._upload_books, done, args=[files, names], + kwargs={'on_card':on_card,'metadata':metadata}, description=desc) def add_books_to_metadata(self, locations, metadata, booklists): self.device.add_books_to_metadata(locations, metadata, booklists) @@ -707,18 +708,18 @@ class DeviceGUI(object): dynamic.set('news_to_be_synced', set([])) return metadata = self.library_view.model().get_metadata(ids, - rows_are_ids=True, full_metadata=True)[1] + rows_are_ids=True) names = [] for mi in metadata: - prefix = ascii_filename(mi.title) + prefix = ascii_filename(mi['title']) if not isinstance(prefix, unicode): prefix = prefix.decode(preferred_encoding, 'replace') prefix = ascii_filename(prefix) names.append('%s_%d%s'%(prefix, id, os.path.splitext(f.name)[1])) - cdata = mi.cover + cdata = mi['cover'] if cdata: - mi.cover = self.cover_to_thumbnail(cdata) + mi['cover'] = self.cover_to_thumbnail(cdata) dynamic.set('news_to_be_synced', set([])) if config['upload_news_to_device'] and files: remove = ids if \ @@ -727,7 +728,8 @@ class DeviceGUI(object): self.location_view.model().free[1] : 'carda', self.location_view.model().free[2] : 'cardb' } on_card = space.get(sorted(space.keys(), reverse=True)[0], None) - self.upload_books(files, metadata, ids, on_card=on_card, + self.upload_books(files, names, metadata, + on_card=on_card, memory=[[f.name for f in files], remove]) self.status_bar.showMessage(_('Sending news to device.'), 5000) @@ -749,28 +751,38 @@ class DeviceGUI(object): else: _auto_ids = [] - metadata = self.library_view.model().get_metadata(ids, True, full_metadata=True)[1] + metadata = 
self.library_view.model().get_metadata(ids, True) ids = iter(ids) for mi in metadata: - cdata = mi.cover + cdata = mi['cover'] if cdata: mi['cover'] = self.cover_to_thumbnail(cdata) metadata = iter(metadata) files = [getattr(f, 'name', None) for f in _files] - bad, mdata, gf, fids, remove_ids = [], [], [], [], [] + bad, good, gf, names, remove_ids = [], [], [], [], [] for f in files: mi = metadata.next() id = ids.next() if f is None: - bad.append(mi.title) + bad.append(mi['title']) else: remove_ids.append(id) + good.append(mi) gf.append(f) - mdata.append(mi) - fids.append(id) + t = mi['title'] + if not t: + t = _('Unknown') + a = mi['authors'] + if not a: + a = _('Unknown') + prefix = ascii_filename(t+' - '+a) + if not isinstance(prefix, unicode): + prefix = prefix.decode(preferred_encoding, 'replace') + prefix = ascii_filename(prefix) + names.append('%s_%d%s'%(prefix, id, os.path.splitext(f)[1])) remove = remove_ids if delete_from_library else [] - self.upload_books(gf, mdata, fids, on_card, memory=(_files, remove)) + self.upload_books(gf, names, good, on_card, memory=(_files, remove)) self.status_bar.showMessage(_('Sending books to device.'), 5000) auto = [] @@ -833,15 +845,17 @@ class DeviceGUI(object): cp, fs = job.result self.location_view.model().update_devices(cp, fs) - def upload_books(self, files, metadata, ids, on_card=None, memory=None): + def upload_books(self, files, names, metadata, on_card=None, memory=None): ''' Upload books to device. 
:param files: List of either paths to files or file like objects ''' - titles = [i.title for i in metadata] + titles = [i['title'] for i in metadata] job = self.device_manager.upload_books( Dispatcher(self.books_uploaded), - files, metadata, ids, on_card=on_card, titles=titles) + files, names, on_card=on_card, + metadata=metadata, titles=titles + ) self.upload_memory[job] = (metadata, on_card, memory, files) def books_uploaded(self, job): @@ -854,7 +868,7 @@ class DeviceGUI(object): if isinstance(job.exception, FreeSpaceError): where = 'in main memory.' if 'memory' in str(job.exception) \ else 'on the storage card.' - titles = '\n'.join(['<li>'+mi.title+'</li>' \ + titles = '\n'.join(['<li>'+mi['title']+'</li>' \ for mi in metadata]) d = error_dialog(self, _('No space on device'), _('<p>Cannot upload books to device there ' diff --git a/src/calibre/gui2/dialogs/config/add_save.ui b/src/calibre/gui2/dialogs/config/add_save.ui index ef1a867cd2..513be73e54 100644 --- a/src/calibre/gui2/dialogs/config/add_save.ui +++ b/src/calibre/gui2/dialogs/config/add_save.ui @@ -70,7 +70,7 @@ <item row="0" column="0" colspan="2"> <widget class="QLabel" name="label"> <property name="text"> - <string>Here you can control how calibre will save your books when you click the Save to Disk or Send to Device buttons:</string> + <string>Here you can control how calibre will save your books when you click the Save to Disk button:</string> </property> <property name="wordWrap"> <bool>true</bool> diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 72e30eedbb..ccdb8d6cdd 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -497,6 +497,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec Convert PDF documents +~~~~~~~~~~~~~~~~~~~~~~~~~~~ PDF documents are one of the worst formats to convert from. They are a fixed page size and text placement format. 
Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap From 2debc774d935956b06a164466fee527511e7516e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 19 Dec 2009 19:42:42 -0500 Subject: [PATCH 117/120] Refactor sending cover to device. Nook: Send cover with book to device. --- src/calibre/devices/cybookg3/driver.py | 31 ++------------------------ src/calibre/devices/nook/driver.py | 5 +++++ src/calibre/devices/usbms/driver.py | 13 +++++++++++ 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 04e5e7012c..e0caff36f8 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -47,36 +47,9 @@ class CYBOOKG3(USBMS): DELETE_EXTS = ['.mbp', '.dat', '_6090.t2b'] SUPPORTS_SUB_DIRS = True - def upload_books(self, files, names, on_card=None, end_session=True, - metadata=None): - - path = self._sanity_check(on_card, files) - - paths = [] - names = iter(names) - metadata = iter(metadata) - - for i, infile in enumerate(files): - mdata, fname = metadata.next(), names.next() - filepath = self.create_upload_path(path, mdata, fname) - paths.append(filepath) - - self.put_file(infile, filepath, replace_file=True) - - coverdata = None - cover = mdata.get('cover', None) - if cover: - coverdata = cover[2] - - t2bfile = open('%s_6090.t2b' % (os.path.splitext(filepath)[0]), 'wb') + def upload_cover(self, path, name, coverdata): + with open('%s_6090.t2b' % os.path.join(path, name), 'wb') as t2bfile: t2b.write_t2b(t2bfile, coverdata) - t2bfile.close() - - self.report_progress(i / float(len(files)), _('Transferring books to device...')) - - self.report_progress(1.0, _('Transferring books to device...')) - - return zip(paths, cycle([on_card])) @classmethod def can_handle(cls, device_info, debug=False): diff --git a/src/calibre/devices/nook/driver.py 
b/src/calibre/devices/nook/driver.py index 001cc06b8e..c3f3267401 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -38,6 +38,11 @@ class NOOK(USBMS): EBOOK_DIR_MAIN = 'my documents' SUPPORTS_SUB_DIRS = True + def upload_cover(self, path, name, coverdata): + if coverdata: + with open('%s.jpg' % os.path.join(path, name), 'wb') as coverfile: + coverfile.write(coverdata) + def windows_sort_drives(self, drives): main = drives.get('main', None) card = drives.get('carda', None) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index 8d2416511c..ee746de9cc 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -112,12 +112,25 @@ class USBMS(CLI, Device): self.put_file(infile, filepath, replace_file=True) + coverdata = mdata.get('cover', None) + if coverdata: + coverdata = coverdata[2] + self.upload_cover(os.path.dirname(filepath), os.path.splitext(os.path.basename(filepath))[0], coverdata) + self.report_progress((i+1) / float(len(files)), _('Transferring books to device...')) self.report_progress(1.0, _('Transferring books to device...')) return zip(paths, cycle([on_card])) + def upload_cover(self, path, name, coverdata): + ''' + :path: the full path were the associated book is located. + :name: the name of the book file without the extension. + :coverdata: cover data in jpeg format. + ''' + pass + def add_books_to_metadata(self, locations, metadata, booklists): for i, location in enumerate(locations): self.report_progress((i+1) / float(len(locations)), _('Adding books to device metadata listing...')) From 32b02e627beae6e236ca7df0eb8e934c001895e7 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 19 Dec 2009 20:32:40 -0500 Subject: [PATCH 118/120] Fix bug #4252: Nook driver writes cover image and uses default image when no cover is associated with book. 
--- src/calibre/devices/cybookg3/driver.py | 2 +- src/calibre/devices/nook/driver.py | 30 ++++++++++++++++++++++---- src/calibre/devices/usbms/driver.py | 2 +- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index e0caff36f8..cd9545f231 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -47,7 +47,7 @@ class CYBOOKG3(USBMS): DELETE_EXTS = ['.mbp', '.dat', '_6090.t2b'] SUPPORTS_SUB_DIRS = True - def upload_cover(self, path, name, coverdata): + def upload_cover(self, path, name, coverdata, metadata): with open('%s_6090.t2b' % os.path.join(path, name), 'wb') as t2bfile: t2b.write_t2b(t2bfile, coverdata) diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index c3f3267401..4cf65c866e 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -8,6 +8,14 @@ __docformat__ = 'restructuredtext en' Device driver for Barns and Nobel's Nook ''' +try: + from PIL import Image, ImageDraw + Image +except ImportError: + import Image + +import cStringIO + from calibre.devices.usbms.driver import USBMS class NOOK(USBMS): @@ -38,10 +46,24 @@ class NOOK(USBMS): EBOOK_DIR_MAIN = 'my documents' SUPPORTS_SUB_DIRS = True - def upload_cover(self, path, name, coverdata): - if coverdata: - with open('%s.jpg' % os.path.join(path, name), 'wb') as coverfile: - coverfile.write(coverdata) + def upload_cover(self, path, name, coverdata, metadata): + if not coverdata: + coverdata = open(I('library.png'), 'rb').read() + + im = Image.open(cStringIO.StringIO(coverdata)) + im.thumbnail((96, 144), Image.ANTIALIAS) + + if not coverdata: + draw = ImageDraw.Draw(im) + draw.text((0, 29), metadata.title) + draw.text((0, 115), ', '.join(metadata.authors)) + + data = cStringIO.StringIO() + im.save(data, 'JPG') + coverdata = data.getvalue() + + with open('%s.jpg' % os.path.join(path, name), 'wb') as 
coverfile: + coverfile.write(coverdata) def windows_sort_drives(self, drives): main = drives.get('main', None) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index ee746de9cc..e37ea62525 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -115,7 +115,7 @@ class USBMS(CLI, Device): coverdata = mdata.get('cover', None) if coverdata: coverdata = coverdata[2] - self.upload_cover(os.path.dirname(filepath), os.path.splitext(os.path.basename(filepath))[0], coverdata) + self.upload_cover(os.path.dirname(filepath), os.path.splitext(os.path.basename(filepath))[0], coverdata, mdata) self.report_progress((i+1) / float(len(files)), _('Transferring books to device...')) From 80ce4c5b4d7b95466fe872cca95baaee428fa47b Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 19 Dec 2009 20:44:58 -0500 Subject: [PATCH 119/120] Nook input/output profile. Make name for Hanlin profile clearer. --- src/calibre/customize/profiles.py | 37 ++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 456cc21d3c..752a908c1c 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -88,9 +88,9 @@ class MobipocketInput(InputProfile): class HanlinV3Input(InputProfile): - name = 'Hanlin V3' + name = 'Hanlin V3/V5' short_name = 'hanlinv3' - description = _('This profile is intended for the Hanlin V3 and its clones.') + description = _('This profile is intended for the Hanlin V3/V5 and its clones.') # Screen size is a best guess screen_size = (584, 754) @@ -159,9 +159,23 @@ class IRexDR1000Input(InputProfile): fbase = 16 fsizes = [12, 14, 16, 18, 20, 22, 24] + +class NookInput(InputProfile): + + author = 'John Schember' + name = 'Nook' + short_name = 'nook' + description = _('This profile is intended for the B&N Nook.') + + # Screen size is a best guess + 
screen_size = (600, 800) + dpi = 167 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + input_profiles = [InputProfile, SonyReaderInput, MSReaderInput, MobipocketInput, HanlinV3Input, CybookG3Input, CybookOpusInput, KindleInput, - IlliadInput, IRexDR1000Input] + IlliadInput, IRexDR1000Input, NookInput] class OutputProfile(Plugin): @@ -248,7 +262,7 @@ class MobipocketOutput(OutputProfile): class HanlinV3Output(OutputProfile): - name = 'Hanlin V3' + name = 'Hanlin V3/V5' short_name = 'hanlinv3' description = _('This profile is intended for the Hanlin V3/V5 and its clones.') @@ -341,7 +355,20 @@ class IRexDR1000Output(OutputProfile): fbase = 16 fsizes = [12, 14, 16, 18, 20, 22, 24] +class NookOutput(OutputProfile): + + author = 'John Schember' + name = 'Nook' + short_name = 'nook' + description = _('This profile is intended for the B&N Nook.') + + # Screen size is a best guess + screen_size = (600, 800) + dpi = 167 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + output_profiles = [OutputProfile, SonyReaderOutput, MSReaderOutput, MobipocketOutput, HanlinV3Output, CybookG3Output, CybookOpusOutput, KindleOutput, SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput, - IRexDR1000Output, JetBook5Output] + IRexDR1000Output, JetBook5Output, NookOutput] From 170e57967390a70b234c9da7ae7e0d397a6cb725 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 20 Dec 2009 08:52:40 -0500 Subject: [PATCH 120/120] Working cover upload for the Nook. 
--- src/calibre/devices/cybookg3/driver.py | 7 ++++-- src/calibre/devices/nook/driver.py | 30 +++++++++++++++----------- src/calibre/devices/usbms/driver.py | 12 ++++------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index cd9545f231..00cf99a8c4 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -47,8 +47,11 @@ class CYBOOKG3(USBMS): DELETE_EXTS = ['.mbp', '.dat', '_6090.t2b'] SUPPORTS_SUB_DIRS = True - def upload_cover(self, path, name, coverdata, metadata): - with open('%s_6090.t2b' % os.path.join(path, name), 'wb') as t2bfile: + def upload_cover(self, path, filename, metadata): + coverdata = metadata.get('cover', None) + if coverdata: + coverdata = coverdata[2] + with open('%s_6090.t2b' % os.path.join(path, filename), 'wb') as t2bfile: t2b.write_t2b(t2bfile, coverdata) @classmethod diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index 4cf65c866e..cc3f26d730 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -10,9 +10,8 @@ Device driver for Barns and Nobel's Nook try: from PIL import Image, ImageDraw - Image except ImportError: - import Image + import Image, ImageDraw import cStringIO @@ -46,23 +45,30 @@ class NOOK(USBMS): EBOOK_DIR_MAIN = 'my documents' SUPPORTS_SUB_DIRS = True - def upload_cover(self, path, name, coverdata, metadata): - if not coverdata: + def upload_cover(self, path, filename, metadata): + coverdata = metadata.get('cover', None) + if coverdata: + cover = Image.open(cStringIO.StringIO(coverdata[2])) + cover.thumbnail((96, 144), Image.ANTIALIAS) + else: coverdata = open(I('library.png'), 'rb').read() - im = Image.open(cStringIO.StringIO(coverdata)) - im.thumbnail((96, 144), Image.ANTIALIAS) + cover = Image.new('RGB', (96, 144), 'black') + im = Image.open(cStringIO.StringIO(coverdata)) + im.thumbnail((96, 144), 
Image.ANTIALIAS) - if not coverdata: - draw = ImageDraw.Draw(im) - draw.text((0, 29), metadata.title) - draw.text((0, 115), ', '.join(metadata.authors)) + x, y = im.size + cover.paste(im, ((96-x)/2, (144-y)/2)) + + draw = ImageDraw.Draw(cover) + draw.text((1, 15), metadata.title) + draw.text((1, 115), ', '.join(metadata.authors)) data = cStringIO.StringIO() - im.save(data, 'JPG') + cover.save(data, 'JPEG') coverdata = data.getvalue() - with open('%s.jpg' % os.path.join(path, name), 'wb') as coverfile: + with open('%s.jpg' % os.path.join(path, filename), 'wb') as coverfile: coverfile.write(coverdata) def windows_sort_drives(self, drives): diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index e37ea62525..f637ee2fa8 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -111,11 +111,7 @@ class USBMS(CLI, Device): paths.append(filepath) self.put_file(infile, filepath, replace_file=True) - - coverdata = mdata.get('cover', None) - if coverdata: - coverdata = coverdata[2] - self.upload_cover(os.path.dirname(filepath), os.path.splitext(os.path.basename(filepath))[0], coverdata, mdata) + self.upload_cover(os.path.dirname(filepath), os.path.splitext(os.path.basename(filepath))[0], mdata) self.report_progress((i+1) / float(len(files)), _('Transferring books to device...')) @@ -123,11 +119,11 @@ class USBMS(CLI, Device): return zip(paths, cycle([on_card])) - def upload_cover(self, path, name, coverdata): + def upload_cover(self, path, filename, metadata): ''' :path: the full path were the associated book is located. - :name: the name of the book file without the extension. - :coverdata: cover data in jpeg format. + :filename: the name of the book file without the extension. + :metadata: metadata belonging to the book. metadata.cover[2] for coverdata. ''' pass