From 0c858e43bcd7a0774c5555e6a8df8496df30c894 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 25 Apr 2009 13:55:45 -0400 Subject: [PATCH] ereader writer mostly working. --- src/calibre/ebooks/pdb/ereader/__init__.py | 16 ++++ src/calibre/ebooks/pdb/ereader/output.py | 26 ++++-- .../ebooks/pdb/ereader/pmlconverter.py | 63 ++++++++++---- src/calibre/ebooks/pdb/ereader/reader.py | 2 +- src/calibre/ebooks/pdb/ereader/writer.py | 83 +++++++++++++++++-- src/calibre/ebooks/pdb/header.py | 26 ++++-- src/calibre/ebooks/txt/output.py | 1 + 7 files changed, 182 insertions(+), 35 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index 89d9dfdd35..b39467c6e3 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -5,5 +5,21 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +import os + class EreaderError(Exception): pass + +def image_name(name): + name = os.path.basename(name) + + if len(name) > 32: + cut = len(name) - 32 + names = name[:10] + namee = name[10+cut:] + name = names + namee + + name = name.ljust(32, '\x00')[:32] + + return name + diff --git a/src/calibre/ebooks/pdb/ereader/output.py b/src/calibre/ebooks/pdb/ereader/output.py index 034508b0da..4b188ae2f1 100644 --- a/src/calibre/ebooks/pdb/ereader/output.py +++ b/src/calibre/ebooks/pdb/ereader/output.py @@ -5,9 +5,8 @@ __docformat__ = 'restructuredtext en' import os -from calibre.customize.conversion import OutputFormatPlugin, \ - OptionRecommendation -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ebooks.pdb.ereader.writer import Writer from calibre.ebooks.metadata import authors_to_string class EREADEROutput(OutputFormatPlugin): @@ -17,7 +16,22 @@ class EREADEROutput(OutputFormatPlugin): file_type = 'erpdb' def convert(self, oeb_book, 
output_path, input_plugin, opts, log): - from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml + writer = Writer(log) -# print html_to_pml('

“A hundred kisses from the Princess,” said he, “or else let everyone keep his own!”

') - print html_to_pml(str(oeb_book.spine[3])) + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + + writer.dump(oeb_book, out_stream) + + if close: + out_stream.close() + diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 347bde951c..88c841b81f 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en' import re +from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup @@ -61,35 +62,69 @@ PML_HTML_RULES = [ ] HTML_PML_RULES = [ + (re.compile(r'\\'), lambda match: '\\\\'), (re.compile('(?<=[^\n])[ ]*'), lambda match: '\n

'), - (re.compile('

(^\n|\r\n)'), lambda match: '\n'), + (re.compile('

(?=^\n|^\r\n)'), lambda match: '\n'), + + + # Clean up HTML + (re.compile('@page.*?}'), lambda match: ''), + (re.compile('.*?', re.DOTALL), lambda match: ''), + (re.compile('.*?', re.DOTALL), lambda match: ''), + + # Reflow paragraphs + (re.compile('(?P.*?)

', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')), + + # HTML to PML (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))), (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), - (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + #(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))), #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + 
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), - (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile(''), lambda match: '\\p'), + (re.compile('
'), lambda match: '\\p'), + (re.compile('
'), lambda match: '\\p'), + + # Remove remaining HTML tags (re.compile('<.*?>'), lambda match: ''), + + # Remove redundant page break markers (re.compile(r'(\\p){2,}'), lambda match: r'\p'), + + # Remove whitespace on empty lines + (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), + + # Remove excess newlines at the beginning and end + (re.compile('^(\r\n){1,}'), lambda match: ''), + (re.compile('^\n{1,}'), lambda match: ''), + (re.compile('(\r\n){3,}$'), lambda match: ''), + (re.compile('\n{3,}$'), lambda match: ''), ] def pml_to_html(pml): @@ -111,13 +146,13 @@ def html_to_pml(html): pml = '' for dom_tree in BeautifulSoup(html).findAll('body'): - body = unicode(dom_tree.pretty_print()) + body = unicode(dom_tree.prettify()) for rule in HTML_PML_RULES: - body = rule[0].sub(rule[1], pml) + body = rule[0].sub(rule[1], body) pml += body - + # Replace symbols outside of cp1512 with \Uxxxx return pml diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index e0e42e40fd..c6f520ecb2 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -40,7 +40,7 @@ class HeaderRecord(object): self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.last_data_offset, = struct.unpack('>H', raw[52:54]) - self.num_text_pages = self.non_text_offset -1 + self.num_text_pages = self.non_text_offset - 1 self.num_image_pages = self.metadata_offset - self.image_data_offset diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c9493d2915..1605e15f32 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -4,17 +4,90 @@ from __future__ import with_statement Write content to ereader pdb file. 
''' +import struct, zlib + +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml +IDENTITY = 'PNPdPPrs' + class Writer(object): def __init__(self, log): - self.oeb_book = oeb_book + self.log = log - def dump(oeb_book): + def dump(self, oeb_book, out_stream, metadata=None): + text = self._text(oeb_book.spine) + images = self._images(oeb_book.manifest) + metadata = [self._metadata(metadata)] + + hr = [self._header_record(len(text), len(images))] + + sections = hr+text+images+metadata + + lengths = [len(i) for i in sections] + + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '') + pdbHeaderBuilder.build_header(lengths, out_stream) + + for item in sections: + out_stream.write(item) + + def _text(self, pages): pml_pages = [] - for page in oeb_book.spine: - pml_pages.append(html_to_pml(page)) + for page in pages: + pml_pages.append(zlib.compress(html_to_pml(unicode(page)))) + + return pml_pages - \ No newline at end of file + def _images(self, manifest): + images = [] + + for item in manifest: + if item.media_type in OEB_IMAGES: + image = '\x00\x00\x00\x00' + + image += image_name(item.href) + image = image.ljust(62, '\x00') + image += item.data + + images.append(image) + + return images + + def _metadata(self, metadata): + return '' + + def _header_record(self, text_items, image_items): + ''' + text_items = the number of text pages + image_items = the number of images + ''' + version = 10 + non_text_offset = text_items + + if image_items > 0: + image_data_offset = text_items + 1 + meta_data_offset = image_data_offset + image_items + else: + meta_data_offset = text_items + 1 + image_data_offset = meta_data_offset + + record = u'' + + # Version + record += struct.pack('>H', version) + record = record.ljust(12, '\x00') + record += struct.pack('>H', non_text_offset) + record = record.ljust(40, '\x00') + record += 
struct.pack('>H', image_data_offset) + record = record.ljust(44, '\x00') + record += struct.pack('>H', meta_data_offset) + record = record.ljust(52, '\x00') + record += struct.pack('>H', meta_data_offset) + + return record + diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index d270c0ef71..8a9b7b105c 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct +import os, re, struct, time class PdbHeaderReader(object): @@ -60,18 +60,26 @@ class PdbHeaderReader(object): return self.stream.read(end - start) -class PdbHeaderWriter(object): +class PdbHeaderBuilder(object): def __init__(self, identity, title): self.identity = identity.ljust(3, '\x00')[:8] - self.title = title.ljust(32, '\x00')[:32] + self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32] - def build_header(self, offsets): + def build_header(self, section_lengths, out_stream): ''' - Offsets is a list of section offsets + section_lengths = Length of each section in file. ''' + + now = int(time.time()) + nrecords = len(section_lengths) + + out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)) + out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords)) + + offset = 78 + (8 * nrecords) + 2 + for id, record in enumerate(section_lengths): + out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0)) + offset += record + out_stream.write('\x00\x00') - - - - return header diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index dd87394507..62c07c3d04 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -55,3 +55,4 @@ class TXTOutput(OutputFormatPlugin): if close: out_stream.close() +