From dd69e247476fa000e7f5b2d4edb60034d862dbdd Mon Sep 17 00:00:00 2001 From: Li Fanxi Date: Sat, 9 Oct 2010 22:30:38 +0800 Subject: [PATCH] [SNBOutput] Add basic output support for SNB file. --- src/calibre/ebooks/snb/output.py | 193 ++++++++++++++++--- src/calibre/ebooks/snb/snbfile.py | 300 ++++++++++++++++++++++++++++++ src/calibre/ebooks/snb/snbml.py | 160 ++++++++++++++++ 3 files changed, 629 insertions(+), 24 deletions(-) create mode 100644 src/calibre/ebooks/snb/snbfile.py create mode 100644 src/calibre/ebooks/snb/snbml.py diff --git a/src/calibre/ebooks/snb/output.py b/src/calibre/ebooks/snb/output.py index 4b94b65405..c302c17729 100644 --- a/src/calibre/ebooks/snb/output.py +++ b/src/calibre/ebooks/snb/output.py @@ -4,10 +4,29 @@ __license__ = 'GPL 3' __copyright__ = '2010, Li Fanxi ' __docformat__ = 'restructuredtext en' -import os +import os, string -from calibre.customize.conversion import OutputFormatPlugin, \ - OptionRecommendation +from lxml import etree +from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation +from calibre.ptempfile import TemporaryDirectory +from calibre.constants import __appname__, __version__ +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.snb.snbfile import SNBFile +from calibre.ebooks.snb.snbml import SNBMLizer + +def ProcessFileName(fileName): + # Flat the path + fileName = fileName.replace("/", "_").replace(os.sep, "_") + # Handle bookmark for HTML file + fileName = fileName.replace("#", "_") + # Make it lower case + fileName = fileName.lower() + # Change extension from jpeg to jpg + root, ext = os.path.splitext(fileName) + if ext in [ '.jpeg', '.jpg', '.gif', '.svg' ]: + fileName = root + '.png' + return fileName + class SNBOutput(OutputFormatPlugin): @@ -45,26 +64,152 @@ class SNBOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): + # Create temp dir + with TemporaryDirectory('_snb_output') as tdir: + # Create stub directories + snbfDir = os.path.join(tdir, 'snbf') + snbcDir = os.path.join(tdir, 'snbc') + snbiDir = os.path.join(tdir, 'snbc/images') + os.mkdir(snbfDir) + os.mkdir(snbcDir) + os.mkdir(snbiDir) + + # Process Meta data + meta = oeb_book.metadata + if meta.title: + title = unicode(meta.title[0]) + else: + title = '' + authors = [unicode(x) for x in meta.creator if x.role == 'aut'] + if meta.publisher: + publishers = unicode(meta.publisher[0]) + else: + publishers = '' + if meta.language: + lang = unicode(meta.language[0]).upper() + else: + lang = '' + if meta.description: + abstract = unicode(meta.description[0]) + else: + abstract = '' + + # Process Cover + from calibre.ebooks.oeb.base import urldefrag + g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine + href = None + if 'titlepage' not in g: + if 'cover' in g: + href = g['cover'].href + + # Output book info file + bookInfoTree = etree.Element("book-snbf", version="1.0") + headTree = etree.SubElement(bookInfoTree, "head") + etree.SubElement(headTree, "name").text = title + etree.SubElement(headTree, "author").text = ' '.join(authors) + etree.SubElement(headTree, "language").text = lang + etree.SubElement(headTree, "rights") + etree.SubElement(headTree, "publisher").text = publishers + etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__ + etree.SubElement(headTree, "created") + etree.SubElement(headTree, "abstract").text = abstract + if href != None: + etree.SubElement(headTree, "cover").text = ProcessFileName(href) + else: + etree.SubElement(headTree, "cover") + bookInfoFile = open(os.path.join(snbfDir, 'book.snbf'), 'wb') + bookInfoFile.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8')) + bookInfoFile.close() + + # Output TOC + tocInfoTree = etree.Element("toc-snbf") + tocHead = etree.SubElement(tocInfoTree, "head") + tocBody = etree.SubElement(tocInfoTree, "body") + outputFiles = { } + if oeb_book.toc.count() == 0: + log.warn('This SNB file has no Table of Contents. ' + 'Creating a default TOC') + first = iter(oeb_book.spine).next() + oeb_book.toc.add(_('Start'), first.href) + + for tocitem in oeb_book.toc: + ch = etree.SubElement(tocBody, "chapter") + ch.set("src", ProcessFileName(tocitem.href) + ".snbc") + ch.text = tocitem.title + if tocitem.href.find('#') != -1: + item = string.split(tocitem.href, '#') + if len(item) != 2: + log.error('Error in TOC item: %s' % tocitem) + else: + if item[0] in outputFiles: + outputFiles[item[0]].append((item[1], tocitem.title)) + else: + outputFiles[item[0]] = [] + outputFiles[item[0]].append((item[1], tocitem.title)) + else: + if tocitem.href in outputFiles: + outputFiles[tocitem.href].append(("", tocitem)) + else: + outputFiles[tocitem.href] = [] + outputFiles[tocitem.href].append(("", tocitem)) + + etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody) + + tocInfoFile = open(os.path.join(snbfDir, 'toc.snbf'), 'wb') + tocInfoFile.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8')) + tocInfoFile.close() + + # Output Files + for item in s: + from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES, PNG_MIME + if m.hrefs[item.href].media_type in OEB_DOCS: + if not item.href in outputFiles: + log.debug('Skipping %s because unused in TOC.' % item.href) + continue + log.debug('Converting %s to snbc...' % item.href) + snbwriter = SNBMLizer(log) + snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts) + for subName in snbcTrees: + postfix = '' + if subName != '': + postfix = '_' + subName + outputFile = open(os.path.join(snbcDir, ProcessFileName(item.href + postfix + ".snbc")), 'wb') + outputFile.write(etree.tostring(snbcTrees[subName], pretty_print=True, encoding='utf-8')) + outputFile.close() + for item in m: + if m.hrefs[item.href].media_type in OEB_IMAGES: + log.debug('Converting image: %s ...' % item.href) + content = m.hrefs[item.href].data + if m.hrefs[item.href].media_type != PNG_MIME: + # Convert + from calibre.utils.magick import Image + img = Image() + img.load(content) + img.save(os.path.join(snbiDir, ProcessFileName(item.href))) + else: + outputFile = open(os.path.join(snbiDir, ProcessFileName(item.href)), 'wb') + outputFile.write(content) + outputFile.close() + + # Package as SNB File + snbFile = SNBFile() + snbFile.FromDir(tdir) + snbFile.Output(output_path) + +if __name__ == '__main__': + from calibre.ebooks.oeb.reader import OEBReader + from calibre.ebooks.oeb.base import OEBBook + from calibre.ebooks.conversion.preprocess import HTMLPreProcessor + from calibre.customize.profiles import HanlinV3Output + class OptionValues(object): pass - # writer = TXTMLizer(log) - # txt = writer.extract_content(oeb_book, opts) - - # log.debug('\tReplacing newlines with selected type...') - # txt = specified_newlines(TxtNewlines(opts.newline).newline, txt) - - # close = False - # if not hasattr(output_path, 'write'): - # close = True - # if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': - # os.makedirs(os.path.dirname(output_path)) - # out_stream = open(output_path, 'wb') - # else: - # out_stream = output_path - - # out_stream.seek(0) - # out_stream.truncate() - # out_stream.write(txt.encode(opts.output_encoding, 'replace')) - - # if close: - # out_stream.close() + opts = OptionValues() + opts.output_profile = HanlinV3Output(None) + + html_preprocessor = HTMLPreProcessor(None, None, opts) + from calibre.utils.logging import default_log + oeb = OEBBook(default_log, html_preprocessor) + reader = OEBReader + reader()(oeb, '/tmp/bbb/processed/') + SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log); diff --git a/src/calibre/ebooks/snb/snbfile.py b/src/calibre/ebooks/snb/snbfile.py new file mode 100644 index 0000000000..aa690fb92b --- /dev/null +++ b/src/calibre/ebooks/snb/snbfile.py @@ -0,0 +1,300 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2010, Li Fanxi ' +__docformat__ = 'restructuredtext en' + +import sys, struct, zlib, bz2, os, math + +class FileStream: + def IsBinary(self): + return self.attr & 0x41000000 != 0x41000000 + +def compareFileStream(file1, file2): + return cmp(file1.fileName, file2.fileName) + +class BlockData: + pass + +class SNBFile: + + files = [] + blocks = [] + + MAGIC = 'SNBP000B' + REV80 = 0x00008000 + REVA3 = 0x00A3A3A3 + REVZ1 = 0x00000000 + REVZ2 = 0x00000000 + + def __init__(self, inputFile = None): + if inputFile != None: + self.Parse(inputFile); + + def Parse(self, inputFile): + self.fileName = inputFile + + snbFile = open(self.fileName, "rb") + snbFile.seek(0) + + # Read header + vmbr = snbFile.read(44) + (self.magic, self.rev80, self.revA3, self.revZ1, + self.fileCount, self.vfatSize, self.vfatCompressed, + self.binStreamSize, self.plainStreamSizeUncompressed, + self.revZ2) = struct.unpack('>8siiiiiiiii', vmbr) + + # Read FAT + self.vfat = zlib.decompress(snbFile.read(self.vfatCompressed)) + self.ParseFile(self.vfat, self.fileCount) + + # Read tail + snbFile.seek(-16, os.SEEK_END) + #plainStreamEnd = snbFile.tell() + tailblock = snbFile.read(16) + (self.tailSize, self.tailOffset, self.tailMagic) = struct.unpack('>ii8s', tailblock) + snbFile.seek(self.tailOffset) + self.vTailUncompressed = zlib.decompress(snbFile.read(self.tailSize)) + self.tailSizeUncompressed = len(self.vTailUncompressed) + self.ParseTail(self.vTailUncompressed, self.fileCount) + + # Uncompress file data + # Read files + binPos = 0 + plainPos = 0 + uncompressedData = None + for f in self.files: + if f.attr & 0x41000000 == 0x41000000: + # Compressed Files + if uncompressedData == None: + uncompressedData = "" + for i in range(self.plainBlock): + bzdc = bz2.BZ2Decompressor() + if (i < self.plainBlock - 1): + bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset; + else: + bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset; + snbFile.seek(self.blocks[self.binBlock + i].Offset); + try: + data = snbFile.read(bSize) + uncompressedData += bzdc.decompress(data) + except EOFError, e: + print e + f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize] + plainPos += f.fileSize + elif f.attr & 0x01000000 == 0x01000000: + # Binary Files + snbFile.seek(44 + self.vfatCompressed + binPos) + f.fileBody = snbFile.read(f.fileSize) + binPos += f.fileSize + else: + print f.attr, f.fileName + raise Exception("Invalid file") + snbFile.close() + + def ParseFile(self, vfat, fileCount): + fileNames = vfat[fileCount*12:].split('\0'); + for i in range(fileCount): + f = FileStream() + (f.attr, f.fileNameOffset, f.fileSize) = struct.unpack('>iii', vfat[i * 12 : (i+1)*12]) + f.fileName = fileNames[i] + self.files.append(f) + + def ParseTail(self, vtail, fileCount): + self.binBlock = (self.binStreamSize + 0x8000 - 1) / 0x8000; + self.plainBlock = (self.plainStreamSizeUncompressed + 0x8000 - 1) / 0x8000; + for i in range(self.binBlock + self.plainBlock): + block = BlockData() + (block.Offset,) = struct.unpack('>i', vtail[i * 4 : (i+1) * 4]) + self.blocks.append(block) + for i in range(fileCount): + (self.files[i].blockIndex, self.files[i].contentOffset) = struct.unpack('>ii', vtail[(self.binBlock + self.plainBlock) * 4 + i * 8 : (self.binBlock + self.plainBlock) * 4 + (i+1) * 8]) + + def IsValid(self): + if self.magic != SNBFile.MAGIC: + return False + if self.rev80 != SNBFile.REV80: + return False + if self.revA3 != SNBFile.REVA3: + return False + if self.revZ1 != SNBFile.REVZ1: + return False + if self.revZ2 != SNBFile.REVZ2: + return False + if self.vfatSize != len(self.vfat): + return False + if self.fileCount != len(self.files): + return False + if (self.binBlock + self.plainBlock) * 4 + self.fileCount * 8 != self.tailSizeUncompressed: + return False + if self.tailMagic != SNBFile.MAGIC: + print self.tailMagic + return False + return True + + def FromDir(self, tdir): + for root, dirs, files in os.walk(tdir): + for name in files: + print name + p, ext = os.path.splitext(name) + if ext in [ ".snbf", ".snbc" ]: + self.AppendPlain(os.path.relpath(os.path.join(root, name), tdir), tdir) + else: + self.AppendBinary(os.path.relpath(os.path.join(root, name), tdir), tdir) + + def AppendPlain(self, fileName, tdir): + f = FileStream() + f.attr = 0x41000000 + f.fileSize = os.path.getsize(os.path.join(tdir,fileName)) + f.fileBody = open(os.path.join(tdir,fileName), 'rb').read() + f.fileName = fileName + print f.fileSize + self.files.append(f) + + def AppendBinary(self, fileName, tdir): + f = FileStream() + f.attr = 0x01000000 + f.fileSize = os.path.getsize(os.path.join(tdir,fileName)) + f.fileBody = open(os.path.join(tdir,fileName), 'rb').read() + f.fileName = fileName + print f.fileSize + self.files.append(f) + + def Output(self, outputFile): + + # Sort the files in file buffer, + # requried by the SNB file format + self.files.sort(compareFileStream) + + outputFile = open(outputFile, 'wb') + # File header part 1 + vmbrp1 = struct.pack('>8siiii', SNBFile.MAGIC, SNBFile.REV80, SNBFile.REVA3, SNBFile.REVZ1, len(self.files)) + + # Create VFAT & file stream + vfat = '' + fileNameTable = '' + plainStream = '' + binStream = '' + for f in self.files: + vfat += struct.pack('>iii', f.attr, len(fileNameTable), f.fileSize); + fileNameTable += (f.fileName + '\0') + + if f.attr & 0x41000000 == 0x41000000: + # Plain Files + f.contentOffset = len(plainStream) + plainStream += f.fileBody + elif f.attr & 0x01000000 == 0x01000000: + # Binary Files + f.contentOffset = len(binStream) + binStream += f.fileBody + else: + print f.attr, f.fileName + raise Exception("Unknown file type") + vfatCompressed = zlib.compress(vfat+fileNameTable) + + # File header part 2 + vmbrp2 = struct.pack('>iiiii', len(vfat+fileNameTable), len(vfatCompressed), len(binStream), len(plainStream), SNBFile.REVZ2) + # Write header + outputFile.write(vmbrp1 + vmbrp2) + # Write vfat + outputFile.write(vfatCompressed) + + # Generate block information + binBlockOffset = 0x2C + len(vfatCompressed) + plainBlockOffset = binBlockOffset + len(binStream) + + binBlock = (len(binStream) + 0x8000 - 1) / 0x8000 + plainBlock = (len(plainStream) + 0x8000 - 1) / 0x8000 + + offset = 0 + tailBlock = '' + for i in range(binBlock): + tailBlock += struct.pack('>i', binBlockOffset + offset) + offset += 0x8000; + tailRec = '' + for f in self.files: + t = 0 + if f.IsBinary(): + t = 0 + else: + t = binBlock + tailRec += struct.pack('>ii', f.contentOffset / 0x8000 + t, f.contentOffset % 0x8000); + + # Write binary stream + outputFile.write(binStream) + + # Write plain stream + pos = 0 + offset = 0 + while pos < len(plainStream): + tailBlock += struct.pack('>i', plainBlockOffset + offset); + block = plainStream[pos:pos+0x8000]; + compressed = bz2.compress(block) + outputFile.write(compressed) + offset += len(compressed) + pos += 0x8000 + + # Write tail block + compressedTail = zlib.compress(tailBlock + tailRec) + outputFile.write(compressedTail) + + # Write tail pointer + veom = struct.pack('>ii', len(compressedTail), plainBlockOffset + offset) + outputFile.write(veom) + + # Write file end mark + outputFile.write(SNBFile.MAGIC); + + # Close + outputFile.close() + return + + def Dump(self): + print "File Name:\t", self.fileName + print "File Count:\t", self.fileCount + print "VFAT Size(Compressed):\t%d(%d)" % (self.vfatSize, self.vfatCompressed) + print "Binary Stream Size:\t", self.binStreamSize + print "Plain Stream Uncompressed Size:\t", self.plainStreamSizeUncompressed + print "Binary Block Count:\t", self.binBlock + print "Plain Block Count:\t", self.plainBlock + for i in range(self.fileCount): + print "File ", i + f = self.files[i] + print "File Name: ", f.fileName + print "File Attr: ", f.attr + print "File Size: ", f.fileSize + print "Block Index: ", f.blockIndex + print "Content Offset: ", f.contentOffset + tempFile = open("/tmp/" + f.fileName, 'wb') + tempFile.write(f.fileBody) + tempFile.close() + +def usage(): + print "This unit test is for INTERNAL usage only!" + print "This unit test accept two parameters." + print "python snbfile.py " + print "The input file will be extracted and write to dest file. " + print "Meta data of the file will be shown during this process." + +def main(): + if len(sys.argv) != 3: + usage() + sys.exit(0) + inputFile = sys.argv[1] + outputFile = sys.argv[2] + + print "Input file: ", inputFile + print "Output file: ", outputFile + + snbFile = SNBFile(inputFile) + if snbFile.IsValid(): + snbFile.Dump() + snbFile.Output(outputFile) + else: + print "The input file is invalid." + return 1 + return 0 + +if __name__ == "__main__": + """SNB file unit test""" + sys.exit(main()) diff --git a/src/calibre/ebooks/snb/snbml.py b/src/calibre/ebooks/snb/snbml.py new file mode 100644 index 0000000000..e1956b5937 --- /dev/null +++ b/src/calibre/ebooks/snb/snbml.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2010, Li Fanxi ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into SNB format +''' + +import os +import re + +from lxml import etree + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +def ProcessFileName(fileName): + # Flat the path + fileName = fileName.replace("/", "_").replace(os.sep, "_") + # Handle bookmark for HTML file + fileName = fileName.replace("#", "_") + # Make it lower case + fileName = fileName.lower() + # Change extension from jpeg to jpg + root, ext = os.path.splitext(fileName) + if ext in [ '.jpeg', '.jpg', '.gif', '.svg' ]: + fileName = root + '.png' + return fileName + + +BLOCK_TAGS = [ + 'div', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', + 'tr', +] + +BLOCK_STYLES = [ + 'block', +] + +SPACE_TAGS = [ + 'td', +] + +CLIABRE_SNB_IMG_TAG = "" + +class SNBMLizer(object): + + curSubItem = "" + curText = [ ] + + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, item, subitems, opts): + self.log.info('Converting XHTML to SNBC...') + self.oeb_book = oeb_book + self.opts = opts + self.item = item + self.subitems = subitems + return self.mlize(); + + + def mlize(self): + stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile) + content = unicode(etree.tostring(self.item.data.find(XHTML('body')), encoding=unicode)) + content = self.remove_newlines(content) + trees = { } + for subitem, subtitle in self.subitems: + snbcTree = etree.Element("snbc") + etree.SubElement(etree.SubElement(snbcTree, "head"), "title").text = subtitle + etree.SubElement(snbcTree, "body") + trees[subitem] = snbcTree + + self.dump_text(trees, self.subitems, etree.fromstring(content), stylizer) + self.Output(trees) + return trees + + def remove_newlines(self, text): + self.log.debug('\tRemove newlines for processing...') + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + + return text + + def dump_text(self, trees, subitems, elem, stylizer, end=''): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + @end: The last two characters of the text from the previous element. + This is used to determine if a blank line is needed when starting + a new block element. + ''' + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return [''] + + if elem.attrib.get('id') != None and elem.attrib['id'] in [ href for href, title in subitems ]: + if self.curSubItem != None and self.curSubItem != elem.attrib['id']: + self.Output(trees) + self.curSubItem = elem.attrib['id'] + self.curText = [ ] + + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + tag = barename(elem.tag) + in_block = False + + # Are we in a paragraph block? + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: + in_block = True + if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text: + self.curText.append(u'\n\n') + + if tag in SPACE_TAGS: + if not end.endswith('u ') and hasattr(elem, 'text') and elem.text: + self.curText.append(u' ') + + if tag == 'img': + self.curText.append(u'%s%s' % (CLIABRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src']))) + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + self.curText.append(elem.text) + + for item in elem: + en = u'' + if len(self.curText) >= 2: + en = self.curText[-1][-2:] + self.dump_text(trees, subitems, item, stylizer, en) + + if in_block: + self.curText.append(u'\n\n') + + if hasattr(elem, 'tail') and elem.tail: + self.curText.append(elem.tail) + + def Output(self, trees): + if self.curSubItem == None or not self.curSubItem in trees: + return + for t in self.curText: + if len(t.strip(' \t\n\r')) != 0: + if t.find(CLIABRE_SNB_IMG_TAG) == 0: + etree.SubElement(trees[self.curSubItem], "img").text = t[len(CLIABRE_SNB_IMG_TAG):] + else: + etree.SubElement(trees[self.curSubItem], "text").text = etree.CDATA(unicode('' + t))