mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
[SNBOutput] Add basic output support for SNB file.
This commit is contained in: parent e602d726ab, commit dd69e24747
@@ -4,10 +4,29 @@ __license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'

import os
import os, string

from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.snb.snbfile import SNBFile
from calibre.ebooks.snb.snbml import SNBMLizer

def ProcessFileName(fileName):
    # Flatten the path
    fileName = fileName.replace("/", "_").replace(os.sep, "_")
    # Handle bookmarks in HTML file names
    fileName = fileName.replace("#", "_")
    # Make it lower case
    fileName = fileName.lower()
    # Images are converted to PNG, so change the extension accordingly
    root, ext = os.path.splitext(fileName)
    if ext in [ '.jpeg', '.jpg', '.gif', '.svg' ]:
        fileName = root + '.png'
    return fileName

from calibre.customize.conversion import OutputFormatPlugin, \
    OptionRecommendation

class SNBOutput(OutputFormatPlugin):
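For illustration (editorial note, not part of the commit), two examples of the flattening that ProcessFileName performs, assuming a POSIX os.sep:

    # ProcessFileName('OEBPS/Images/Cover.JPEG')  -> 'oebps_images_cover.png'
    #   (path separators flattened, lowercased, image extension mapped to .png)
    # ProcessFileName('chapter_01.html#section2') -> 'chapter_01.html_section2'
    #   (the '#' fragment separator becomes part of the flattened name)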
@@ -45,26 +64,152 @@ class SNBOutput(OutputFormatPlugin):
        ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process metadata
            meta = oeb_book.metadata
            if meta.title:
                title = unicode(meta.title[0])
            else:
                title = ''
            authors = [unicode(x) for x in meta.creator if x.role == 'aut']
            if meta.publisher:
                publishers = unicode(meta.publisher[0])
            else:
                publishers = ''
            if meta.language:
                lang = unicode(meta.language[0]).upper()
            else:
                lang = ''
            if meta.description:
                abstract = unicode(meta.description[0])
            else:
                abstract = ''

            # Process cover
            from calibre.ebooks.oeb.base import urldefrag
            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
            href = None
            if 'titlepage' not in g:
                if 'cover' in g:
                    href = g['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            if href != None:
                etree.SubElement(headTree, "cover").text = ProcessFileName(href)
            else:
                etree.SubElement(headTree, "cover")
            bookInfoFile = open(os.path.join(snbfDir, 'book.snbf'), 'wb')
            bookInfoFile.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
            bookInfoFile.close()

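            # A hedged sketch of the book.snbf just written (editorial note,
            # not in the original commit; element values are illustrative):
            #
            #   <book-snbf version="1.0">
            #     <head>
            #       <name>Book Title</name>
            #       <author>First Author Second Author</author>
            #       <language>EN</language>
            #       <rights/>
            #       <publisher>Publisher Name</publisher>
            #       <generator>calibre (version)</generator>
            #       <created/>
            #       <abstract>Book description</abstract>
            #       <cover>flattened_cover_name.png</cover>
            #     </head>
            #   </book-snbf>
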
            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            outputFiles = { }
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                         'Creating a default TOC')
                first = iter(oeb_book.spine).next()
                oeb_book.toc.add(_('Start'), first.href)

            for tocitem in oeb_book.toc:
                ch = etree.SubElement(tocBody, "chapter")
                ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                ch.text = tocitem.title
                if tocitem.href.find('#') != -1:
                    item = string.split(tocitem.href, '#')
                    if len(item) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        if item[0] in outputFiles:
                            outputFiles[item[0]].append((item[1], tocitem.title))
                        else:
                            outputFiles[item[0]] = []
                            outputFiles[item[0]].append((item[1], tocitem.title))
                else:
                    if tocitem.href in outputFiles:
                        outputFiles[tocitem.href].append(("", tocitem))
                    else:
                        outputFiles[tocitem.href] = []
                        outputFiles[tocitem.href].append(("", tocitem))

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)

            tocInfoFile = open(os.path.join(snbfDir, 'toc.snbf'), 'wb')
            tocInfoFile.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
            tocInfoFile.close()

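            # A hedged sketch of the toc.snbf just written (editorial note,
            # not in the original commit; one <chapter> per TOC entry):
            #
            #   <toc-snbf>
            #     <head><chapters>2</chapters></head>
            #     <body>
            #       <chapter src="chapter_01.html.snbc">Chapter One</chapter>
            #       <chapter src="chapter_02.html.snbc">Chapter Two</chapter>
            #     </body>
            #   </toc-snbf>
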
            # Output Files
            for item in s:
                from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES, PNG_MIME
                if m.hrefs[item.href].media_type in OEB_DOCS:
                    if not item.href in outputFiles:
                        log.debug('Skipping %s because it is not referenced in the TOC.' % item.href)
                        continue
                    log.debug('Converting %s to snbc...' % item.href)
                    snbwriter = SNBMLizer(log)
                    snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
                    for subName in snbcTrees:
                        postfix = ''
                        if subName != '':
                            postfix = '_' + subName
                        outputFile = open(os.path.join(snbcDir, ProcessFileName(item.href + postfix + ".snbc")), 'wb')
                        outputFile.write(etree.tostring(snbcTrees[subName], pretty_print=True, encoding='utf-8'))
                        outputFile.close()
            for item in m:
                if m.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = m.hrefs[item.href].data
                    if m.hrefs[item.href].media_type != PNG_MIME:
                        # Convert to PNG
                        from calibre.utils.magick import Image
                        img = Image()
                        img.load(content)
                        img.save(os.path.join(snbiDir, ProcessFileName(item.href)))
                    else:
                        outputFile = open(os.path.join(snbiDir, ProcessFileName(item.href)), 'wb')
                        outputFile.write(content)
                        outputFile.close()

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

if __name__ == '__main__':
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.customize.profiles import HanlinV3Output
    class OptionValues(object):
        pass
    # writer = TXTMLizer(log)
    # txt = writer.extract_content(oeb_book, opts)

    # log.debug('\tReplacing newlines with selected type...')
    # txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

    # close = False
    # if not hasattr(output_path, 'write'):
    #     close = True
    #     if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
    #         os.makedirs(os.path.dirname(output_path))
    #     out_stream = open(output_path, 'wb')
    # else:
    #     out_stream = output_path

    # out_stream.seek(0)
    # out_stream.truncate()
    # out_stream.write(txt.encode(opts.output_encoding, 'replace'))

    # if close:
    #     out_stream.close()
    opts = OptionValues()
    opts.output_profile = HanlinV3Output(None)

    html_preprocessor = HTMLPreProcessor(None, None, opts)
    from calibre.utils.logging import default_log
    oeb = OEBBook(default_log, html_preprocessor)
    reader = OEBReader
    reader()(oeb, '/tmp/bbb/processed/')
    SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log);
300  src/calibre/ebooks/snb/snbfile.py  Normal file
@@ -0,0 +1,300 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'

import sys, struct, zlib, bz2, os, math

class FileStream:
    def IsBinary(self):
        return self.attr & 0x41000000 != 0x41000000

def compareFileStream(file1, file2):
    return cmp(file1.fileName, file2.fileName)

class BlockData:
    pass

class SNBFile:

    files = []
    blocks = []

    MAGIC = 'SNBP000B'
    REV80 = 0x00008000
    REVA3 = 0x00A3A3A3
    REVZ1 = 0x00000000
    REVZ2 = 0x00000000

    def __init__(self, inputFile = None):
        if inputFile != None:
            self.Parse(inputFile);

    def Parse(self, inputFile):
        self.fileName = inputFile

        snbFile = open(self.fileName, "rb")
        snbFile.seek(0)

        # Read header
        vmbr = snbFile.read(44)
        (self.magic, self.rev80, self.revA3, self.revZ1,
         self.fileCount, self.vfatSize, self.vfatCompressed,
         self.binStreamSize, self.plainStreamSizeUncompressed,
         self.revZ2) = struct.unpack('>8siiiiiiiii', vmbr)

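        # For reference (editorial note, not in the original commit, derived
        # from the format string above): the 44-byte header is the 8-byte magic
        # 'SNBP000B' followed by nine big-endian 32-bit integers -- rev80,
        # revA3, revZ1, fileCount, vfatSize, vfatCompressed, binStreamSize,
        # plainStreamSizeUncompressed and revZ2 (8 + 9 * 4 = 44 bytes).
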
        # Read FAT
        self.vfat = zlib.decompress(snbFile.read(self.vfatCompressed))
        self.ParseFile(self.vfat, self.fileCount)

        # Read tail
        snbFile.seek(-16, os.SEEK_END)
        #plainStreamEnd = snbFile.tell()
        tailblock = snbFile.read(16)
        (self.tailSize, self.tailOffset, self.tailMagic) = struct.unpack('>ii8s', tailblock)
        snbFile.seek(self.tailOffset)
        self.vTailUncompressed = zlib.decompress(snbFile.read(self.tailSize))
        self.tailSizeUncompressed = len(self.vTailUncompressed)
        self.ParseTail(self.vTailUncompressed, self.fileCount)

        # Uncompress file data
        # Read files
        binPos = 0
        plainPos = 0
        uncompressedData = None
        for f in self.files:
            if f.attr & 0x41000000 == 0x41000000:
                # Compressed Files
                if uncompressedData == None:
                    uncompressedData = ""
                    for i in range(self.plainBlock):
                        bzdc = bz2.BZ2Decompressor()
                        if (i < self.plainBlock - 1):
                            bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset;
                        else:
                            bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset;
                        snbFile.seek(self.blocks[self.binBlock + i].Offset);
                        try:
                            data = snbFile.read(bSize)
                            uncompressedData += bzdc.decompress(data)
                        except EOFError, e:
                            print e
                f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize]
                plainPos += f.fileSize
            elif f.attr & 0x01000000 == 0x01000000:
                # Binary Files
                snbFile.seek(44 + self.vfatCompressed + binPos)
                f.fileBody = snbFile.read(f.fileSize)
                binPos += f.fileSize
            else:
                print f.attr, f.fileName
                raise Exception("Invalid file")
        snbFile.close()

    def ParseFile(self, vfat, fileCount):
        fileNames = vfat[fileCount*12:].split('\0');
        for i in range(fileCount):
            f = FileStream()
            (f.attr, f.fileNameOffset, f.fileSize) = struct.unpack('>iii', vfat[i * 12 : (i+1)*12])
            f.fileName = fileNames[i]
            self.files.append(f)

    def ParseTail(self, vtail, fileCount):
        self.binBlock = (self.binStreamSize + 0x8000 - 1) / 0x8000;
        self.plainBlock = (self.plainStreamSizeUncompressed + 0x8000 - 1) / 0x8000;
        for i in range(self.binBlock + self.plainBlock):
            block = BlockData()
            (block.Offset,) = struct.unpack('>i', vtail[i * 4 : (i+1) * 4])
            self.blocks.append(block)
        for i in range(fileCount):
            (self.files[i].blockIndex, self.files[i].contentOffset) = struct.unpack('>ii', vtail[(self.binBlock + self.plainBlock) * 4 + i * 8 : (self.binBlock + self.plainBlock) * 4 + (i+1) * 8])

    def IsValid(self):
        if self.magic != SNBFile.MAGIC:
            return False
        if self.rev80 != SNBFile.REV80:
            return False
        if self.revA3 != SNBFile.REVA3:
            return False
        if self.revZ1 != SNBFile.REVZ1:
            return False
        if self.revZ2 != SNBFile.REVZ2:
            return False
        if self.vfatSize != len(self.vfat):
            return False
        if self.fileCount != len(self.files):
            return False
        if (self.binBlock + self.plainBlock) * 4 + self.fileCount * 8 != self.tailSizeUncompressed:
            return False
        if self.tailMagic != SNBFile.MAGIC:
            print self.tailMagic
            return False
        return True

    def FromDir(self, tdir):
        for root, dirs, files in os.walk(tdir):
            for name in files:
                print name
                p, ext = os.path.splitext(name)
                if ext in [ ".snbf", ".snbc" ]:
                    self.AppendPlain(os.path.relpath(os.path.join(root, name), tdir), tdir)
                else:
                    self.AppendBinary(os.path.relpath(os.path.join(root, name), tdir), tdir)

    def AppendPlain(self, fileName, tdir):
        f = FileStream()
        f.attr = 0x41000000
        f.fileSize = os.path.getsize(os.path.join(tdir,fileName))
        f.fileBody = open(os.path.join(tdir,fileName), 'rb').read()
        f.fileName = fileName
        print f.fileSize
        self.files.append(f)

    def AppendBinary(self, fileName, tdir):
        f = FileStream()
        f.attr = 0x01000000
        f.fileSize = os.path.getsize(os.path.join(tdir,fileName))
        f.fileBody = open(os.path.join(tdir,fileName), 'rb').read()
        f.fileName = fileName
        print f.fileSize
        self.files.append(f)

    def Output(self, outputFile):

        # Sort the files in the file buffer,
        # as required by the SNB file format
        self.files.sort(compareFileStream)

        outputFile = open(outputFile, 'wb')
        # File header part 1
        vmbrp1 = struct.pack('>8siiii', SNBFile.MAGIC, SNBFile.REV80, SNBFile.REVA3, SNBFile.REVZ1, len(self.files))

        # Create VFAT & file stream
        vfat = ''
        fileNameTable = ''
        plainStream = ''
        binStream = ''
        for f in self.files:
            vfat += struct.pack('>iii', f.attr, len(fileNameTable), f.fileSize);
            fileNameTable += (f.fileName + '\0')

            if f.attr & 0x41000000 == 0x41000000:
                # Plain Files
                f.contentOffset = len(plainStream)
                plainStream += f.fileBody
            elif f.attr & 0x01000000 == 0x01000000:
                # Binary Files
                f.contentOffset = len(binStream)
                binStream += f.fileBody
            else:
                print f.attr, f.fileName
                raise Exception("Unknown file type")
        vfatCompressed = zlib.compress(vfat+fileNameTable)

        # File header part 2
        vmbrp2 = struct.pack('>iiiii', len(vfat+fileNameTable), len(vfatCompressed), len(binStream), len(plainStream), SNBFile.REVZ2)
        # Write header
        outputFile.write(vmbrp1 + vmbrp2)
        # Write vfat
        outputFile.write(vfatCompressed)

        # Generate block information
        binBlockOffset = 0x2C + len(vfatCompressed)
        plainBlockOffset = binBlockOffset + len(binStream)

        binBlock = (len(binStream) + 0x8000 - 1) / 0x8000
        plainBlock = (len(plainStream) + 0x8000 - 1) / 0x8000

        offset = 0
        tailBlock = ''
        for i in range(binBlock):
            tailBlock += struct.pack('>i', binBlockOffset + offset)
            offset += 0x8000;
        tailRec = ''
        for f in self.files:
            t = 0
            if f.IsBinary():
                t = 0
            else:
                t = binBlock
            tailRec += struct.pack('>ii', f.contentOffset / 0x8000 + t, f.contentOffset % 0x8000);

        # Write binary stream
        outputFile.write(binStream)

        # Write plain stream
        pos = 0
        offset = 0
        while pos < len(plainStream):
            tailBlock += struct.pack('>i', plainBlockOffset + offset);
            block = plainStream[pos:pos+0x8000];
            compressed = bz2.compress(block)
            outputFile.write(compressed)
            offset += len(compressed)
            pos += 0x8000

        # Write tail block
        compressedTail = zlib.compress(tailBlock + tailRec)
        outputFile.write(compressedTail)

        # Write tail pointer
        veom = struct.pack('>ii', len(compressedTail), plainBlockOffset + offset)
        outputFile.write(veom)

        # Write file end mark
        outputFile.write(SNBFile.MAGIC);

        # Close
        outputFile.close()
        return

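    # Overall container layout, as implemented by Parse() and Output() above
    # (editorial summary, not in the original commit):
    #
    #   offset 0x00   44-byte header: 8-byte magic plus nine big-endian int32 fields
    #   offset 0x2C   zlib-compressed VFAT: a 12-byte record per file
    #                 (attr, name offset, size), then NUL-separated file names
    #   ...           binary stream, stored uncompressed
    #   ...           plain-text stream, bz2-compressed in 0x8000-byte (32 KB) blocks
    #   ...           zlib-compressed tail: an int32 offset per block, then an
    #                 (int32 blockIndex, int32 contentOffset) pair per file
    #   end - 16      int32 tail size, int32 tail offset, 8-byte magic
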
    def Dump(self):
        print "File Name:\t", self.fileName
        print "File Count:\t", self.fileCount
        print "VFAT Size(Compressed):\t%d(%d)" % (self.vfatSize, self.vfatCompressed)
        print "Binary Stream Size:\t", self.binStreamSize
        print "Plain Stream Uncompressed Size:\t", self.plainStreamSizeUncompressed
        print "Binary Block Count:\t", self.binBlock
        print "Plain Block Count:\t", self.plainBlock
        for i in range(self.fileCount):
            print "File ", i
            f = self.files[i]
            print "File Name: ", f.fileName
            print "File Attr: ", f.attr
            print "File Size: ", f.fileSize
            print "Block Index: ", f.blockIndex
            print "Content Offset: ", f.contentOffset
            tempFile = open("/tmp/" + f.fileName, 'wb')
            tempFile.write(f.fileBody)
            tempFile.close()

def usage():
    print "This unit test is for INTERNAL usage only!"
    print "This unit test accepts two parameters."
    print "python snbfile.py <INPUTFILE> <DESTFILE>"
    print "The input file will be extracted and written to the dest file."
    print "Meta data of the file will be shown during this process."

def main():
    if len(sys.argv) != 3:
        usage()
        sys.exit(0)
    inputFile = sys.argv[1]
    outputFile = sys.argv[2]

    print "Input file: ", inputFile
    print "Output file: ", outputFile

    snbFile = SNBFile(inputFile)
    if snbFile.IsValid():
        snbFile.Dump()
        snbFile.Output(outputFile)
    else:
        print "The input file is invalid."
        return 1
    return 0

if __name__ == "__main__":
    """SNB file unit test"""
    sys.exit(main())
160  src/calibre/ebooks/snb/snbml.py  Normal file
@@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'

'''
Transform OEB content into SNB format
'''

import os
import re

from lxml import etree

from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer

def ProcessFileName(fileName):
    # Flatten the path
    fileName = fileName.replace("/", "_").replace(os.sep, "_")
    # Handle bookmarks in HTML file names
    fileName = fileName.replace("#", "_")
    # Make it lower case
    fileName = fileName.lower()
    # Images are converted to PNG, so change the extension accordingly
    root, ext = os.path.splitext(fileName)
    if ext in [ '.jpeg', '.jpg', '.gif', '.svg' ]:
        fileName = root + '.png'
    return fileName


BLOCK_TAGS = [
    'div',
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'li',
    'tr',
]

BLOCK_STYLES = [
    'block',
]

SPACE_TAGS = [
    'td',
]

CLIABRE_SNB_IMG_TAG = "<calibre_snb_temp_img>"

class SNBMLizer(object):

    curSubItem = ""
    curText = [ ]

    def __init__(self, log):
        self.log = log

    def extract_content(self, oeb_book, item, subitems, opts):
        self.log.info('Converting XHTML to SNBC...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.item = item
        self.subitems = subitems
        return self.mlize();

    def mlize(self):
        stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
        content = unicode(etree.tostring(self.item.data.find(XHTML('body')), encoding=unicode))
        content = self.remove_newlines(content)
        trees = { }
        for subitem, subtitle in self.subitems:
            snbcTree = etree.Element("snbc")
            etree.SubElement(etree.SubElement(snbcTree, "head"), "title").text = subtitle
            etree.SubElement(snbcTree, "body")
            trees[subitem] = snbcTree

        self.dump_text(trees, self.subitems, etree.fromstring(content), stylizer)
        self.Output(trees)
        return trees

    def remove_newlines(self, text):
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        return text

    def dump_text(self, trees, subitems, elem, stylizer, end=''):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @end: The last two characters of the text from the previous element.
              This is used to determine if a blank line is needed when starting
              a new block element.
        '''
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return ['']

        if elem.attrib.get('id') != None and elem.attrib['id'] in [ href for href, title in subitems ]:
            if self.curSubItem != None and self.curSubItem != elem.attrib['id']:
                self.Output(trees)
                self.curSubItem = elem.attrib['id']
                self.curText = [ ]

        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return ['']

        tag = barename(elem.tag)
        in_block = False

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
            if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
                self.curText.append(u'\n\n')

        if tag in SPACE_TAGS:
            if not end.endswith('u ') and hasattr(elem, 'text') and elem.text:
                self.curText.append(u' ')

        if tag == 'img':
            self.curText.append(u'%s%s' % (CLIABRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            self.curText.append(elem.text)

        for item in elem:
            en = u''
            if len(self.curText) >= 2:
                en = self.curText[-1][-2:]
            self.dump_text(trees, subitems, item, stylizer, en)

        if in_block:
            self.curText.append(u'\n\n')

        if hasattr(elem, 'tail') and elem.tail:
            self.curText.append(elem.tail)

    def Output(self, trees):
        if self.curSubItem == None or not self.curSubItem in trees:
            return
        for t in self.curText:
            if len(t.strip(' \t\n\r')) != 0:
                if t.find(CLIABRE_SNB_IMG_TAG) == 0:
                    etree.SubElement(trees[self.curSubItem], "img").text = t[len(CLIABRE_SNB_IMG_TAG):]
                else:
                    etree.SubElement(trees[self.curSubItem], "text").text = etree.CDATA(unicode('' + t))
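# For reference, a hedged sketch of the SNBC tree the mlizer produces
# (editorial note, not in the original commit). As written, Output() appends
# the <text>/<img> nodes directly under the <snbc> root, after the empty
# <body/> created in mlize():
#
#   <snbc>
#     <head><title>Chapter title</title></head>
#     <body/>
#     <text><![CDATA[Paragraph text ...]]></text>
#     <img>flattened_image_name.png</img>
#   </snbc>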