mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Further improvements to Mobi markup conversion.
This commit is contained in:
parent
35320295f8
commit
b637be07be
@ -21,6 +21,7 @@ def MBP(name): return '{%s}%s' % (MBP_NS, name)
|
||||
|
||||
HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
|
||||
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
|
||||
SPECIAL_TAGS = set(['hr', 'br'])
|
||||
CONTENT_TAGS = set(['img', 'hr', 'br'])
|
||||
|
||||
PAGE_BREAKS = set(['always', 'odd', 'even'])
|
||||
@ -59,6 +60,9 @@ class FormatState(object):
|
||||
and self.href == other.href \
|
||||
and self.valign == other.valign
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self.__eq__(other)
|
||||
|
||||
|
||||
class MobiMLizer(object):
|
||||
def __init__(self):
|
||||
@ -86,14 +90,26 @@ class MobiMLizer(object):
|
||||
return self.fnums[self.fmap[ptsize]]
|
||||
|
||||
def mobimlize_measure(self, ptsize):
|
||||
# All MobiML measures occur in the default font-space
|
||||
if isinstance(ptsize, basestring):
|
||||
return ptsize
|
||||
return "%dem" % int(round(ptsize / self.profile.fbase))
|
||||
# All MobiML measures occur in the default font-space
|
||||
fbase = self.profile.fbase
|
||||
if ptsize < fbase:
|
||||
return "%dpt" % int(round(ptsize * 2))
|
||||
return "%dem" % int(round(ptsize / fbase))
|
||||
|
||||
def mobimlize_content(self, tag, text, bstate, istates):
|
||||
istate = istates[-1]
|
||||
if bstate.para is None:
|
||||
if istate.ids:
|
||||
body = bstate.body
|
||||
index = max((0, len(body) - 2))
|
||||
for id in istate.ids:
|
||||
body.insert(index, etree.Element('a', attrib={'id': id}))
|
||||
istate.ids.clear()
|
||||
para = bstate.para
|
||||
if tag in SPECIAL_TAGS and not text:
|
||||
para = para if para is not None else bstate.body
|
||||
elif para is None:
|
||||
bstate.istate = None
|
||||
if bstate.pbreak:
|
||||
etree.SubElement(bstate.body, MBP('pagebreak'))
|
||||
@ -102,7 +118,7 @@ class MobiMLizer(object):
|
||||
parent = bstate.nested[-1] if bstate.nested else bstate.body
|
||||
para = wrapper = etree.SubElement(parent, tag)
|
||||
bstate.nested.append(para)
|
||||
elif bstate.left > 0:
|
||||
elif bstate.left > 0 and istate.indent >= 0:
|
||||
para = wrapper = etree.SubElement(bstate.body, 'blockquote')
|
||||
left = int(round(bstate.left / self.profile.fbase)) - 1
|
||||
while left > 0:
|
||||
@ -118,16 +134,7 @@ class MobiMLizer(object):
|
||||
para.attrib['width'] = self.mobimlize_measure(istate.indent)
|
||||
if istate.halign != 'auto':
|
||||
wrapper.attrib['align'] = istate.halign
|
||||
if istate.ids:
|
||||
wrapper.attrib['id'] = istate.ids.pop()
|
||||
pstate = bstate.istate
|
||||
para = bstate.para
|
||||
if istate.ids:
|
||||
body = bstate.body
|
||||
index = max((0, len(body) - 2))
|
||||
for id in istate.ids:
|
||||
body.insert(index, etree.Element('a', attrib={'id': id}))
|
||||
istate.ids.clear()
|
||||
if tag in CONTENT_TAGS:
|
||||
bstate.inline = para
|
||||
pstate = bstate.istate = None
|
||||
@ -143,7 +150,7 @@ class MobiMLizer(object):
|
||||
inline = etree.SubElement(inline, 'sup')
|
||||
elif valign == 'sub':
|
||||
inline = etree.SubElement(inline, 'sub')
|
||||
elif fsize != 3:
|
||||
if fsize != 3:
|
||||
inline = etree.SubElement(inline, 'font', size=str(fsize))
|
||||
if istate.italic:
|
||||
inline = etree.SubElement(inline, 'i')
|
||||
|
@ -313,7 +313,10 @@ class MobiReader(object):
|
||||
while flags:
|
||||
if flags & 1:
|
||||
num += sizeof_trailing_entry(data, size - num)
|
||||
flags >>= 1
|
||||
flags >>= 1
|
||||
# Flag indicates overlapping multibyte character data
|
||||
if self.book_header.extra_flags & 1:
|
||||
num += ord(data[size - num - 1]) + 1
|
||||
return num
|
||||
|
||||
def text_section(self, index):
|
||||
|
@ -50,9 +50,7 @@ PALMDOC = 2
|
||||
HUFFDIC = 17480
|
||||
|
||||
def encode(data):
|
||||
# Using UTF-8 means needing to worry about multibyte characters crossing
|
||||
# record boundaries, so let's not for now.
|
||||
return data.encode('ascii', 'xmlcharrefreplace')
|
||||
return data.encode('utf-8')
|
||||
|
||||
# Almost like the one for MS LIT, but not quite.
|
||||
def decint(value):
|
||||
@ -193,8 +191,8 @@ class Serializer(object):
|
||||
|
||||
|
||||
class MobiWriter(object):
|
||||
def __init__(self, compress=None, logger=FauxLogger()):
|
||||
self._compress = compress or UNCOMPRESSED
|
||||
def __init__(self, compression=None, logger=FauxLogger()):
|
||||
self._compression = compression or UNCOMPRESSED
|
||||
self._logger = logger
|
||||
|
||||
def dump(self, oeb, path):
|
||||
@ -231,7 +229,39 @@ class MobiWriter(object):
|
||||
if item.media_type.startswith('image/'):
|
||||
images[item.href] = index
|
||||
index += 1
|
||||
|
||||
|
||||
def _read_text_record(self, text):
|
||||
pos = text.tell()
|
||||
text.seek(0, 2)
|
||||
npos = min((pos + RECORD_SIZE, text.tell()))
|
||||
last = ''
|
||||
while not last.decode('utf-8', 'ignore'):
|
||||
size = len(last) + 1
|
||||
text.seek(npos - size)
|
||||
last = text.read(size)
|
||||
try:
|
||||
last.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
text.seek(pos)
|
||||
return text.read(RECORD_SIZE)
|
||||
prev = len(last)
|
||||
while True:
|
||||
text.seek(npos - prev)
|
||||
last = text.read(len(last) + 1)
|
||||
try:
|
||||
last.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
break
|
||||
extra = len(last) - prev
|
||||
text.seek(pos)
|
||||
data = text.read(RECORD_SIZE + extra)
|
||||
text.seek(npos)
|
||||
return data
|
||||
|
||||
def _generate_text(self):
|
||||
serializer = Serializer(self._oeb, self._images)
|
||||
breaks = serializer.breaks
|
||||
@ -240,14 +270,14 @@ class MobiWriter(object):
|
||||
text = StringIO(text)
|
||||
nrecords = 0
|
||||
offset = 0
|
||||
data = text.read(RECORD_SIZE)
|
||||
data = self._read_text_record(text)
|
||||
while len(data) > 0:
|
||||
if self._compress == PALMDOC:
|
||||
size = len(data)
|
||||
if self._compression == PALMDOC:
|
||||
data = compress_doc(data)
|
||||
record = StringIO()
|
||||
record.write(data)
|
||||
# Without the NUL Mobipocket Desktop 6.2 will thrash. Why?
|
||||
record.write('\0')
|
||||
record.write(pack('>B', max((0, size - RECORD_SIZE))))
|
||||
nextra = 0
|
||||
pbreak = 0
|
||||
running = offset
|
||||
@ -261,7 +291,7 @@ class MobiWriter(object):
|
||||
self._records.append(record.getvalue())
|
||||
nrecords += 1
|
||||
offset += RECORD_SIZE
|
||||
data = text.read(RECORD_SIZE)
|
||||
data = self._read_text_record(text)
|
||||
self._text_nrecords = nrecords
|
||||
|
||||
def _rescale_image(self, data, maxsizeb, dimen=None):
|
||||
@ -304,8 +334,8 @@ class MobiWriter(object):
|
||||
metadata = self._oeb.metadata
|
||||
exth = self._build_exth()
|
||||
record0 = StringIO()
|
||||
record0.write(pack('>HHIHHHH', self._compress, 0, self._text_length,
|
||||
self._text_nrecords, RECORD_SIZE, 0, 0))
|
||||
record0.write(pack('>HHIHHHH', self._compression, 0,
|
||||
self._text_length, self._text_nrecords, RECORD_SIZE, 0, 0))
|
||||
uid = random.randint(0, 0xffffffff)
|
||||
title = str(metadata.title[0])
|
||||
record0.write('MOBI')
|
||||
@ -320,7 +350,11 @@ class MobiWriter(object):
|
||||
record0.write(pack('>I', 0x50))
|
||||
record0.write('\0' * 32)
|
||||
record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0))
|
||||
# TODO: What the hell are these fields?
|
||||
# The '5' is a bitmask of extra record data at the end:
|
||||
# - 0x1: <extra multibyte bytes><size> (?)
|
||||
# - 0x4: <uncrossable breaks><size>
|
||||
# Of course, the formats aren't quite the same.
|
||||
# TODO: What the hell are the rest of these fields?
|
||||
record0.write(pack('>IIIIIIIIIIIIIIIII',
|
||||
0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
|
||||
0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff))
|
||||
@ -391,9 +425,10 @@ class MobiWriter(object):
|
||||
def main(argv=sys.argv):
|
||||
from calibre.ebooks.oeb.base import DirWriter
|
||||
inpath, outpath = argv[1:]
|
||||
context = Context('MSReader', 'Cybook3')
|
||||
context = Context('MSReader', 'MobiDesktop')
|
||||
oeb = OEBBook(inpath)
|
||||
writer = MobiWriter(compress=PALMDOC)
|
||||
#writer = MobiWriter(compression=PALMDOC)
|
||||
writer = MobiWriter(compression=UNCOMPRESSED)
|
||||
#writer = DirWriter()
|
||||
fbase = context.dest.fbase
|
||||
fkey = context.dest.fnums.values()
|
||||
|
@ -39,6 +39,11 @@ PROFILES = {
|
||||
Profile(width=480, height=652, dpi=100.0, fbase=13,
|
||||
fsizes=[10, 11, 13, 16, 18, 20, 22, 26]),
|
||||
|
||||
# Not really, but let's pretend
|
||||
'MobiDesktop':
|
||||
Profile(width=340, height=400, dpi=100, fbase=12,
|
||||
fsizes=[9, 10, 11, 12, 14, 17, 20, 24]),
|
||||
|
||||
# No clue on usable screen size and DPI
|
||||
'Cybook3':
|
||||
Profile(width=584, height=754, dpi=168.451, fbase=12,
|
||||
|
@ -170,6 +170,9 @@ class CSSFlattener(object):
|
||||
left -= style['text-indent']
|
||||
if self.unfloat and 'float' in cssdict and tag != 'img':
|
||||
del cssdict['float']
|
||||
if 'vertical-align' in cssdict:
|
||||
if cssdict['vertical-align'] == 'sup':
|
||||
cssdict['vertical-align'] = 'super'
|
||||
if self.lineh and 'line-height' not in cssdict:
|
||||
lineh = self.lineh / psize
|
||||
cssdict['line-height'] = "%0.5fem" % lineh
|
||||
|
Loading…
x
Reference in New Issue
Block a user