diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 3583ea2f4a..7eb2de1415 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -21,6 +21,7 @@ def MBP(name): return '{%s}%s' % (MBP_NS, name) HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td']) +SPECIAL_TAGS = set(['hr', 'br']) CONTENT_TAGS = set(['img', 'hr', 'br']) PAGE_BREAKS = set(['always', 'odd', 'even']) @@ -59,6 +60,9 @@ class FormatState(object): and self.href == other.href \ and self.valign == other.valign + def __ne__(self, other): + return not self.__eq__(other) + class MobiMLizer(object): def __init__(self): @@ -86,14 +90,26 @@ class MobiMLizer(object): return self.fnums[self.fmap[ptsize]] def mobimlize_measure(self, ptsize): - # All MobiML measures occur in the default font-space if isinstance(ptsize, basestring): return ptsize - return "%dem" % int(round(ptsize / self.profile.fbase)) + # All MobiML measures occur in the default font-space + fbase = self.profile.fbase + if ptsize < fbase: + return "%dpt" % int(round(ptsize * 2)) + return "%dem" % int(round(ptsize / fbase)) def mobimlize_content(self, tag, text, bstate, istates): istate = istates[-1] - if bstate.para is None: + if istate.ids: + body = bstate.body + index = max((0, len(body) - 2)) + for id in istate.ids: + body.insert(index, etree.Element('a', attrib={'id': id})) + istate.ids.clear() + para = bstate.para + if tag in SPECIAL_TAGS and not text: + para = para if para is not None else bstate.body + elif para is None: bstate.istate = None if bstate.pbreak: etree.SubElement(bstate.body, MBP('pagebreak')) @@ -102,7 +118,7 @@ class MobiMLizer(object): parent = bstate.nested[-1] if bstate.nested else bstate.body para = wrapper = etree.SubElement(parent, tag) bstate.nested.append(para) - elif bstate.left > 0: + elif bstate.left > 0 and istate.indent >= 0: para = wrapper = etree.SubElement(bstate.body, 'blockquote') left = int(round(bstate.left / self.profile.fbase)) - 1 while left > 0: @@ -118,16 +134,7 @@ class MobiMLizer(object): para.attrib['width'] = self.mobimlize_measure(istate.indent) if istate.halign != 'auto': wrapper.attrib['align'] = istate.halign - if istate.ids: - wrapper.attrib['id'] = istate.ids.pop() pstate = bstate.istate - para = bstate.para - if istate.ids: - body = bstate.body - index = max((0, len(body) - 2)) - for id in istate.ids: - body.insert(index, etree.Element('a', attrib={'id': id})) - istate.ids.clear() if tag in CONTENT_TAGS: bstate.inline = para pstate = bstate.istate = None @@ -143,7 +150,7 @@ class MobiMLizer(object): inline = etree.SubElement(inline, 'sup') elif valign == 'sub': inline = etree.SubElement(inline, 'sub') - elif fsize != 3: + if fsize != 3: inline = etree.SubElement(inline, 'font', size=str(fsize)) if istate.italic: inline = etree.SubElement(inline, 'i') diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 420159299e..e92bb6b28a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -313,7 +313,10 @@ class MobiReader(object): while flags: if flags & 1: num += sizeof_trailing_entry(data, size - num) - flags >>= 1 + flags >>= 1 + # Flag indicates overlapping multibyte character data + if self.book_header.extra_flags & 1: + num += ord(data[size - num - 1]) + 1 return num def text_section(self, index): diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index fb5c1118a2..3a529deac8 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -50,9 +50,7 @@ PALMDOC = 2 HUFFDIC = 17480 def encode(data): - # Using UTF-8 means needing to worry about multibyte characters crossing - # record boundaries, so let's not for now. - return data.encode('ascii', 'xmlcharrefreplace') + return data.encode('utf-8') # Almost like the one for MS LIT, but not quite. def decint(value): @@ -193,8 +191,8 @@ class Serializer(object): class MobiWriter(object): - def __init__(self, compress=None, logger=FauxLogger()): - self._compress = compress or UNCOMPRESSED + def __init__(self, compression=None, logger=FauxLogger()): + self._compression = compression or UNCOMPRESSED self._logger = logger def dump(self, oeb, path): @@ -231,7 +229,39 @@ class MobiWriter(object): if item.media_type.startswith('image/'): images[item.href] = index index += 1 - + + def _read_text_record(self, text): + pos = text.tell() + text.seek(0, 2) + npos = min((pos + RECORD_SIZE, text.tell())) + last = '' + while not last.decode('utf-8', 'ignore'): + size = len(last) + 1 + text.seek(npos - size) + last = text.read(size) + try: + last.decode('utf-8') + except UnicodeDecodeError: + pass + else: + text.seek(pos) + return text.read(RECORD_SIZE) + prev = len(last) + while True: + text.seek(npos - prev) + last = text.read(len(last) + 1) + try: + last.decode('utf-8') + except UnicodeDecodeError: + pass + else: + break + extra = len(last) - prev + text.seek(pos) + data = text.read(RECORD_SIZE + extra) + text.seek(npos) + return data + def _generate_text(self): serializer = Serializer(self._oeb, self._images) breaks = serializer.breaks @@ -240,14 +270,14 @@ class MobiWriter(object): text = StringIO(text) nrecords = 0 offset = 0 - data = text.read(RECORD_SIZE) + data = self._read_text_record(text) while len(data) > 0: - if self._compress == PALMDOC: + size = len(data) + if self._compression == PALMDOC: data = compress_doc(data) record = StringIO() record.write(data) - # Without the NUL Mobipocket Desktop 6.2 will thrash. Why? - record.write('\0') + record.write(pack('>B', max((0, size - RECORD_SIZE)))) nextra = 0 pbreak = 0 running = offset @@ -261,7 +291,7 @@ class MobiWriter(object): self._records.append(record.getvalue()) nrecords += 1 offset += RECORD_SIZE - data = text.read(RECORD_SIZE) + data = self._read_text_record(text) self._text_nrecords = nrecords def _rescale_image(self, data, maxsizeb, dimen=None): @@ -304,8 +334,8 @@ class MobiWriter(object): metadata = self._oeb.metadata exth = self._build_exth() record0 = StringIO() - record0.write(pack('>HHIHHHH', self._compress, 0, self._text_length, - self._text_nrecords, RECORD_SIZE, 0, 0)) + record0.write(pack('>HHIHHHH', self._compression, 0, + self._text_length, self._text_nrecords, RECORD_SIZE, 0, 0)) uid = random.randint(0, 0xffffffff) title = str(metadata.title[0]) record0.write('MOBI') @@ -320,7 +350,11 @@ class MobiWriter(object): record0.write(pack('>I', 0x50)) record0.write('\0' * 32) record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0)) - # TODO: What the hell are these fields? + # The '5' is a bitmask of extra record data at the end: + # - 0x1: (?) + # - 0x4: + # Of course, the formats aren't quite the same. + # TODO: What the hell are the rest of these fields? record0.write(pack('>IIIIIIIIIIIIIIIII', 0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff)) @@ -391,9 +425,10 @@ class MobiWriter(object): def main(argv=sys.argv): from calibre.ebooks.oeb.base import DirWriter inpath, outpath = argv[1:] - context = Context('MSReader', 'Cybook3') + context = Context('MSReader', 'MobiDesktop') oeb = OEBBook(inpath) - writer = MobiWriter(compress=PALMDOC) + #writer = MobiWriter(compression=PALMDOC) + writer = MobiWriter(compression=UNCOMPRESSED) #writer = DirWriter() fbase = context.dest.fbase fkey = context.dest.fnums.values() diff --git a/src/calibre/ebooks/oeb/profile.py b/src/calibre/ebooks/oeb/profile.py index 1d58900b28..5991a0f484 100644 --- a/src/calibre/ebooks/oeb/profile.py +++ b/src/calibre/ebooks/oeb/profile.py @@ -39,6 +39,11 @@ PROFILES = { Profile(width=480, height=652, dpi=100.0, fbase=13, fsizes=[10, 11, 13, 16, 18, 20, 22, 26]), + # Not really, but let's pretend + 'MobiDesktop': + Profile(width=340, height=400, dpi=100, fbase=12, + fsizes=[9, 10, 11, 12, 14, 17, 20, 24]), + # No clue on usable screen size and DPI 'Cybook3': Profile(width=584, height=754, dpi=168.451, fbase=12, diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 11fdaea066..8a2bc2a4fa 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -170,6 +170,9 @@ class CSSFlattener(object): left -= style['text-indent'] if self.unfloat and 'float' in cssdict and tag != 'img': del cssdict['float'] + if 'vertical-align' in cssdict: + if cssdict['vertical-align'] == 'sup': + cssdict['vertical-align'] = 'super' if self.lineh and 'line-height' not in cssdict: lineh = self.lineh / psize cssdict['line-height'] = "%0.5fem" % lineh