mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Significantly improve Mobipocket content conversion.
This commit is contained in:
parent
fc59f7b63d
commit
c4582ed2a8
@ -23,6 +23,8 @@ HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
|
|||||||
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
|
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
|
||||||
CONTENT_TAGS = set(['img', 'hr', 'br'])
|
CONTENT_TAGS = set(['img', 'hr', 'br'])
|
||||||
|
|
||||||
|
PAGE_BREAKS = set(['always', 'odd', 'even'])
|
||||||
|
|
||||||
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
|
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
|
||||||
|
|
||||||
class BlockState(object):
|
class BlockState(object):
|
||||||
@ -34,6 +36,7 @@ class BlockState(object):
|
|||||||
self.vpadding = 0.
|
self.vpadding = 0.
|
||||||
self.vmargin = 0.
|
self.vmargin = 0.
|
||||||
self.left = 0.
|
self.left = 0.
|
||||||
|
self.pbreak = False
|
||||||
self.istate = None
|
self.istate = None
|
||||||
|
|
||||||
class FormatState(object):
|
class FormatState(object):
|
||||||
@ -92,6 +95,9 @@ class MobiMLizer(object):
|
|||||||
istate = istates[-1]
|
istate = istates[-1]
|
||||||
if bstate.para is None:
|
if bstate.para is None:
|
||||||
bstate.istate = None
|
bstate.istate = None
|
||||||
|
if bstate.pbreak:
|
||||||
|
etree.SubElement(bstate.body, MBP('pagebreak'))
|
||||||
|
bstate.pbreak = False
|
||||||
if tag in NESTABLE_TAGS:
|
if tag in NESTABLE_TAGS:
|
||||||
parent = bstate.nested[-1] if bstate.nested else bstate.body
|
parent = bstate.nested[-1] if bstate.nested else bstate.body
|
||||||
para = wrapper = etree.SubElement(parent, tag)
|
para = wrapper = etree.SubElement(parent, tag)
|
||||||
@ -117,8 +123,11 @@ class MobiMLizer(object):
|
|||||||
pstate = bstate.istate
|
pstate = bstate.istate
|
||||||
para = bstate.para
|
para = bstate.para
|
||||||
if istate.ids:
|
if istate.ids:
|
||||||
|
body = bstate.body
|
||||||
|
index = max((0, len(body) - 2))
|
||||||
for id in istate.ids:
|
for id in istate.ids:
|
||||||
etree.SubElement(para, 'a', id=id)
|
body.insert(index, etree.Element('a', attrib={'id': id}))
|
||||||
|
istate.ids.clear()
|
||||||
if tag in CONTENT_TAGS:
|
if tag in CONTENT_TAGS:
|
||||||
bstate.inline = para
|
bstate.inline = para
|
||||||
pstate = bstate.istate = None
|
pstate = bstate.istate = None
|
||||||
@ -176,9 +185,12 @@ class MobiMLizer(object):
|
|||||||
left = margin + padding
|
left = margin + padding
|
||||||
bstate.left += left
|
bstate.left += left
|
||||||
bstate.vmargin = max((bstate.vmargin, style['margin-top']))
|
bstate.vmargin = max((bstate.vmargin, style['margin-top']))
|
||||||
if style['padding-top']:
|
padding = style['padding-top']
|
||||||
|
if isinstance(padding, (int, float)) and padding > 0:
|
||||||
bstate.vpadding += bstate.vmargin
|
bstate.vpadding += bstate.vmargin
|
||||||
bstate.vpadding = style['padding-top']
|
bstate.vpadding = padding
|
||||||
|
if style['page-break-before'] in PAGE_BREAKS:
|
||||||
|
bstate.pbreak = True
|
||||||
istate.fsize = self.mobimlize_font(style['font-size'])
|
istate.fsize = self.mobimlize_font(style['font-size'])
|
||||||
istate.italic = True if style['font-style'] == 'italic' else False
|
istate.italic = True if style['font-style'] == 'italic' else False
|
||||||
weight = style['font-weight']
|
weight = style['font-weight']
|
||||||
@ -207,6 +219,7 @@ class MobiMLizer(object):
|
|||||||
istate.attrib.clear()
|
istate.attrib.clear()
|
||||||
if tag == 'img' and 'src' in elem.attrib:
|
if tag == 'img' and 'src' in elem.attrib:
|
||||||
istate.attrib['src'] = elem.attrib['src']
|
istate.attrib['src'] = elem.attrib['src']
|
||||||
|
istate.attrib['align'] = 'baseline'
|
||||||
if tag == 'hr' and 'width' in style.cssdict():
|
if tag == 'hr' and 'width' in style.cssdict():
|
||||||
istate.attrib['width'] = mobimlize_measure(style['width'])
|
istate.attrib['width'] = mobimlize_measure(style['width'])
|
||||||
text = None
|
text = None
|
||||||
@ -225,19 +238,25 @@ class MobiMLizer(object):
|
|||||||
if child.tail:
|
if child.tail:
|
||||||
if istate.preserve:
|
if istate.preserve:
|
||||||
tail = child.tail
|
tail = child.tail
|
||||||
elif bstate.para is None and child.text.isspace():
|
elif bstate.para is None and child.tail.isspace():
|
||||||
tail = None
|
tail = None
|
||||||
else:
|
else:
|
||||||
tail = COLLAPSE.sub(' ', child.tail)
|
tail = COLLAPSE.sub(' ', child.tail)
|
||||||
if tail:
|
if tail:
|
||||||
self.mobimlize_content(tag, tail, bstate, istates)
|
self.mobimlize_content(tag, tail, bstate, istates)
|
||||||
|
if style['page-break-after'] in PAGE_BREAKS:
|
||||||
|
bstate.pbreak = True
|
||||||
if isblock:
|
if isblock:
|
||||||
|
para = bstate.para
|
||||||
|
if para is not None and para.text == u'\xa0':
|
||||||
|
para.getparent().replace(para, etree.Element('br'))
|
||||||
bstate.para = None
|
bstate.para = None
|
||||||
bstate.left -= left
|
bstate.left -= left
|
||||||
bstate.vmargin = max((bstate.vmargin, style['margin-bottom']))
|
bstate.vmargin = max((bstate.vmargin, style['margin-bottom']))
|
||||||
if style['padding-bottom']:
|
padding = style['padding-bottom']
|
||||||
|
if isinstance(padding, (int, float)) and padding > 0:
|
||||||
bstate.vpadding += bstate.vmargin
|
bstate.vpadding += bstate.vmargin
|
||||||
bstate.vpadding = style['padding-bottom']
|
bstate.vpadding = padding
|
||||||
if bstate.nested:
|
if bstate.nested:
|
||||||
bstate.nested.pop()
|
bstate.nested.pop()
|
||||||
istates.pop()
|
istates.pop()
|
||||||
|
@ -50,7 +50,8 @@ PALMDOC = 2
|
|||||||
HUFFDIC = 17480
|
HUFFDIC = 17480
|
||||||
|
|
||||||
def encode(data):
|
def encode(data):
|
||||||
return data.encode('ascii', 'xmlcharrefreplace')
|
#return data.encode('ascii', 'xmlcharrefreplace')
|
||||||
|
return data.encode('utf-8')
|
||||||
|
|
||||||
# Almost like the one for MS LIT, but not quite.
|
# Almost like the one for MS LIT, but not quite.
|
||||||
def decint(value):
|
def decint(value):
|
||||||
@ -92,12 +93,16 @@ class Serializer(object):
|
|||||||
|
|
||||||
def serialize_guide(self):
|
def serialize_guide(self):
|
||||||
buffer = self.buffer
|
buffer = self.buffer
|
||||||
|
hrefs = self.oeb.manifest.hrefs
|
||||||
buffer.write('<guide>')
|
buffer.write('<guide>')
|
||||||
for ref in self.oeb.guide.values():
|
for ref in self.oeb.guide.values():
|
||||||
|
path, frag = urldefrag(ref.href)
|
||||||
|
if hrefs[path].media_type not in OEB_DOCS:
|
||||||
|
continue
|
||||||
buffer.write('<reference title="%s" type="%s" '
|
buffer.write('<reference title="%s" type="%s" '
|
||||||
% (ref.title, ref.type))
|
% (ref.title, ref.type))
|
||||||
self.serialize_href(ref.href)
|
self.serialize_href(ref.href)
|
||||||
buffer.write('/>')
|
buffer.write(' />')
|
||||||
buffer.write('</guide>')
|
buffer.write('</guide>')
|
||||||
|
|
||||||
def serialize_href(self, href, base=None):
|
def serialize_href(self, href, base=None):
|
||||||
@ -238,19 +243,21 @@ class MobiWriter(object):
|
|||||||
while len(data) > 0:
|
while len(data) > 0:
|
||||||
if self._compress == PALMDOC:
|
if self._compress == PALMDOC:
|
||||||
data = compress_doc(data)
|
data = compress_doc(data)
|
||||||
|
record = StringIO()
|
||||||
|
record.write(data)
|
||||||
# Without the NUL Mobipocket Desktop 6.2 will thrash. Why?
|
# Without the NUL Mobipocket Desktop 6.2 will thrash. Why?
|
||||||
record = [data, '\0']
|
record.write('\0')
|
||||||
nextra = 0
|
nextra = 0
|
||||||
pbreak = 0
|
pbreak = 0
|
||||||
running = offset
|
running = offset
|
||||||
while breaks and (breaks[0] - offset) < RECORD_SIZE:
|
while breaks and (breaks[0] - offset) < RECORD_SIZE:
|
||||||
pbreak = (breaks.pop(0) - running) >> 3
|
pbreak = (breaks.pop(0) - running) >> 3
|
||||||
encoded = decint(pbreak)
|
encoded = decint(pbreak)
|
||||||
record.append(encoded)
|
record.write(encoded)
|
||||||
running += pbreak << 3
|
running += pbreak << 3
|
||||||
nextra += len(encoded)
|
nextra += len(encoded)
|
||||||
record.append(decint(nextra + 1))
|
record.write(decint(nextra + 1))
|
||||||
self._records.append(''.join(record))
|
self._records.append(record.getvalue())
|
||||||
nrecords += 1
|
nrecords += 1
|
||||||
offset += RECORD_SIZE
|
offset += RECORD_SIZE
|
||||||
data = text.read(RECORD_SIZE)
|
data = text.read(RECORD_SIZE)
|
||||||
@ -385,7 +392,7 @@ def main(argv=sys.argv):
|
|||||||
inpath, outpath = argv[1:]
|
inpath, outpath = argv[1:]
|
||||||
context = Context('MSReader', 'Cybook3')
|
context = Context('MSReader', 'Cybook3')
|
||||||
oeb = OEBBook(inpath)
|
oeb = OEBBook(inpath)
|
||||||
writer = MobiWriter()
|
writer = MobiWriter(compress=PALMDOC)
|
||||||
#writer = DirWriter()
|
#writer = DirWriter()
|
||||||
fbase = context.dest.fbase
|
fbase = context.dest.fbase
|
||||||
fkey = context.dest.fnums.values()
|
fkey = context.dest.fnums.values()
|
||||||
|
@ -42,7 +42,7 @@ PROFILES = {
|
|||||||
# No clue on usable screen size and DPI
|
# No clue on usable screen size and DPI
|
||||||
'Cybook3':
|
'Cybook3':
|
||||||
Profile(width=584, height=754, dpi=168.451, fbase=12,
|
Profile(width=584, height=754, dpi=168.451, fbase=12,
|
||||||
fsizes=[9, 10, 11, 12, 14, 16, 18, 20]),
|
fsizes=[9, 10, 11, 12, 14, 17, 20, 24]),
|
||||||
|
|
||||||
'Firefox':
|
'Firefox':
|
||||||
Profile(width=800, height=600, dpi=100.0, fbase=12,
|
Profile(width=800, height=600, dpi=100.0, fbase=12,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user