MOBI Input: Handle MOBI files that specify anchor point exactly at pagebreaks. These are apparently produced by John Wiley and Sons. Fixes #914036 (MOBI to EPUB conversion bad TOC item pagebreaks)

This commit is contained in:
Kovid Goyal 2012-01-12 12:27:15 +05:30
parent 2d9e47db52
commit 5d664cf830

View File

@ -244,7 +244,9 @@ class MetadataHeader(BookHeader):
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
PAGE_BREAK_PAT = re.compile(
r'<\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
@ -539,6 +541,9 @@ class MobiReader(object):
x.getparent().remove(x)
svg_tags = []
forwardable_anchors = []
pagebreak_anchors = []
BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'div', 'p'}
for i, tag in enumerate(root.iter(etree.Element)):
tag.attrib.pop('xmlns', '')
for x in tag.attrib:
@ -657,6 +662,10 @@ class MobiReader(object):
if not tag.text:
tag.tag = 'div'
if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
'div' and 'filepos-id' in attrib):
pagebreak_anchors.append(tag)
if 'filepos-id' in attrib:
attrib['id'] = attrib.pop('filepos-id')
if 'name' in attrib and attrib['name'] != attrib['id']:
@ -670,8 +679,7 @@ class MobiReader(object):
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos')
and not tag.text and (tag.tail is None or not
tag.tail.strip()) and getattr(tag.getnext(), 'tag',
None) in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'div', 'p')):
None) in BLOCK_TAGS):
# This is an empty anchor immediately before a block tag, move
# the id onto the block tag instead
forwardable_anchors.append(tag)
@ -704,6 +712,18 @@ class MobiReader(object):
if hasattr(parent, 'remove'):
parent.remove(tag)
for tag in pagebreak_anchors:
anchor = tag.attrib['id']
del tag.attrib['id']
if 'name' in tag.attrib:
del tag.attrib['name']
p = tag.getparent()
a = p.makeelement('a')
a.attrib['id'] = anchor
p.insert(p.index(tag)+1, a)
if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
forwardable_anchors.append(a)
for tag in forwardable_anchors:
block = tag.getnext()
tag.getparent().remove(tag)
@ -919,7 +939,7 @@ class MobiReader(object):
def replace_page_breaks(self):
self.processed_html = self.PAGE_BREAK_PAT.sub(
'<div class="mbp_pagebreak" />',
r'<div \1 class="mbp_pagebreak" />',
self.processed_html)
def add_anchors(self):
@ -1047,3 +1067,19 @@ def get_metadata(stream):
im.convert('RGB').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue())
return mi
def test_mbp_regex():
for raw, m in {
'<mbp:pagebreak></mbp:pagebreak>':'',
'<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
'<mbp:pagebreak> </mbp:pagebreak>':'',
'<mbp:pagebreak>xxx':'xxx',
'<mbp:pagebreak/>xxx':'xxx',
'<mbp:pagebreak sdf/ >xxx':' sdfxxx',
'<mbp:pagebreak / >':' ',
}.iteritems():
ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
if ans != m:
raise Exception('%r != %r for %r'%(ans, m, raw))