MOBI Input: Handle MOBI files that specify anchor points exactly at pagebreaks. These are apparently produced by John Wiley and Sons. Fixes #914036 (MOBI to EPUB conversion bad TOC item pagebreaks)

This commit is contained in:
Kovid Goyal 2012-01-12 12:27:15 +05:30
parent 2d9e47db52
commit 5d664cf830

View File

@ -244,7 +244,9 @@ class MetadataHeader(BookHeader):
class MobiReader(object): class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(
r'<\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None, def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
@ -539,6 +541,9 @@ class MobiReader(object):
x.getparent().remove(x) x.getparent().remove(x)
svg_tags = [] svg_tags = []
forwardable_anchors = [] forwardable_anchors = []
pagebreak_anchors = []
BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'div', 'p'}
for i, tag in enumerate(root.iter(etree.Element)): for i, tag in enumerate(root.iter(etree.Element)):
tag.attrib.pop('xmlns', '') tag.attrib.pop('xmlns', '')
for x in tag.attrib: for x in tag.attrib:
@ -657,6 +662,10 @@ class MobiReader(object):
if not tag.text: if not tag.text:
tag.tag = 'div' tag.tag = 'div'
if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
'div' and 'filepos-id' in attrib):
pagebreak_anchors.append(tag)
if 'filepos-id' in attrib: if 'filepos-id' in attrib:
attrib['id'] = attrib.pop('filepos-id') attrib['id'] = attrib.pop('filepos-id')
if 'name' in attrib and attrib['name'] != attrib['id']: if 'name' in attrib and attrib['name'] != attrib['id']:
@ -670,8 +679,7 @@ class MobiReader(object):
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos')
and not tag.text and (tag.tail is None or not and not tag.text and (tag.tail is None or not
tag.tail.strip()) and getattr(tag.getnext(), 'tag', tag.tail.strip()) and getattr(tag.getnext(), 'tag',
None) in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', None) in BLOCK_TAGS):
'div', 'p')):
# This is an empty anchor immediately before a block tag, move # This is an empty anchor immediately before a block tag, move
# the id onto the block tag instead # the id onto the block tag instead
forwardable_anchors.append(tag) forwardable_anchors.append(tag)
@ -704,6 +712,18 @@ class MobiReader(object):
if hasattr(parent, 'remove'): if hasattr(parent, 'remove'):
parent.remove(tag) parent.remove(tag)
for tag in pagebreak_anchors:
anchor = tag.attrib['id']
del tag.attrib['id']
if 'name' in tag.attrib:
del tag.attrib['name']
p = tag.getparent()
a = p.makeelement('a')
a.attrib['id'] = anchor
p.insert(p.index(tag)+1, a)
if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
forwardable_anchors.append(a)
for tag in forwardable_anchors: for tag in forwardable_anchors:
block = tag.getnext() block = tag.getnext()
tag.getparent().remove(tag) tag.getparent().remove(tag)
@ -919,7 +939,7 @@ class MobiReader(object):
def replace_page_breaks(self): def replace_page_breaks(self):
self.processed_html = self.PAGE_BREAK_PAT.sub( self.processed_html = self.PAGE_BREAK_PAT.sub(
'<div class="mbp_pagebreak" />', r'<div \1 class="mbp_pagebreak" />',
self.processed_html) self.processed_html)
def add_anchors(self): def add_anchors(self):
@ -1047,3 +1067,19 @@ def get_metadata(stream):
im.convert('RGB').save(obuf, format='JPEG') im.convert('RGB').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue()) mi.cover_data = ('jpg', obuf.getvalue())
return mi return mi
def test_mbp_regex():
    """
    Sanity check for MobiReader.PAGE_BREAK_PAT.

    Each sample is passed through PAGE_BREAK_PAT.sub() with the first
    capture group as the replacement, so every match is reduced to
    whatever group 1 captured, and the result is compared with the
    expected string.

    Raises Exception naming the first sample whose result differs.
    """
    # (raw markup, expected text after replacing each match with group 1).
    # A tuple of pairs is used instead of a dict so the check runs in a
    # fixed order and does not rely on the Python 2 only dict.iteritems().
    cases = (
        ('<mbp:pagebreak></mbp:pagebreak>', ''),
        ('<mbp:pagebreak xxx></mbp:pagebreak>yyy', ' xxxyyy'),
        ('<mbp:pagebreak> </mbp:pagebreak>', ''),
        ('<mbp:pagebreak>xxx', 'xxx'),
        ('<mbp:pagebreak/>xxx', 'xxx'),
        ('<mbp:pagebreak sdf/ >xxx', ' sdfxxx'),
        ('<mbp:pagebreak / >', ' '),
    )
    for raw, m in cases:
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != m:
            raise Exception('%r != %r for %r'%(ans, m, raw))