mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
MOBI Input: Handle MOBI files that specify anchor points exactly at pagebreaks. These are apparently produced by John Wiley and Sons. Fixes #914036 (MOBI to EPUB conversion bad TOC item pagebreaks)
This commit is contained in:
parent
2d9e47db52
commit
5d664cf830
@ -244,7 +244,9 @@ class MetadataHeader(BookHeader):
|
||||
|
||||
|
||||
class MobiReader(object):
|
||||
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
||||
PAGE_BREAK_PAT = re.compile(
|
||||
r'<\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
|
||||
re.IGNORECASE)
|
||||
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
||||
|
||||
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
|
||||
@ -539,6 +541,9 @@ class MobiReader(object):
|
||||
x.getparent().remove(x)
|
||||
svg_tags = []
|
||||
forwardable_anchors = []
|
||||
pagebreak_anchors = []
|
||||
BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'div', 'p'}
|
||||
for i, tag in enumerate(root.iter(etree.Element)):
|
||||
tag.attrib.pop('xmlns', '')
|
||||
for x in tag.attrib:
|
||||
@ -657,6 +662,10 @@ class MobiReader(object):
|
||||
if not tag.text:
|
||||
tag.tag = 'div'
|
||||
|
||||
if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
|
||||
'div' and 'filepos-id' in attrib):
|
||||
pagebreak_anchors.append(tag)
|
||||
|
||||
if 'filepos-id' in attrib:
|
||||
attrib['id'] = attrib.pop('filepos-id')
|
||||
if 'name' in attrib and attrib['name'] != attrib['id']:
|
||||
@ -670,8 +679,7 @@ class MobiReader(object):
|
||||
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos')
|
||||
and not tag.text and (tag.tail is None or not
|
||||
tag.tail.strip()) and getattr(tag.getnext(), 'tag',
|
||||
None) in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'div', 'p')):
|
||||
None) in BLOCK_TAGS):
|
||||
# This is an empty anchor immediately before a block tag, move
|
||||
# the id onto the block tag instead
|
||||
forwardable_anchors.append(tag)
|
||||
@ -704,6 +712,18 @@ class MobiReader(object):
|
||||
if hasattr(parent, 'remove'):
|
||||
parent.remove(tag)
|
||||
|
||||
for tag in pagebreak_anchors:
|
||||
anchor = tag.attrib['id']
|
||||
del tag.attrib['id']
|
||||
if 'name' in tag.attrib:
|
||||
del tag.attrib['name']
|
||||
p = tag.getparent()
|
||||
a = p.makeelement('a')
|
||||
a.attrib['id'] = anchor
|
||||
p.insert(p.index(tag)+1, a)
|
||||
if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
|
||||
forwardable_anchors.append(a)
|
||||
|
||||
for tag in forwardable_anchors:
|
||||
block = tag.getnext()
|
||||
tag.getparent().remove(tag)
|
||||
@ -919,7 +939,7 @@ class MobiReader(object):
|
||||
|
||||
def replace_page_breaks(self):
|
||||
self.processed_html = self.PAGE_BREAK_PAT.sub(
|
||||
'<div class="mbp_pagebreak" />',
|
||||
r'<div \1 class="mbp_pagebreak" />',
|
||||
self.processed_html)
|
||||
|
||||
def add_anchors(self):
|
||||
@ -1047,3 +1067,19 @@ def get_metadata(stream):
|
||||
im.convert('RGB').save(obuf, format='JPEG')
|
||||
mi.cover_data = ('jpg', obuf.getvalue())
|
||||
return mi
|
||||
|
||||
def test_mbp_regex():
    """Self-check for ``MobiReader.PAGE_BREAK_PAT``.

    For every sample, each match of the pattern is replaced by its first
    capture group and the resulting string is compared with the expected
    value.

    Raises:
        Exception: for the first sample whose substitution result differs
            from the expected string.
    """
    # Maps raw markup -> expected text after substituting matches with \1.
    cases = {
        '<mbp:pagebreak></mbp:pagebreak>': '',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy': ' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>': '',
        '<mbp:pagebreak>xxx': 'xxx',
        '<mbp:pagebreak/>xxx': 'xxx',
        '<mbp:pagebreak sdf/ >xxx': ' sdfxxx',
        '<mbp:pagebreak / >': ' ',
    }
    # dict.items() is available on both Python 2 and Python 3, whereas
    # dict.iteritems() exists only on Python 2.
    for raw, expected in cases.items():
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != expected:
            raise Exception('%r != %r for %r'%(ans, expected, raw))
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user