mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
MOBI Input: Handle MOBI files that specify anchor points exactly at pagebreaks. These are apparently produced by John Wiley and Sons. Fixes #914036 (MOBI to EPUB conversion bad TOC item pagebreaks)
This commit is contained in:
parent
2d9e47db52
commit
5d664cf830
@ -244,7 +244,9 @@ class MetadataHeader(BookHeader):
|
|||||||
|
|
||||||
|
|
||||||
class MobiReader(object):
|
class MobiReader(object):
|
||||||
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
PAGE_BREAK_PAT = re.compile(
|
||||||
|
r'<\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
|
||||||
|
re.IGNORECASE)
|
||||||
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
||||||
|
|
||||||
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
|
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
|
||||||
@ -539,6 +541,9 @@ class MobiReader(object):
|
|||||||
x.getparent().remove(x)
|
x.getparent().remove(x)
|
||||||
svg_tags = []
|
svg_tags = []
|
||||||
forwardable_anchors = []
|
forwardable_anchors = []
|
||||||
|
pagebreak_anchors = []
|
||||||
|
BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||||
|
'div', 'p'}
|
||||||
for i, tag in enumerate(root.iter(etree.Element)):
|
for i, tag in enumerate(root.iter(etree.Element)):
|
||||||
tag.attrib.pop('xmlns', '')
|
tag.attrib.pop('xmlns', '')
|
||||||
for x in tag.attrib:
|
for x in tag.attrib:
|
||||||
@ -657,6 +662,10 @@ class MobiReader(object):
|
|||||||
if not tag.text:
|
if not tag.text:
|
||||||
tag.tag = 'div'
|
tag.tag = 'div'
|
||||||
|
|
||||||
|
if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
|
||||||
|
'div' and 'filepos-id' in attrib):
|
||||||
|
pagebreak_anchors.append(tag)
|
||||||
|
|
||||||
if 'filepos-id' in attrib:
|
if 'filepos-id' in attrib:
|
||||||
attrib['id'] = attrib.pop('filepos-id')
|
attrib['id'] = attrib.pop('filepos-id')
|
||||||
if 'name' in attrib and attrib['name'] != attrib['id']:
|
if 'name' in attrib and attrib['name'] != attrib['id']:
|
||||||
@ -670,8 +679,7 @@ class MobiReader(object):
|
|||||||
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos')
|
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos')
|
||||||
and not tag.text and (tag.tail is None or not
|
and not tag.text and (tag.tail is None or not
|
||||||
tag.tail.strip()) and getattr(tag.getnext(), 'tag',
|
tag.tail.strip()) and getattr(tag.getnext(), 'tag',
|
||||||
None) in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
None) in BLOCK_TAGS):
|
||||||
'div', 'p')):
|
|
||||||
# This is an empty anchor immediately before a block tag, move
|
# This is an empty anchor immediately before a block tag, move
|
||||||
# the id onto the block tag instead
|
# the id onto the block tag instead
|
||||||
forwardable_anchors.append(tag)
|
forwardable_anchors.append(tag)
|
||||||
@ -704,6 +712,18 @@ class MobiReader(object):
|
|||||||
if hasattr(parent, 'remove'):
|
if hasattr(parent, 'remove'):
|
||||||
parent.remove(tag)
|
parent.remove(tag)
|
||||||
|
|
||||||
|
for tag in pagebreak_anchors:
|
||||||
|
anchor = tag.attrib['id']
|
||||||
|
del tag.attrib['id']
|
||||||
|
if 'name' in tag.attrib:
|
||||||
|
del tag.attrib['name']
|
||||||
|
p = tag.getparent()
|
||||||
|
a = p.makeelement('a')
|
||||||
|
a.attrib['id'] = anchor
|
||||||
|
p.insert(p.index(tag)+1, a)
|
||||||
|
if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
|
||||||
|
forwardable_anchors.append(a)
|
||||||
|
|
||||||
for tag in forwardable_anchors:
|
for tag in forwardable_anchors:
|
||||||
block = tag.getnext()
|
block = tag.getnext()
|
||||||
tag.getparent().remove(tag)
|
tag.getparent().remove(tag)
|
||||||
@ -919,7 +939,7 @@ class MobiReader(object):
|
|||||||
|
|
||||||
def replace_page_breaks(self):
|
def replace_page_breaks(self):
|
||||||
self.processed_html = self.PAGE_BREAK_PAT.sub(
|
self.processed_html = self.PAGE_BREAK_PAT.sub(
|
||||||
'<div class="mbp_pagebreak" />',
|
r'<div \1 class="mbp_pagebreak" />',
|
||||||
self.processed_html)
|
self.processed_html)
|
||||||
|
|
||||||
def add_anchors(self):
|
def add_anchors(self):
|
||||||
@ -1047,3 +1067,19 @@ def get_metadata(stream):
|
|||||||
im.convert('RGB').save(obuf, format='JPEG')
|
im.convert('RGB').save(obuf, format='JPEG')
|
||||||
mi.cover_data = ('jpg', obuf.getvalue())
|
mi.cover_data = ('jpg', obuf.getvalue())
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
|
def test_mbp_regex():
    """Self-check for ``MobiReader.PAGE_BREAK_PAT``.

    Every sample is passed through ``PAGE_BREAK_PAT.sub`` with a replacement
    that keeps only capture group 1 (the text that follows ``mbp:pagebreak``
    inside the opening tag). The substituted string must equal the expected
    value listed next to the sample.

    Raises:
        Exception: for the first sample whose substitution result differs
            from the expected string.
    """
    # (raw input, expected result after replacing each match with group 1).
    # A sequence of pairs is used instead of a dict so the check runs on both
    # Python 2 and Python 3 (no dict.iteritems()) and in a fixed order.
    samples = (
        ('<mbp:pagebreak></mbp:pagebreak>', ''),
        ('<mbp:pagebreak xxx></mbp:pagebreak>yyy', ' xxxyyy'),
        ('<mbp:pagebreak> </mbp:pagebreak>', ''),
        ('<mbp:pagebreak>xxx', 'xxx'),
        ('<mbp:pagebreak/>xxx', 'xxx'),
        ('<mbp:pagebreak sdf/ >xxx', ' sdfxxx'),
        ('<mbp:pagebreak / >', ' '),
    )
    for raw, expected in samples:
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != expected:
            raise Exception('%r != %r for %r' % (ans, expected, raw))
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user