mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
MOBI Input: Handle poor quality PRC files from Baen better. Images are now correctly extracted despite the missing image_offset and metadata embedded in the content is used when converting (though not when simply reading metadata as this would be too slow)
This commit is contained in:
parent
3c23b14db0
commit
cfeaed5c71
@ -22,7 +22,7 @@ from calibre.ebooks.mobi.huffcdic import HuffReader
|
|||||||
from calibre.ebooks.mobi.palmdoc import decompress_doc
|
from calibre.ebooks.mobi.palmdoc import decompress_doc
|
||||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
|
||||||
from calibre.ebooks.metadata.toc import TOC
|
from calibre.ebooks.metadata.toc import TOC
|
||||||
from calibre import sanitize_file_name
|
from calibre import sanitize_file_name
|
||||||
|
|
||||||
@ -215,7 +215,11 @@ class MobiReader(object):
|
|||||||
self.upshift_markup(root)
|
self.upshift_markup(root)
|
||||||
guides = root.xpath('//guide')
|
guides = root.xpath('//guide')
|
||||||
guide = guides[0] if guides else None
|
guide = guides[0] if guides else None
|
||||||
for elem in guides + root.xpath('//metadata'):
|
metadata_elems = root.xpath('//metadata')
|
||||||
|
self.embedded_mi = None
|
||||||
|
if metadata_elems and self.book_header.exth is None:
|
||||||
|
self.read_embedded_metadata(root, metadata_elems[0], guide)
|
||||||
|
for elem in guides + metadata_elems:
|
||||||
elem.getparent().remove(elem)
|
elem.getparent().remove(elem)
|
||||||
htmlfile = os.path.join(output_dir,
|
htmlfile = os.path.join(output_dir,
|
||||||
sanitize_file_name(self.name)+'.html')
|
sanitize_file_name(self.name)+'.html')
|
||||||
@ -235,7 +239,7 @@ class MobiReader(object):
|
|||||||
f.write(raw)
|
f.write(raw)
|
||||||
self.htmlfile = htmlfile
|
self.htmlfile = htmlfile
|
||||||
|
|
||||||
if self.book_header.exth is not None:
|
if self.book_header.exth is not None or self.embedded_mi is not None:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print 'Creating OPF...'
|
print 'Creating OPF...'
|
||||||
ncx = cStringIO.StringIO()
|
ncx = cStringIO.StringIO()
|
||||||
@ -245,6 +249,32 @@ class MobiReader(object):
|
|||||||
if ncx:
|
if ncx:
|
||||||
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
|
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
|
||||||
|
|
||||||
|
def read_embedded_metadata(self, root, elem, guide):
|
||||||
|
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
|
||||||
|
stream = cStringIO.StringIO(raw)
|
||||||
|
opf = OPF(stream)
|
||||||
|
self.embedded_mi = MetaInformation(opf)
|
||||||
|
if guide is not None:
|
||||||
|
for ref in guide.xpath('descendant::reference'):
|
||||||
|
if 'cover' in ref.get('type', '').lower():
|
||||||
|
href = ref.get('href', '')
|
||||||
|
if href.startswith('#'):
|
||||||
|
href = href[1:]
|
||||||
|
anchors = root.xpath('//*[@id="%s"]'%href)
|
||||||
|
if anchors:
|
||||||
|
cpos = anchors[0]
|
||||||
|
reached = False
|
||||||
|
for elem in root.iter():
|
||||||
|
if elem is cpos:
|
||||||
|
reached = True
|
||||||
|
if reached and elem.tag == 'img':
|
||||||
|
cover = elem.get('src', None)
|
||||||
|
self.embedded_mi.cover = cover
|
||||||
|
elem.getparent().remove(elem)
|
||||||
|
break
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
def cleanup_html(self):
|
def cleanup_html(self):
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print 'Cleaning up HTML...'
|
print 'Cleaning up HTML...'
|
||||||
@ -333,10 +363,12 @@ class MobiReader(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def create_opf(self, htmlfile, guide=None, root=None):
|
def create_opf(self, htmlfile, guide=None, root=None):
|
||||||
mi = self.book_header.exth.mi
|
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
|
||||||
opf = OPFCreator(os.path.dirname(htmlfile), mi)
|
opf = OPFCreator(os.path.dirname(htmlfile), mi)
|
||||||
if hasattr(self.book_header.exth, 'cover_offset'):
|
if hasattr(self.book_header.exth, 'cover_offset'):
|
||||||
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
|
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
|
||||||
|
elif mi.cover is not None:
|
||||||
|
opf.cover = mi.cover
|
||||||
manifest = [(htmlfile, 'text/x-oeb1-document')]
|
manifest = [(htmlfile, 'text/x-oeb1-document')]
|
||||||
bp = os.path.dirname(htmlfile)
|
bp = os.path.dirname(htmlfile)
|
||||||
for i in getattr(self, 'image_names', []):
|
for i in getattr(self, 'image_names', []):
|
||||||
@ -481,7 +513,11 @@ class MobiReader(object):
|
|||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
image_index = 0
|
image_index = 0
|
||||||
self.image_names = []
|
self.image_names = []
|
||||||
for i in range(self.book_header.first_image_index, self.num_sections):
|
start = self.book_header.first_image_index
|
||||||
|
if start > self.num_sections or start < 0:
|
||||||
|
# BAEN PRC files have bad headers
|
||||||
|
start=0
|
||||||
|
for i in range(start, self.num_sections):
|
||||||
if i in processed_records:
|
if i in processed_records:
|
||||||
continue
|
continue
|
||||||
processed_records.append(i)
|
processed_records.append(i)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user