Fix #1712 (MOBI error dump from Mac) and make detection of title in MOBI files more robust

This commit is contained in:
Kovid Goyal 2009-01-27 18:00:25 -08:00
commit e82972af80
5 changed files with 64 additions and 49 deletions

View File

@ -28,7 +28,7 @@ from calibre import sanitize_file_name
class EXTHHeader(object): class EXTHHeader(object):
def __init__(self, raw, codec): def __init__(self, raw, codec, title):
self.doctype = raw[:4] self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12]) self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:] raw = raw[12:]
@ -52,16 +52,8 @@ class EXTHHeader(object):
self.thumbnail_offset, = struct.unpack('>L', content) self.thumbnail_offset, = struct.unpack('>L', content)
#else: #else:
# print 'unknown record', id, repr(content) # print 'unknown record', id, repr(content)
title = re.search(r'\0+([^\0]+)\0+', raw[pos:])
if title: if title:
title = title.group(1).decode(codec, 'replace')
if len(title) > 2:
self.mi.title = title self.mi.title = title
else:
title = re.search(r'\0+([^\0]+)\0+', ''.join(reversed(raw[pos:])))
if title:
self.mi.title = ''.join(reversed(title.group(1).decode(codec, 'replace')))
def process_metadata(self, id, content, codec): def process_metadata(self, id, content, codec):
if id == 100: if id == 100:
@ -121,6 +113,9 @@ class BookHeader(object):
if self.compression_type == 'DH': if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78]) self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0] langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF sublangid = (langcode >> 10) & 0xFF
@ -131,7 +126,7 @@ class BookHeader(object):
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None self.exth = None
if self.exth_flag & 0x40: if self.exth_flag & 0x40:
self.exth = EXTHHeader(raw[16+self.length:], self.codec) self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language self.exth.mi.language = self.language

View File

@ -23,6 +23,7 @@ from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.base import Logger, OEBBook from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
@ -178,7 +179,7 @@ class Serializer(object):
def serialize_href(self, href, base=None): def serialize_href(self, href, base=None):
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
path, frag = urldefrag(href) path, frag = urldefrag(urlnormalize(href))
if path and base: if path and base:
path = base.abshref(path) path = base.abshref(path)
if path and path not in hrefs: if path and path not in hrefs:

View File

@ -447,7 +447,7 @@ class Manifest(object):
return cmp(skey, okey) return cmp(skey, okey)
def relhref(self, href): def relhref(self, href):
if '/' not in self.href: if '/' not in self.href or ':' in href:
return href return href
base = os.path.dirname(self.href).split('/') base = os.path.dirname(self.href).split('/')
target, frag = urldefrag(href) target, frag = urldefrag(href)
@ -463,7 +463,7 @@ class Manifest(object):
return relhref return relhref
def abshref(self, href): def abshref(self, href):
if '/' not in self.href: if '/' not in self.href or ':' in href:
return href return href
dirname = os.path.dirname(self.href) dirname = os.path.dirname(self.href)
href = os.path.join(dirname, href) href = os.path.join(dirname, href)
@ -796,6 +796,10 @@ class TOC(object):
class OEBBook(object): class OEBBook(object):
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
COVER_IMG_XP = XPath('h:body//h:img[@src][position() = 1]')
def __init__(self, opfpath=None, container=None, encoding=None, def __init__(self, opfpath=None, container=None, encoding=None,
logger=FauxLogger()): logger=FauxLogger()):
if opfpath and not container: if opfpath and not container:
@ -971,7 +975,7 @@ class OEBBook(object):
ncx = item.data ncx = item.data
self.manifest.remove(item) self.manifest.remove(item)
title = xpath(ncx, 'ncx:docTitle/ncx:text/text()') title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
title = title[0].strip() if title else unicode(self.metadata.title) title = title[0].strip() if title else unicode(self.metadata.title[0])
self.toc = toc = TOC(title) self.toc = toc = TOC(title)
navmaps = xpath(ncx, 'ncx:navMap') navmaps = xpath(ncx, 'ncx:navMap')
for navmap in navmaps: for navmap in navmaps:
@ -1051,40 +1055,54 @@ class OEBBook(object):
if self._toc_from_html(opf): return if self._toc_from_html(opf): return
self._toc_from_spine(opf) self._toc_from_spine(opf)
def _ensure_cover_image(self): def _locate_cover_image(self):
cover = None if self.metadata.cover:
id = str(self.metadata.cover[0])
item = self.manifest.ids.get(id, None)
if item is not None:
return item
hcover = self.spine[0] hcover = self.spine[0]
if 'cover' in self.guide: if 'cover' in self.guide:
href = self.guide['cover'].href href = self.guide['cover'].href
item = self.manifest.hrefs[href] item = self.manifest.hrefs[href]
media_type = item.media_type media_type = item.media_type
if media_type in OEB_RASTER_IMAGES: if media_type in OEB_IMAGES:
cover = item return item
elif media_type in OEB_DOCS: elif media_type in OEB_DOCS:
hcover = item hcover = item
html = hcover.data html = hcover.data
if cover is not None: if MS_COVER_TYPE in self.guide:
pass
elif self.metadata.cover:
id = str(self.metadata.cover[0])
cover = self.manifest.ids[id]
elif MS_COVER_TYPE in self.guide:
href = self.guide[MS_COVER_TYPE].href href = self.guide[MS_COVER_TYPE].href
cover = self.manifest.hrefs[href] item = self.manifest.hrefs.get(href, None)
elif xpath(html, '//h:img[position()=1]'): if item is not None and item.media_type in OEB_IMAGES:
img = xpath(html, '//h:img[position()=1]')[0] return item
href = hcover.abshref(img.get('src')) if self.COVER_SVG_XP(html):
cover = self.manifest.hrefs[href] svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
elif xpath(html, '//h:object[position()=1]'):
object = xpath(html, '//h:object[position()=1]')[0]
href = hcover.abshref(object.get('data'))
cover = self.manifest.hrefs[href]
elif xpath(html, '//svg:svg[position()=1]'):
svg = copy.deepcopy(xpath(html, '//svg:svg[position()=1]')[0])
href = os.path.splitext(hcover.href)[0] + '.svg' href = os.path.splitext(hcover.href)[0] + '.svg'
id, href = self.manifest.generate(hcover.id, href) id, href = self.manifest.generate(hcover.id, href)
cover = self.manifest.add(id, href, SVG_MIME, data=svg) item = self.manifest.add(id, href, SVG_MIME, data=svg)
if cover and not self.metadata.cover: return item
if self.COVER_OBJECT_XP(html):
object = self.COVER_OBJECT_XP(html)[0]
href = hcover.abshref(object.get('data'))
item = self.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
if self.COVER_IMG_XP(html):
img = self.COVER_IMG_XP(html)[0]
href = hcover.abshref(img.get('src'))
item = self.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
return None
def _ensure_cover_image(self):
cover = self._locate_cover_image()
if not cover:
return
if self.metadata.cover:
self.metadata.cover[0].value = cover.id
return
self.metadata.add('cover', cover.id) self.metadata.add('cover', cover.id)
def _all_from_opf(self, opf): def _all_from_opf(self, opf):

View File

@ -23,6 +23,7 @@ from PyQt4.QtGui import QApplication
from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
IMAGE_TAGS = set([XHTML('img'), XHTML('object')]) IMAGE_TAGS = set([XHTML('img'), XHTML('object')])
@ -78,7 +79,7 @@ class SVGRasterizer(object):
svg = item.data svg = item.data
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'): for elem in xpath(svg, '//svg:*[@xl:href]'):
href = elem.attrib[XLINK('href')] href = urlnormalize(elem.attrib[XLINK('href')])
path, frag = urldefrag(href) path, frag = urldefrag(href)
if not path: if not path:
continue continue
@ -100,15 +101,15 @@ class SVGRasterizer(object):
def rasterize_item(self, item, stylizer): def rasterize_item(self, item, stylizer):
html = item.data html = item.data
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
for elem in xpath(html, '//h:img'): for elem in xpath(html, '//h:img[@src]'):
src = elem.get('src', None) src = urlnormalize(elem.attrib['src'])
image = hrefs.get(item.abshref(src), None) if src else None image = hrefs.get(item.abshref(src), None)
if image and image.media_type == SVG_MIME: if image and image.media_type == SVG_MIME:
style = stylizer.style(elem) style = stylizer.style(elem)
self.rasterize_external(elem, style, item, image) self.rasterize_external(elem, style, item, image)
for elem in xpath(html, '//h:object[@type="%s"]' % SVG_MIME): for elem in xpath(html, '//h:object[@type="%s" and @data]' % SVG_MIME):
data = elem.get('data', None) data = urlnormalize(elem.attrib['data'])
image = hrefs.get(item.abshref(data), None) if data else None image = hrefs.get(item.abshref(data), None)
if image and image.media_type == SVG_MIME: if image and image.media_type == SVG_MIME:
style = stylizer.style(elem) style = stylizer.style(elem)
self.rasterize_external(elem, style, item, image) self.rasterize_external(elem, style, item, image)

View File

@ -54,7 +54,7 @@ class ManifestTrimmer(object):
new.add(found) new.add(found)
elif item.media_type == CSS_MIME: elif item.media_type == CSS_MIME:
def replacer(uri): def replacer(uri):
absuri = item.abshref(uri) absuri = item.abshref(urlnormalize(uri))
if absuri in oeb.manifest.hrefs: if absuri in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href] found = oeb.manifest.hrefs[href]
if found not in used: if found not in used: