Fix #1712 (MOBI error dump from Mac) and make detection of title in MOBI files more robust

This commit is contained in:
Kovid Goyal 2009-01-27 18:00:25 -08:00
commit e82972af80
5 changed files with 64 additions and 49 deletions

View File

@ -28,7 +28,7 @@ from calibre import sanitize_file_name
class EXTHHeader(object):
def __init__(self, raw, codec):
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
@ -52,17 +52,9 @@ class EXTHHeader(object):
self.thumbnail_offset, = struct.unpack('>L', content)
#else:
# print 'unknown record', id, repr(content)
title = re.search(r'\0+([^\0]+)\0+', raw[pos:])
if title:
title = title.group(1).decode(codec, 'replace')
if len(title) > 2:
self.mi.title = title
else:
title = re.search(r'\0+([^\0]+)\0+', ''.join(reversed(raw[pos:])))
if title:
self.mi.title = ''.join(reversed(title.group(1).decode(codec, 'replace')))
self.mi.title = title
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
@ -121,6 +113,9 @@ class BookHeader(object):
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
@ -131,7 +126,7 @@ class BookHeader(object):
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if self.exth_flag & 0x40:
self.exth = EXTHHeader(raw[16+self.length:], self.codec)
self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language

View File

@ -23,6 +23,7 @@ from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
@ -178,7 +179,7 @@ class Serializer(object):
def serialize_href(self, href, base=None):
hrefs = self.oeb.manifest.hrefs
path, frag = urldefrag(href)
path, frag = urldefrag(urlnormalize(href))
if path and base:
path = base.abshref(path)
if path and path not in hrefs:

View File

@ -447,7 +447,7 @@ class Manifest(object):
return cmp(skey, okey)
def relhref(self, href):
if '/' not in self.href:
if '/' not in self.href or ':' in href:
return href
base = os.path.dirname(self.href).split('/')
target, frag = urldefrag(href)
@ -463,7 +463,7 @@ class Manifest(object):
return relhref
def abshref(self, href):
if '/' not in self.href:
if '/' not in self.href or ':' in href:
return href
dirname = os.path.dirname(self.href)
href = os.path.join(dirname, href)
@ -796,6 +796,10 @@ class TOC(object):
class OEBBook(object):
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
COVER_IMG_XP = XPath('h:body//h:img[@src][position() = 1]')
def __init__(self, opfpath=None, container=None, encoding=None,
logger=FauxLogger()):
if opfpath and not container:
@ -971,7 +975,7 @@ class OEBBook(object):
ncx = item.data
self.manifest.remove(item)
title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
title = title[0].strip() if title else unicode(self.metadata.title)
title = title[0].strip() if title else unicode(self.metadata.title[0])
self.toc = toc = TOC(title)
navmaps = xpath(ncx, 'ncx:navMap')
for navmap in navmaps:
@ -1051,42 +1055,56 @@ class OEBBook(object):
if self._toc_from_html(opf): return
self._toc_from_spine(opf)
def _ensure_cover_image(self):
cover = None
def _locate_cover_image(self):
if self.metadata.cover:
id = str(self.metadata.cover[0])
item = self.manifest.ids.get(id, None)
if item is not None:
return item
hcover = self.spine[0]
if 'cover' in self.guide:
href = self.guide['cover'].href
item = self.manifest.hrefs[href]
media_type = item.media_type
if media_type in OEB_RASTER_IMAGES:
cover = item
if media_type in OEB_IMAGES:
return item
elif media_type in OEB_DOCS:
hcover = item
html = hcover.data
if cover is not None:
pass
elif self.metadata.cover:
id = str(self.metadata.cover[0])
cover = self.manifest.ids[id]
elif MS_COVER_TYPE in self.guide:
if MS_COVER_TYPE in self.guide:
href = self.guide[MS_COVER_TYPE].href
cover = self.manifest.hrefs[href]
elif xpath(html, '//h:img[position()=1]'):
img = xpath(html, '//h:img[position()=1]')[0]
href = hcover.abshref(img.get('src'))
cover = self.manifest.hrefs[href]
elif xpath(html, '//h:object[position()=1]'):
object = xpath(html, '//h:object[position()=1]')[0]
href = hcover.abshref(object.get('data'))
cover = self.manifest.hrefs[href]
elif xpath(html, '//svg:svg[position()=1]'):
svg = copy.deepcopy(xpath(html, '//svg:svg[position()=1]')[0])
item = self.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
if self.COVER_SVG_XP(html):
svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
href = os.path.splitext(hcover.href)[0] + '.svg'
id, href = self.manifest.generate(hcover.id, href)
cover = self.manifest.add(id, href, SVG_MIME, data=svg)
if cover and not self.metadata.cover:
self.metadata.add('cover', cover.id)
item = self.manifest.add(id, href, SVG_MIME, data=svg)
return item
if self.COVER_OBJECT_XP(html):
object = self.COVER_OBJECT_XP(html)[0]
href = hcover.abshref(object.get('data'))
item = self.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
if self.COVER_IMG_XP(html):
img = self.COVER_IMG_XP(html)[0]
href = hcover.abshref(img.get('src'))
item = self.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
return None
def _ensure_cover_image(self):
cover = self._locate_cover_image()
if not cover:
return
if self.metadata.cover:
self.metadata.cover[0].value = cover.id
return
self.metadata.add('cover', cover.id)
def _all_from_opf(self, opf):
self._metadata_from_opf(opf)
self._manifest_from_opf(opf)

View File

@ -23,6 +23,7 @@ from PyQt4.QtGui import QApplication
from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
IMAGE_TAGS = set([XHTML('img'), XHTML('object')])
@ -78,7 +79,7 @@ class SVGRasterizer(object):
svg = item.data
hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'):
href = elem.attrib[XLINK('href')]
href = urlnormalize(elem.attrib[XLINK('href')])
path, frag = urldefrag(href)
if not path:
continue
@ -100,15 +101,15 @@ class SVGRasterizer(object):
def rasterize_item(self, item, stylizer):
html = item.data
hrefs = self.oeb.manifest.hrefs
for elem in xpath(html, '//h:img'):
src = elem.get('src', None)
image = hrefs.get(item.abshref(src), None) if src else None
for elem in xpath(html, '//h:img[@src]'):
src = urlnormalize(elem.attrib['src'])
image = hrefs.get(item.abshref(src), None)
if image and image.media_type == SVG_MIME:
style = stylizer.style(elem)
self.rasterize_external(elem, style, item, image)
for elem in xpath(html, '//h:object[@type="%s"]' % SVG_MIME):
data = elem.get('data', None)
image = hrefs.get(item.abshref(data), None) if data else None
for elem in xpath(html, '//h:object[@type="%s" and @data]' % SVG_MIME):
data = urlnormalize(elem.attrib['data'])
image = hrefs.get(item.abshref(data), None)
if image and image.media_type == SVG_MIME:
style = stylizer.style(elem)
self.rasterize_external(elem, style, item, image)

View File

@ -54,7 +54,7 @@ class ManifestTrimmer(object):
new.add(found)
elif item.media_type == CSS_MIME:
def replacer(uri):
absuri = item.abshref(uri)
absuri = item.abshref(urlnormalize(uri))
if absuri in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used: