From 2f3680e563b59fdaaf5ac689925872253b5d7ad0 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 1 Feb 2009 19:45:21 -0500 Subject: [PATCH 1/2] Implement NCX and Adobe page-map parsing and generation. --- src/calibre/ebooks/oeb/base.py | 312 ++++++++++++++++++++++++++------- 1 file changed, 250 insertions(+), 62 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index b89be6b1ec..e9252c7609 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -17,8 +17,10 @@ import logging import re import uuid import copy +import mimetypes from lxml import etree from lxml import html +import calibre from calibre import LoggingInterface from calibre.translations.dynamic import translate from calibre.startup import get_lang @@ -64,6 +66,7 @@ XHTML_MIME = 'application/xhtml+xml' CSS_MIME = 'text/css' NCX_MIME = 'application/x-dtbncx+xml' OPF_MIME = 'application/oebps-package+xml' +PAGE_MAP_MIME = 'application/oebps-page-map+xml' OEB_DOC_MIME = 'text/x-oeb1-document' OEB_CSS_MIME = 'text/x-oeb1-css' OPENTYPE_MIME = 'font/opentype' @@ -892,25 +895,71 @@ class TOC(object): node.to_opf1(tour) return tour - def to_ncx(self, parent, order=None, depth=1): - if not order: order = [0] + def to_ncx(self, parent, depth=1): for node in self.nodes: - order[0] += 1 - playOrder = str(order[0]) - id = self.id or 'np' + playOrder - point = etree.SubElement(parent, - NCX('navPoint'), id=id, playOrder=playOrder) + id = self.id or unicode(uuid.uuid4()) + attrib = {'id': id, 'playOrder': '0'} if self.klass: - point.attrib['class'] = node.klass + attrib['class'] = node.klass + point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) - etree.SubElement(label, NCX('text')).text = node.title + element(label, NCX('text')).text = node.title href = node.href if depth > 1 else urldefrag(node.href)[0] - child = etree.SubElement(point, - NCX('content'), attrib={'src': href}) - node.to_ncx(point, order, depth+1) + element(point, NCX('content'), src=href) + node.to_ncx(point, depth+1) return parent + +class PageList(object): + class Page(object): + def __init__(self, name, href, type='normal', klass=None, id=None): + self.name = name + self.href = urlnormalize(href) + self.type = type + self.id = id + self.klass = klass + def __init__(self): + self.pages = [] + + def add(self, name, href, type='normal', klass=None, id=None): + page = self.Page(name, href, type, klass, id) + self.pages.append(page) + return page + + def __len__(self): + return len(self.pages) + + def __iter__(self): + for page in self.pages: + yield node + + def __getitem__(self, index): + return self.pages[index] + + def to_ncx(self, parent=None): + plist = element(parent, NCX('pageList'), id=str(uuid.uuid4())) + values = dict((t, count(1)) for t in ('front', 'normal', 'special')) + for page in self.pages: + id = page.id or unicode(uuid.uuid4()) + type = page.type + value = str(values[type].next()) + attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'} + if page.klass: + attrib['class'] = page.klass + ptarget = element(plist, NCX('pageTarget'), attrib=attrib) + label = element(ptarget, NCX('navLabel')) + element(label, NCX('text')).text = page.name + element(ptarget, NCX('content'), src=page.href) + return plist + + def to_page_map(self): + pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS}) + for page in self.pages: + element(pmap, OPF('page'), name=page.name, href=page.href) + return pmap + + class OEBBook(object): COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') @@ -972,7 +1021,7 @@ class OEBBook(object): return opf def _metadata_from_opf(self, opf): - uid = opf.get('unique-identifier', 'calibre-uuid') + uid = opf.get('unique-identifier', None) self.uid = None self.metadata = metadata = Metadata(self) for elem in xpath(opf, '/o2:package/o2:metadata//*'): @@ -996,8 +1045,12 @@ class OEBBook(object): if not haveuuid and haveid: bookid = "urn:uuid:%s" % str(uuid.uuid4()) metadata.add('identifier', bookid, id='calibre-uuid') + if uid is None: + self.logger.warn(u'Unique-identifier not specified') for item in metadata.identifier: - if item.id == uid: + if not item.id: + continue + if uid is None or item.id == uid: self.uid = item break else: @@ -1023,7 +1076,10 @@ class OEBBook(object): href = elem.get('href') media_type = elem.get('media-type', None) if media_type is None: - media_type = elem.get('mediatype', BINARY_MIME) + media_type = elem.get('mediatype', None) + if media_type is None or media_type == 'text/xml': + guessed = mimetypes.guess_type(href)[0] + media_type = guessed or media_type or BINARY_MIME fallback = elem.get('fallback') if href in manifest.hrefs: self.logger.warn(u'Duplicate manifest entry for %r' % href) @@ -1055,7 +1111,7 @@ class OEBBook(object): spine.add(item, False) if len(spine) == 0: raise OEBError("Spine is empty") - + def _guide_from_opf(self, opf): self.guide = guide = Guide(self) for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): @@ -1065,49 +1121,74 @@ class OEBBook(object): self.logger.warn(u'Guide reference %r not found' % href) continue guide.add(elem.get('type'), elem.get('title'), href) - - def _toc_from_navpoint(self, toc, navpoint): + + def _find_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if result: + id = result[0] + if id not in self.manifest.ids: + return None + item = self.manifest.ids[id] + self.manifest.remove(item) + return item + for item in self.manifest.values(): + if item.media_type == NCX_MIME: + self.manifest.remove(item) + return item + return None + + def _toc_from_navpoint(self, item, toc, navpoint): children = xpath(navpoint, 'ncx:navPoint') for child in children: title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) - href = xpath(child, 'ncx:content/@src')[0] + title = COLLAPSE_RE.sub(' ', title.strip()) + href = xpath(child, 'ncx:content/@src') + if not title or not href: + continue + href = item.abshref(urlnormalize(href[0])) + path, _ = urldefrag(href) + if path not in self.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue id = child.get('id') klass = child.get('class') node = toc.add(title, href, id=id, klass=klass) - self._toc_from_navpoint(node, child) - - def _toc_from_ncx(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@toc') - if not result: - expr = '/o2:package/o2:manifest/o2:item[@media-type="%s"]/@id' - result = xpath(opf, expr % NCX_MIME) - if len(result) != 1: - return False - id = result[0] - if id not in self.manifest.ids: + self._toc_from_navpoint(item, node, child) + + def _toc_from_ncx(self, item): + if item is None: return False - item = self.manifest.ids[id] ncx = item.data - self.manifest.remove(item) - title = xpath(ncx, 'ncx:docTitle/ncx:text/text()') - title = title[0].strip() if title else unicode(self.metadata.title[0]) + title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + title = title or unicode(self.metadata.title[0]) self.toc = toc = TOC(title) navmaps = xpath(ncx, 'ncx:navMap') for navmap in navmaps: - self._toc_from_navpoint(toc, navmap) + self._toc_from_navpoint(item, toc, navmap) return True - + def _toc_from_tour(self, opf): - result = xpath(opf, '/o2:package/o2:tours/o2:tour') + result = xpath(opf, 'o2:tours/o2:tour') if not result: return False tour = result[0] self.toc = toc = TOC(tour.get('title')) sites = xpath(tour, 'o2:site') for site in sites: - toc.add(site.get('title'), site.get('href')) + title = site.get('title') + href = site.get('href') + if not title or not href: + continue + href = item.abshref(urlnormalize(href)) + path, _ = urldefrag(href) + if path not in self.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = child.get('id') + toc.add(title, href, id=id) return True - + def _toc_from_html(self, opf): if 'toc' not in self.guide: return False @@ -1131,6 +1212,7 @@ class OEBBook(object): if not path: href = '#'.join((itempath, frag)) title = ' '.join(xpath(anchor, './/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) href = urlnormalize(href) if href not in titles: order.append(href) @@ -1146,15 +1228,17 @@ class OEBBook(object): for item in self.spine: if not item.linear: continue html = item.data - title = xpath(html, '/h:html/h:head/h:title/text()') - title = title[0].strip() if title else None - if title: titles.append(title) + title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) + title = COLLAPSE_RE(' ', title.strip()) + if title: + titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): - expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,) - header = xpath(html, expr) + expr = '/h:html/h:body//h:%s[position()=1]/text()' + header = ''.join(xpath(html % tag, expr)) + header = COLLAPSE_RE.sub(' ', header.strip()) if header: - headers[-1] = header[0] + headers[-1] = header break use = titles if len(titles) > len(set(titles)): @@ -1164,12 +1248,71 @@ class OEBBook(object): toc.add(title, item.href) return True - def _toc_from_opf(self, opf): - if self._toc_from_ncx(opf): return + def _toc_from_opf(self, opf, item): + if self._toc_from_ncx(item): return if self._toc_from_tour(opf): return + self.logger.warn('No metadata table of contents found') if self._toc_from_html(opf): return self._toc_from_spine(opf) - + + def _pages_from_ncx(self, opf, item): + if item is None: + return False + ncx = item.data + ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') + if not ptargets: + return False + pages = self.pages = PageList() + for ptarget in ptargets: + name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) + name = COLLAPSE_RE.sub(' ', name.strip()) + href = xpath(ptarget, 'ncx:content/@src') + if not href: + continue + href = item.abshref(urlnormalize(href[0])) + id = ptarget.get('id') + type = ptarget.get('type', 'normal') + klass = ptarget.get('class') + pages.add(name, href, type=type, id=id, klass=klass) + return True + + def _find_page_map(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@page-map') + if result: + id = result[0] + if id not in self.manifest.ids: + return None + item = self.manifest.ids[id] + self.manifest.remove(item) + return item + for item in self.manifest.values(): + if item.media_type == PAGE_MAP_MIME: + self.manifest.remove(item) + return item + return None + + def _pages_from_page_map(self, opf): + item = self._find_page_map(opf) + if item is None: + return False + pmap = item.data + pages = self.pages = PageList() + for page in xpath(pmap, 'o2:page'): + name = page.get('name', '') + href = page.get('href') + if not href: + continue + name = COLLAPSE_RE.sub(' ', name.strip()) + href = item.abshref(urlnormalize(href)) + pages.add(name, href) + return True + + def _pages_from_opf(self, opf, item): + if self._pages_from_ncx(opf, item): return + if self._pages_from_page_map(opf): return + self.pages = PageList() + return + def _cover_from_html(self, hcover): with TemporaryDirectory('_html_cover') as tdir: writer = DirWriter() @@ -1228,7 +1371,9 @@ class OEBBook(object): self._manifest_from_opf(opf) self._spine_from_opf(opf) self._guide_from_opf(opf) - self._toc_from_opf(opf) + item = self._find_ncx(opf) + self._toc_from_opf(opf, item) + self._pages_from_opf(opf, item) self._ensure_cover_image() def translate(self, text): @@ -1249,6 +1394,34 @@ class OEBBook(object): guide = self.guide.to_opf1(package) return {OPF_MIME: ('content.opf', package)} + def _update_playorder(self, ncx): + hrefs = set(xpath(ncx, '//ncx:content/@src')) + playorder = {} + next = 1 + selector = XPath('h:body//*[@id or @name]') + for item in self.spine: + base = item.href + if base in hrefs: + playorder[base] = next + next += 1 + for elem in selector(item.data): + added = False + for attr in ('id', 'name'): + id = elem.get(attr) + if not id: + continue + href = '#'.join([base, id]) + if href in hrefs: + playorder[href] = next + added = True + if added: + next += 1 + selector = XPath('ncx:content/@src') + for elem in xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]'): + order = playorder[selector(elem)[0]] + elem.attrib['playOrder'] = str(order) + return + def _to_ncx(self): lang = unicode(self.metadata.language[0]) ncx = etree.Element(NCX('ncx'), @@ -1256,35 +1429,50 @@ class OEBBook(object): nsmap={None: NCX_NS}) head = etree.SubElement(ncx, NCX('head')) etree.SubElement(head, NCX('meta'), - attrib={'name': 'dtb:uid', 'content': unicode(self.uid)}) + name='dtb:uid', content=unicode(self.uid)) etree.SubElement(head, NCX('meta'), - attrib={'name': 'dtb:depth', 'content': str(self.toc.depth())}) + name='dtb:depth', content=str(self.toc.depth())) + generator = ''.join(['calibre (', calibre.__version__, ')']) etree.SubElement(head, NCX('meta'), - attrib={'name': 'dtb:totalPageCount', 'content': '0'}) + name='dtb:generator', content=generator) etree.SubElement(head, NCX('meta'), - attrib={'name': 'dtb:maxPageNumber', 'content': '0'}) + name='dtb:totalPageCount', content=str(len(self.pages))) + maxpnum = etree.SubElement(head, NCX('meta'), + name='dtb:maxPageNumber', content='0') title = etree.SubElement(ncx, NCX('docTitle')) text = etree.SubElement(title, NCX('text')) text.text = unicode(self.metadata.title[0]) navmap = etree.SubElement(ncx, NCX('navMap')) self.toc.to_ncx(navmap) + if len(self.pages) > 0: + plist = self.pages.to_ncx(ncx) + value = max(int(x) for x in xpath(plist, '//@value')) + maxpnum.attrib['content'] = str(value) + self._update_playorder(ncx) return ncx - def to_opf2(self): + def to_opf2(self, page_map=False): + results = {} package = etree.Element(OPF('package'), attrib={'version': '2.0', 'unique-identifier': self.uid.id}, nsmap={None: OPF2_NS}) metadata = self.metadata.to_opf2(package) manifest = self.manifest.to_opf2(package) - id, href = self.manifest.generate('ncx', 'toc.ncx') - etree.SubElement(manifest, OPF('item'), - attrib={'id': id, 'href': href, 'media-type': NCX_MIME}) spine = self.spine.to_opf2(package) - spine.attrib['toc'] = id guide = self.guide.to_opf2(package) - ncx = self._to_ncx() - return {OPF_MIME: ('content.opf', package), - NCX_MIME: (href, ncx)} + results[OPF_MIME] = ('content.opf', package) + id, href = self.manifest.generate('ncx', 'toc.ncx') + etree.SubElement(manifest, OPF('item'), id=id, href=href, + attrib={'media-type': NCX_MIME}) + spine.attrib['toc'] = id + results[NCX_MIME] = (href, self._to_ncx()) + if page_map and len(self.pages) > 0: + id, href = self.manifest.generate('page-map', 'page-map.xml') + etree.SubElement(manifest, OPF('item'), id=id, href=href, + attrib={'media-type': PAGE_MAP_MIME}) + spine.attrib['page-map'] = id + results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) + return results def main(argv=sys.argv): @@ -1292,7 +1480,7 @@ def main(argv=sys.argv): oeb = OEBBook(arg) for name, doc in oeb.to_opf1().values(): print etree.tostring(doc, pretty_print=True) - for name, doc in oeb.to_opf2().values(): + for name, doc in oeb.to_opf2(page_map=True).values(): print etree.tostring(doc, pretty_print=True) return 0 From 6fbf78aa7fdef164a1f9faf9e39e729960e1f0c8 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 1 Feb 2009 21:20:41 -0500 Subject: [PATCH 2/2] Fix #1746. Improve handling of encoding. --- src/calibre/ebooks/oeb/base.py | 23 ++++++++++++++++++++--- src/calibre/ebooks/oeb/stylizer.py | 15 ++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index e9252c7609..854f8bef94 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -24,6 +24,7 @@ import calibre from calibre import LoggingInterface from calibre.translations.dynamic import translate from calibre.startup import get_lang +from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.metadata.epub import CoverRenderer from calibre.ptempfile import TemporaryDirectory @@ -87,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$') PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+') +XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') def element(parent, *args, **kwargs): if parent is not None: @@ -447,9 +449,10 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): - # Possibly decode in user-specified encoding - if self.oeb.encoding is not None: - data = data.decode(self.oeb.encoding, 'replace') + # Convert to Unicode and normalize line endings + data = self.oeb.decode(data) + data = XMLDECL_RE.sub('', data) + data = data.replace('\r\n', '\n').replace('\r', '\n') # Handle broken XHTML w/ SVG (ugh) if 'svg:' in data and SVG_NS not in data: data = data.replace( @@ -1381,6 +1384,20 @@ class OEBBook(object): lang = lang.split('-', 1)[0].lower() return translate(lang, text) + def decode(self, data): + if isinstance(data, unicode): + return data + encodings = ['utf-8', 'utf-16'] + if self.encoding is not None: + encodings.append(self.encoding) + for encoding in encodings: + try: + return data.decode(encoding) + except UnicodeDecodeError: + pass + data, _ = xml_to_unicode(data) + return data + def to_opf1(self): package = etree.Element('package', attrib={'unique-identifier': self.uid.id}) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 03a1fade10..ae42e063b7 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -109,6 +109,7 @@ class Stylizer(object): STYLESHEETS = {} def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']): + self.oeb = oeb self.profile = profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] @@ -117,7 +118,7 @@ class Stylizer(object): stylesheets = [HTML_CSS_STYLESHEET] head = xpath(tree, '/h:html/h:head')[0] parser = cssutils.CSSParser() - parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path))) + parser.setFetcher(self._fetch_css_file) for elem in head: if elem.tag == XHTML('style') and elem.text \ and elem.get('type', CSS_MIME) in OEB_STYLES: @@ -138,8 +139,7 @@ class Stylizer(object): if path in self.STYLESHEETS: stylesheet = self.STYLESHEETS[path] else: - data = XHTML_CSS_NAMESPACE - data += oeb.manifest.hrefs[path].data + data = self._fetch_css_file(path)[1] stylesheet = parser.parseString(data, href=path) stylesheet.namespaces['h'] = XHTML_NS self.STYLESHEETS[path] = stylesheet @@ -167,6 +167,15 @@ class Stylizer(object): for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr() + def _fetch_css_file(self, path): + hrefs = self.oeb.manifest.hrefs + if path not in hrefs: + return (None, None) + data = hrefs[path].data + data = self.oeb.decode(data) + data = XHTML_CSS_NAMESPACE + data + return (None, data) + def flatten_rule(self, rule, href, index): results = [] if isinstance(rule, CSSStyleRule):