Fix #1746 (Error while News downloading El Pais)

2025-11-15 19:13:02 -05:00 · 2009-02-01 18:47:42 -08:00 · 2009-02-01 18:47:42 -08:00 · e6fbdb4a43
commit e6fbdb4a43
parent 47e94f47cb 6fbf78aa7f
2 changed files with 282 additions and 68 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -17,11 +17,14 @@ import logging
 import re
 import uuid
 import copy
 import mimetypes
 from lxml import etree
 from lxml import html
 import calibre
 from calibre import LoggingInterface
 from calibre.translations.dynamic import translate
 from calibre.startup import get_lang
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.metadata.epub import CoverRenderer
 from calibre.ptempfile import TemporaryDirectory
@ -64,6 +67,7 @@ XHTML_MIME = 'application/xhtml+xml'
 CSS_MIME = 'text/css'
 NCX_MIME = 'application/x-dtbncx+xml'
 OPF_MIME = 'application/oebps-package+xml'
 PAGE_MAP_MIME = 'application/oebps-page-map+xml'
 OEB_DOC_MIME = 'text/x-oeb1-document'
 OEB_CSS_MIME = 'text/x-oeb1-css'
 OPENTYPE_MIME = 'font/opentype'
@ -84,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
 COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
 QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
 PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 def element(parent, *args, **kwargs):
    if parent is not None:
@ -444,9 +449,10 @@ class Manifest(object):
                % (self.id, self.href, self.media_type)
        def _force_xhtml(self, data):
-            # Possibly decode in user-specified encoding
+            # Convert to Unicode and normalize line endings
-            if self.oeb.encoding is not None:
+            data = self.oeb.decode(data)
-                data = data.decode(self.oeb.encoding, 'replace')
+            data = XMLDECL_RE.sub('', data)
            data = data.replace('\r\n', '\n').replace('\r', '\n')
            # Handle broken XHTML w/ SVG (ugh)
            if 'svg:' in data and SVG_NS not in data:
                data = data.replace(
@ -892,25 +898,71 @@ class TOC(object):
            node.to_opf1(tour)
        return tour
-    def to_ncx(self, parent, order=None, depth=1):
+    def to_ncx(self, parent, depth=1):
        if not order: order = [0]
        for node in self.nodes:
-            order[0] += 1
+            id = self.id or unicode(uuid.uuid4())
-            playOrder = str(order[0])
+            attrib = {'id': id, 'playOrder': '0'}
            id = self.id or 'np' + playOrder
            point = etree.SubElement(parent,
                NCX('navPoint'), id=id, playOrder=playOrder)
            if self.klass:
-                point.attrib['class'] = node.klass
+                attrib['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
-            etree.SubElement(label, NCX('text')).text = node.title
+            element(label, NCX('text')).text = node.title
            href = node.href if depth > 1 else urldefrag(node.href)[0]
-            child = etree.SubElement(point,
+            element(point, NCX('content'), src=href)
-                NCX('content'), attrib={'src': href})
+            node.to_ncx(point, depth+1)
            node.to_ncx(point, order, depth+1)
        return parent
 class PageList(object):
    """Ordered collection of page markers for a book.

    Each entry is a :class:`Page`.  The list can be serialized as an NCX
    ``pageList`` element (:meth:`to_ncx`) or as an OPF ``page-map`` element
    (:meth:`to_page_map`).
    """

    class Page(object):
        """One page marker: a display name and the href it refers to."""

        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = name
            # The href is stored after passing through urlnormalize().
            self.href = urlnormalize(href)
            self.type = type
            self.id = id
            self.klass = klass

    def __init__(self):
        self.pages = []

    def add(self, name, href, type='normal', klass=None, id=None):
        """Create a Page from the arguments, append it and return it."""
        page = self.Page(name, href, type, klass, id)
        self.pages.append(page)
        return page

    def __len__(self):
        return len(self.pages)

    def __iter__(self):
        # Fixed: this used to yield the undefined name ``node``, so any
        # iteration over a non-empty PageList raised NameError.
        for page in self.pages:
            yield page

    def __getitem__(self, index):
        return self.pages[index]

    def to_ncx(self, parent=None):
        """Build an NCX ``pageList`` element under `parent` and return it.

        Every page becomes a ``pageTarget`` whose ``value`` is a running
        number kept separately per page type, starting at 1.  ``playOrder``
        is written as the placeholder ``'0'``.
        """
        plist = element(parent, NCX('pageList'), id=str(uuid.uuid4()))
        # One independent counter per recognised page type.
        values = dict((t, count(1)) for t in ('front', 'normal', 'special'))
        for page in self.pages:
            ident = page.id or unicode(uuid.uuid4())
            kind = page.type
            if kind not in values:
                # Page types can come verbatim from an input document; an
                # unrecognised one used to raise KeyError here.  Treat it as
                # an ordinary page instead of aborting serialization.
                kind = 'normal'
            value = str(values[kind].next())
            attrib = {'id': ident, 'value': value, 'type': kind,
                      'playOrder': '0'}
            if page.klass:
                attrib['class'] = page.klass
            ptarget = element(plist, NCX('pageTarget'), attrib=attrib)
            label = element(ptarget, NCX('navLabel'))
            element(label, NCX('text')).text = page.name
            element(ptarget, NCX('content'), src=page.href)
        return plist

    def to_page_map(self):
        """Return a new OPF ``page-map`` element with one ``page`` per entry."""
        pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS})
        for page in self.pages:
            element(pmap, OPF('page'), name=page.name, href=page.href)
        return pmap
 class OEBBook(object):
    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
@ -972,7 +1024,7 @@ class OEBBook(object):
        return opf
    def _metadata_from_opf(self, opf):
-        uid = opf.get('unique-identifier', 'calibre-uuid')
+        uid = opf.get('unique-identifier', None)
        self.uid = None
        self.metadata = metadata = Metadata(self)
        for elem in xpath(opf, '/o2:package/o2:metadata//*'):
@ -996,8 +1048,12 @@ class OEBBook(object):
        if not haveuuid and haveid:
            bookid = "urn:uuid:%s" % str(uuid.uuid4())
            metadata.add('identifier', bookid, id='calibre-uuid')
        if uid is None:
            self.logger.warn(u'Unique-identifier not specified')
        for item in metadata.identifier:
-            if item.id == uid:
+            if not item.id:
                continue
            if uid is None or item.id == uid:
                self.uid = item
                break
        else:
@ -1023,7 +1079,10 @@ class OEBBook(object):
            href = elem.get('href')
            media_type = elem.get('media-type', None)
            if media_type is None:
-                media_type = elem.get('mediatype', BINARY_MIME)
+                media_type = elem.get('mediatype', None)
            if media_type is None or media_type == 'text/xml':
                guessed = mimetypes.guess_type(href)[0]
                media_type = guessed or media_type or BINARY_MIME
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
                self.logger.warn(u'Duplicate manifest entry for %r' % href)
@ -1066,46 +1125,71 @@ class OEBBook(object):
                continue
            guide.add(elem.get('type'), elem.get('title'), href)
-    def _toc_from_navpoint(self, toc, navpoint):
+    def _find_ncx(self, opf):
        result = xpath(opf, '/o2:package/o2:spine/@toc')
        if result:
            id = result[0]
            if id not in self.manifest.ids:
                return None
            item = self.manifest.ids[id]
            self.manifest.remove(item)
            return item
        for item in self.manifest.values():
            if item.media_type == NCX_MIME:
                self.manifest.remove(item)
                return item                
        return None
    def _toc_from_navpoint(self, item, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
-            href = xpath(child, 'ncx:content/@src')[0]
+            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title or not href:
                continue
            href = item.abshref(urlnormalize(href[0]))
            path, _ = urldefrag(href)
            if path not in self.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
            id = child.get('id')
            klass = child.get('class')
            node = toc.add(title, href, id=id, klass=klass)
-            self._toc_from_navpoint(node, child)
+            self._toc_from_navpoint(item, node, child)
-    def _toc_from_ncx(self, opf):
+    def _toc_from_ncx(self, item):
-        result = xpath(opf, '/o2:package/o2:spine/@toc')
+        if item is None:
        if not result:
            expr = '/o2:package/o2:manifest/o2:item[@media-type="%s"]/@id'
            result = xpath(opf, expr % NCX_MIME)
            if len(result) != 1:
            return False
        id = result[0]
        if id not in self.manifest.ids:
            return False
        item = self.manifest.ids[id]
        ncx = item.data
-        self.manifest.remove(item)
+        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
-        title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
+        title = COLLAPSE_RE.sub(' ', title.strip())
-        title = title[0].strip() if title else unicode(self.metadata.title[0])
+        title = title or unicode(self.metadata.title[0])
        self.toc = toc = TOC(title)
        navmaps = xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
-            self._toc_from_navpoint(toc, navmap)
+            self._toc_from_navpoint(item, toc, navmap)
        return True
    def _toc_from_tour(self, opf):
        """Build ``self.toc`` from the first ``tour`` element of `opf`.

        Returns False when the OPF has no tour, True otherwise.  Sites that
        lack a title or href, or whose target document is not a key of
        ``self.manifest.hrefs``, are skipped (the latter with a warning).
        """
        result = xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
        tour = result[0]
        self.toc = toc = TOC(tour.get('title'))
        sites = xpath(tour, 'o2:site')
        for site in sites:
            title = site.get('title')
            href = site.get('href')
            if not title or not href:
                continue
            # Fixed: this used to call ``item.abshref(...)``, but no ``item``
            # exists in this method (NameError).  The href is only normalized
            # here.
            href = urlnormalize(href)
            path, _ = urldefrag(href)
            if path not in self.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
            # Fixed: the id was read from the undefined name ``child``; the
            # element being processed in this loop is ``site``.
            id = site.get('id')
            toc.add(title, href, id=id)
        return True
    def _toc_from_html(self, opf):
@ -1131,6 +1215,7 @@ class OEBBook(object):
            if not path:
                href = '#'.join((itempath, frag))
            title = ' '.join(xpath(anchor, './/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = urlnormalize(href)
            if href not in titles:
                order.append(href)
@ -1146,15 +1231,17 @@ class OEBBook(object):
        for item in self.spine:
            if not item.linear: continue
            html = item.data
-            title = xpath(html, '/h:html/h:head/h:title/text()')
+            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
-            title = title[0].strip() if title else None
+            title = COLLAPSE_RE(' ', title.strip())
-            if title: titles.append(title)
+            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
-                expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,)
+                expr = '/h:html/h:body//h:%s[position()=1]/text()'
-                header = xpath(html, expr)
+                header = ''.join(xpath(html % tag, expr))
                header = COLLAPSE_RE.sub(' ', header.strip())
                if header:
-                    headers[-1] = header[0]
+                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
@ -1164,12 +1251,71 @@ class OEBBook(object):
            toc.add(title, item.href)
        return True
-    def _toc_from_opf(self, opf):
+    def _toc_from_opf(self, opf, item):
-        if self._toc_from_ncx(opf): return
+        if self._toc_from_ncx(item): return
        if self._toc_from_tour(opf): return
        self.logger.warn('No metadata table of contents found')
        if self._toc_from_html(opf): return
        self._toc_from_spine(opf)
    def _pages_from_ncx(self, opf, item):
        if item is None:
            return False
        ncx = item.data
        ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        if not ptargets:
            return False
        pages = self.pages = PageList()
        for ptarget in ptargets:
            name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = xpath(ptarget, 'ncx:content/@src')
            if not href:
                continue
            href = item.abshref(urlnormalize(href[0]))
            id = ptarget.get('id')
            type = ptarget.get('type', 'normal')
            klass = ptarget.get('class')
            pages.add(name, href, type=type, id=id, klass=klass)
        return True
    def _find_page_map(self, opf):
        """Locate the page-map manifest item, detach it, and return it.

        If the spine carries a ``page-map`` attribute, only the manifest
        item with that id is considered.  Otherwise the first manifest item
        whose media type is PAGE_MAP_MIME is used.  The chosen item is
        removed from the manifest; None is returned when nothing matches.
        """
        declared = xpath(opf, '/o2:package/o2:spine/@page-map')
        found = None
        if declared:
            ident = declared[0]
            if ident in self.manifest.ids:
                found = self.manifest.ids[ident]
        else:
            for candidate in self.manifest.values():
                if candidate.media_type == PAGE_MAP_MIME:
                    found = candidate
                    break
        if found is not None:
            self.manifest.remove(found)
        return found
    def _pages_from_page_map(self, opf):
        item = self._find_page_map(opf)
        if item is None:
            return False
        pmap = item.data
        pages = self.pages = PageList()
        for page in xpath(pmap, 'o2:page'):
            name = page.get('name', '')
            href = page.get('href')
            if not href:
                continue
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = item.abshref(urlnormalize(href))
            pages.add(name, href)
        return True
    def _pages_from_opf(self, opf, item):
        if self._pages_from_ncx(opf, item): return
        if self._pages_from_page_map(opf): return
        self.pages = PageList()
        return
    def _cover_from_html(self, hcover):
        with TemporaryDirectory('_html_cover') as tdir:
            writer = DirWriter()
@ -1228,7 +1374,9 @@ class OEBBook(object):
        self._manifest_from_opf(opf)
        self._spine_from_opf(opf)
        self._guide_from_opf(opf)
-        self._toc_from_opf(opf)
+        item = self._find_ncx(opf)
        self._toc_from_opf(opf, item)
        self._pages_from_opf(opf, item)
        self._ensure_cover_image()
    def translate(self, text):
@ -1236,6 +1384,20 @@ class OEBBook(object):
        lang = lang.split('-', 1)[0].lower()
        return translate(lang, text)
    def decode(self, data):
        """Return `data` as a unicode string.

        Unicode input is returned unchanged.  Byte strings are tried as
        UTF-8, then in ``self.encoding`` (when set), then as UTF-16; if all
        of these fail the data is handed to ``xml_to_unicode``.
        """
        if isinstance(data, unicode):
            return data
        encodings = ['utf-8']
        if self.encoding is not None:
            # Fixed: the explicitly configured encoding must be tried before
            # UTF-16.  A UTF-16 decode succeeds on almost any even-length
            # byte string, so with the old order single-byte text (e.g.
            # Latin-1) was turned into garbage and the configured encoding
            # was never reached.
            encodings.append(self.encoding)
        encodings.append('utf-16')
        for encoding in encodings:
            try:
                return data.decode(encoding)
            except UnicodeDecodeError:
                pass
        data, _ = xml_to_unicode(data)
        return data
    def to_opf1(self):
        package = etree.Element('package',
            attrib={'unique-identifier': self.uid.id})
@ -1249,6 +1411,34 @@ class OEBBook(object):
        guide = self.guide.to_opf1(package)
        return {OPF_MIME: ('content.opf', package)}
    def _update_playorder(self, ncx):
        """Rewrite the ``playOrder`` attributes of `ncx` in place.

        Walks the spine in order and numbers, starting at 1, every document
        href and every ``href#anchor`` (anchor taken from ``id`` or ``name``
        attributes) that some NCX ``content/@src`` refers to.  Those numbers
        are then written to each element that has both a ``playOrder``
        attribute and a ``content`` child with a ``src``.
        """
        hrefs = set(xpath(ncx, '//ncx:content/@src'))
        playorder = {}
        position = 1
        anchors = XPath('h:body//*[@id or @name]')
        for item in self.spine:
            base = item.href
            if base in hrefs:
                playorder[base] = position
                position += 1
            for elem in anchors(item.data):
                added = False
                # An element's id and name share one position.
                for attr in ('id', 'name'):
                    anchor = elem.get(attr)
                    if not anchor:
                        continue
                    href = '#'.join([base, anchor])
                    if href in hrefs:
                        playorder[href] = position
                        added = True
                if added:
                    position += 1
        content_src = XPath('ncx:content/@src')
        for elem in xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]'):
            href = content_src(elem)[0]
            if href not in playorder:
                # Fixed: a src that matched no spine document or anchor used
                # to raise KeyError.  Such targets are now numbered after
                # everything found in the spine, in NCX document order.
                playorder[href] = position
                position += 1
            elem.attrib['playOrder'] = str(playorder[href])
        return
    def _to_ncx(self):
        lang = unicode(self.metadata.language[0])
        ncx = etree.Element(NCX('ncx'),
@ -1256,35 +1446,50 @@ class OEBBook(object):
            nsmap={None: NCX_NS})
        head = etree.SubElement(ncx, NCX('head'))
        etree.SubElement(head, NCX('meta'),
-            attrib={'name': 'dtb:uid', 'content': unicode(self.uid)})
+            name='dtb:uid', content=unicode(self.uid))
        etree.SubElement(head, NCX('meta'),
-            attrib={'name': 'dtb:depth', 'content': str(self.toc.depth())})
+            name='dtb:depth', content=str(self.toc.depth()))
        generator = ''.join(['calibre (', calibre.__version__, ')'])
        etree.SubElement(head, NCX('meta'),
-            attrib={'name': 'dtb:totalPageCount', 'content': '0'})
+            name='dtb:generator', content=generator)
        etree.SubElement(head, NCX('meta'),
-            attrib={'name': 'dtb:maxPageNumber', 'content': '0'})
+            name='dtb:totalPageCount', content=str(len(self.pages)))
        maxpnum = etree.SubElement(head, NCX('meta'),
            name='dtb:maxPageNumber', content='0')
        title = etree.SubElement(ncx, NCX('docTitle'))
        text = etree.SubElement(title, NCX('text'))
        text.text = unicode(self.metadata.title[0])
        navmap = etree.SubElement(ncx, NCX('navMap'))
        self.toc.to_ncx(navmap)
        if len(self.pages) > 0:
            plist = self.pages.to_ncx(ncx)
            value = max(int(x) for x in xpath(plist, '//@value'))
            maxpnum.attrib['content'] = str(value)
        self._update_playorder(ncx)
        return ncx
-    def to_opf2(self):
+    def to_opf2(self, page_map=False):
        results = {}
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': self.uid.id},
            nsmap={None: OPF2_NS})
        metadata = self.metadata.to_opf2(package)
        manifest = self.manifest.to_opf2(package)
        id, href = self.manifest.generate('ncx', 'toc.ncx')
        etree.SubElement(manifest, OPF('item'),
            attrib={'id': id, 'href': href, 'media-type': NCX_MIME})
        spine = self.spine.to_opf2(package)
        spine.attrib['toc'] = id
        guide = self.guide.to_opf2(package)
-        ncx = self._to_ncx()
+        results[OPF_MIME] = ('content.opf', package)
-        return {OPF_MIME: ('content.opf', package),
+        id, href = self.manifest.generate('ncx', 'toc.ncx')
-                NCX_MIME: (href, ncx)}
+        etree.SubElement(manifest, OPF('item'), id=id, href=href,
                         attrib={'media-type': NCX_MIME})
        spine.attrib['toc'] = id
        results[NCX_MIME] = (href, self._to_ncx())
        if page_map and len(self.pages) > 0:
            id, href = self.manifest.generate('page-map', 'page-map.xml')
            etree.SubElement(manifest, OPF('item'), id=id, href=href,
                             attrib={'media-type': PAGE_MAP_MIME})
            spine.attrib['page-map'] = id
            results[PAGE_MAP_MIME] = (href, self.pages.to_page_map())
        return results
 def main(argv=sys.argv):
@ -1292,7 +1497,7 @@ def main(argv=sys.argv):
        oeb = OEBBook(arg)
        for name, doc in oeb.to_opf1().values():
            print etree.tostring(doc, pretty_print=True)
-        for name, doc in oeb.to_opf2().values():
+        for name, doc in oeb.to_opf2(page_map=True).values():
            print etree.tostring(doc, pretty_print=True)
    return 0
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -109,6 +109,7 @@ class Stylizer(object):
    STYLESHEETS = {}
    def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
        self.oeb = oeb
        self.profile = profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
@ -117,7 +118,7 @@ class Stylizer(object):
        stylesheets = [HTML_CSS_STYLESHEET]
        head = xpath(tree, '/h:html/h:head')[0]
        parser = cssutils.CSSParser()
-        parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
+        parser.setFetcher(self._fetch_css_file)
        for elem in head:
            if elem.tag == XHTML('style') and elem.text \
               and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -138,8 +139,7 @@ class Stylizer(object):
                if path in self.STYLESHEETS:
                    stylesheet = self.STYLESHEETS[path]
                else:
-                    data = XHTML_CSS_NAMESPACE
+                    data = self._fetch_css_file(path)[1]
                    data += oeb.manifest.hrefs[path].data
                    stylesheet = parser.parseString(data, href=path)
                    stylesheet.namespaces['h'] = XHTML_NS
                    self.STYLESHEETS[path] = stylesheet
@ -167,6 +167,15 @@ class Stylizer(object):
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr()
    def _fetch_css_file(self, path):
        hrefs = self.oeb.manifest.hrefs
        if path not in hrefs:
            return (None, None)
        data = hrefs[path].data
        data = self.oeb.decode(data)
        data = XHTML_CSS_NAMESPACE + data
        return (None, data)
    def flatten_rule(self, rule, href, index):
        results = []
        if isinstance(rule, CSSStyleRule):