mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix #1746 (Error while News downloading El Pais)
This commit is contained in:
commit
e6fbdb4a43
@ -17,11 +17,14 @@ import logging
|
|||||||
import re
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
import copy
|
import copy
|
||||||
|
import mimetypes
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
import calibre
|
||||||
from calibre import LoggingInterface
|
from calibre import LoggingInterface
|
||||||
from calibre.translations.dynamic import translate
|
from calibre.translations.dynamic import translate
|
||||||
from calibre.startup import get_lang
|
from calibre.startup import get_lang
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
||||||
from calibre.ebooks.metadata.epub import CoverRenderer
|
from calibre.ebooks.metadata.epub import CoverRenderer
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
@ -64,6 +67,7 @@ XHTML_MIME = 'application/xhtml+xml'
|
|||||||
CSS_MIME = 'text/css'
|
CSS_MIME = 'text/css'
|
||||||
NCX_MIME = 'application/x-dtbncx+xml'
|
NCX_MIME = 'application/x-dtbncx+xml'
|
||||||
OPF_MIME = 'application/oebps-package+xml'
|
OPF_MIME = 'application/oebps-package+xml'
|
||||||
|
PAGE_MAP_MIME = 'application/oebps-page-map+xml'
|
||||||
OEB_DOC_MIME = 'text/x-oeb1-document'
|
OEB_DOC_MIME = 'text/x-oeb1-document'
|
||||||
OEB_CSS_MIME = 'text/x-oeb1-css'
|
OEB_CSS_MIME = 'text/x-oeb1-css'
|
||||||
OPENTYPE_MIME = 'font/opentype'
|
OPENTYPE_MIME = 'font/opentype'
|
||||||
@ -84,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
|
|||||||
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
||||||
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
||||||
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
||||||
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
|
|
||||||
def element(parent, *args, **kwargs):
|
def element(parent, *args, **kwargs):
|
||||||
if parent is not None:
|
if parent is not None:
|
||||||
@ -444,9 +449,10 @@ class Manifest(object):
|
|||||||
% (self.id, self.href, self.media_type)
|
% (self.id, self.href, self.media_type)
|
||||||
|
|
||||||
def _force_xhtml(self, data):
|
def _force_xhtml(self, data):
|
||||||
# Possibly decode in user-specified encoding
|
# Convert to Unicode and normalize line endings
|
||||||
if self.oeb.encoding is not None:
|
data = self.oeb.decode(data)
|
||||||
data = data.decode(self.oeb.encoding, 'replace')
|
data = XMLDECL_RE.sub('', data)
|
||||||
|
data = data.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
if 'svg:' in data and SVG_NS not in data:
|
if 'svg:' in data and SVG_NS not in data:
|
||||||
data = data.replace(
|
data = data.replace(
|
||||||
@ -892,25 +898,71 @@ class TOC(object):
|
|||||||
node.to_opf1(tour)
|
node.to_opf1(tour)
|
||||||
return tour
|
return tour
|
||||||
|
|
||||||
def to_ncx(self, parent, order=None, depth=1):
|
def to_ncx(self, parent, depth=1):
    """Serialize the child nodes as NCX ``navPoint`` elements under *parent*.

    One ``navPoint`` (with ``navLabel``/``text`` and ``content``) is created
    per entry of ``self.nodes``, then the entry serializes its own children
    beneath that point with ``depth + 1``.

    :param parent: element that receives the generated ``navPoint`` children.
    :param depth: nesting level; at depth 1 the fragment is stripped from
        the href, deeper levels keep the full href.
    :return: *parent*.
    """
    for node in self.nodes:
        # Take id/class from the node being written. Reading them from
        # ``self`` gave every child the container's id (duplicate ids) and
        # tested the container's class while emitting the child's.
        # NOTE(review): assumes nodes expose ``id`` the same way they expose
        # ``klass``/``title``/``href`` -- confirm against the TOC class.
        node_id = node.id or unicode(uuid.uuid4())
        # playOrder is written as a '0' placeholder here.
        attrib = {'id': node_id, 'playOrder': '0'}
        if node.klass:
            attrib['class'] = node.klass
        point = element(parent, NCX('navPoint'), attrib=attrib)
        label = etree.SubElement(point, NCX('navLabel'))
        element(label, NCX('text')).text = node.title
        href = node.href if depth > 1 else urldefrag(node.href)[0]
        element(point, NCX('content'), src=href)
        node.to_ncx(point, depth+1)
    return parent
|
||||||
|
|
||||||
|
|
||||||
|
class PageList(object):
    """Ordered collection of page references.

    Supports ``len()``, iteration and indexing over the stored pages, and can
    serialize itself either as an NCX ``pageList`` or as an OPF ``page-map``.
    """

    class Page(object):
        """A single page reference: display name, target href and metadata."""

        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = name
            self.href = urlnormalize(href)
            self.type = type
            self.id = id
            self.klass = klass

    def __init__(self):
        self.pages = []

    def add(self, name, href, type='normal', klass=None, id=None):
        """Create a ``Page`` from the arguments, append it and return it."""
        page = self.Page(name, href, type, klass, id)
        self.pages.append(page)
        return page

    def __len__(self):
        return len(self.pages)

    def __iter__(self):
        # The previous generator yielded an undefined name (``node``) and
        # raised NameError on the first page.
        return iter(self.pages)

    def __getitem__(self, index):
        return self.pages[index]

    def to_ncx(self, parent=None):
        """Build an NCX ``pageList`` element (under *parent*, if given).

        Each page becomes a ``pageTarget`` whose ``value`` is a running
        number kept separately for the 'front', 'normal' and 'special'
        types, starting at 1. A page with any other type raises KeyError.
        ``playOrder`` is written as a '0' placeholder.

        :return: the ``pageList`` element.
        """
        plist = element(parent, NCX('pageList'), id=str(uuid.uuid4()))
        # Plain integer counters: no dependency on ``count`` being imported.
        counters = dict.fromkeys(('front', 'normal', 'special'), 0)
        for page in self.pages:
            id = page.id or unicode(uuid.uuid4())
            type = page.type
            counters[type] += 1
            value = str(counters[type])
            attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'}
            if page.klass:
                attrib['class'] = page.klass
            ptarget = element(plist, NCX('pageTarget'), attrib=attrib)
            label = element(ptarget, NCX('navLabel'))
            element(label, NCX('text')).text = page.name
            element(ptarget, NCX('content'), src=page.href)
        return plist

    def to_page_map(self):
        """Build an OPF ``page-map`` element with one ``page`` per entry."""
        pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS})
        for page in self.pages:
            element(pmap, OPF('page'), name=page.name, href=page.href)
        return pmap
|
||||||
|
|
||||||
|
|
||||||
class OEBBook(object):
|
class OEBBook(object):
|
||||||
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
|
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
|
||||||
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
||||||
@ -972,7 +1024,7 @@ class OEBBook(object):
|
|||||||
return opf
|
return opf
|
||||||
|
|
||||||
def _metadata_from_opf(self, opf):
|
def _metadata_from_opf(self, opf):
|
||||||
uid = opf.get('unique-identifier', 'calibre-uuid')
|
uid = opf.get('unique-identifier', None)
|
||||||
self.uid = None
|
self.uid = None
|
||||||
self.metadata = metadata = Metadata(self)
|
self.metadata = metadata = Metadata(self)
|
||||||
for elem in xpath(opf, '/o2:package/o2:metadata//*'):
|
for elem in xpath(opf, '/o2:package/o2:metadata//*'):
|
||||||
@ -996,8 +1048,12 @@ class OEBBook(object):
|
|||||||
if not haveuuid and haveid:
|
if not haveuuid and haveid:
|
||||||
bookid = "urn:uuid:%s" % str(uuid.uuid4())
|
bookid = "urn:uuid:%s" % str(uuid.uuid4())
|
||||||
metadata.add('identifier', bookid, id='calibre-uuid')
|
metadata.add('identifier', bookid, id='calibre-uuid')
|
||||||
|
if uid is None:
|
||||||
|
self.logger.warn(u'Unique-identifier not specified')
|
||||||
for item in metadata.identifier:
|
for item in metadata.identifier:
|
||||||
if item.id == uid:
|
if not item.id:
|
||||||
|
continue
|
||||||
|
if uid is None or item.id == uid:
|
||||||
self.uid = item
|
self.uid = item
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -1023,7 +1079,10 @@ class OEBBook(object):
|
|||||||
href = elem.get('href')
|
href = elem.get('href')
|
||||||
media_type = elem.get('media-type', None)
|
media_type = elem.get('media-type', None)
|
||||||
if media_type is None:
|
if media_type is None:
|
||||||
media_type = elem.get('mediatype', BINARY_MIME)
|
media_type = elem.get('mediatype', None)
|
||||||
|
if media_type is None or media_type == 'text/xml':
|
||||||
|
guessed = mimetypes.guess_type(href)[0]
|
||||||
|
media_type = guessed or media_type or BINARY_MIME
|
||||||
fallback = elem.get('fallback')
|
fallback = elem.get('fallback')
|
||||||
if href in manifest.hrefs:
|
if href in manifest.hrefs:
|
||||||
self.logger.warn(u'Duplicate manifest entry for %r' % href)
|
self.logger.warn(u'Duplicate manifest entry for %r' % href)
|
||||||
@ -1066,46 +1125,71 @@ class OEBBook(object):
|
|||||||
continue
|
continue
|
||||||
guide.add(elem.get('type'), elem.get('title'), href)
|
guide.add(elem.get('type'), elem.get('title'), href)
|
||||||
|
|
||||||
def _toc_from_navpoint(self, toc, navpoint):
|
def _find_ncx(self, opf):
    """Find the NCX manifest item, remove it from the manifest, return it.

    The spine's ``toc`` attribute is authoritative when present: if the id
    it names is not in the manifest the result is None, with no fallback.
    Without a ``toc`` attribute, the first manifest item whose media type
    is ``NCX_MIME`` is used. Returns None when nothing is found.
    """
    toc_ids = xpath(opf, '/o2:package/o2:spine/@toc')
    if not toc_ids:
        for candidate in self.manifest.values():
            if candidate.media_type == NCX_MIME:
                self.manifest.remove(candidate)
                return candidate
        return None
    ncx_id = toc_ids[0]
    if ncx_id not in self.manifest.ids:
        return None
    found = self.manifest.ids[ncx_id]
    self.manifest.remove(found)
    return found
|
||||||
|
|
||||||
|
def _toc_from_navpoint(self, item, toc, navpoint):
    """Recursively copy the ``navPoint`` children of *navpoint* into *toc*.

    *item* is the NCX manifest item; its ``abshref`` resolves each ``src``.
    Entries with an empty title or no ``src`` are skipped silently; entries
    whose path (href without fragment) is not in the manifest are skipped
    with a warning.
    """
    for child in xpath(navpoint, 'ncx:navPoint'):
        label = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
        label = COLLAPSE_RE.sub(' ', label.strip())
        sources = xpath(child, 'ncx:content/@src')
        if not (label and sources):
            continue
        target = item.abshref(urlnormalize(sources[0]))
        path = urldefrag(target)[0]
        if path not in self.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % target)
            continue
        node = toc.add(label, target,
                       id=child.get('id'), klass=child.get('class'))
        # Descend: nested navPoints become children of the node just added.
        self._toc_from_navpoint(item, node, child)
|
||||||
|
|
||||||
def _toc_from_ncx(self, opf):
|
def _toc_from_ncx(self, item):
    """Build ``self.toc`` from the NCX manifest item *item*.

    Returns False (leaving ``self.toc`` untouched) when *item* is None,
    True otherwise. The TOC title is the whitespace-collapsed ``docTitle``
    text, falling back to the first metadata title when that is empty.
    """
    if item is None:
        return False
    ncx = item.data
    doc_title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
    doc_title = COLLAPSE_RE.sub(' ', doc_title.strip())
    if not doc_title:
        doc_title = unicode(self.metadata.title[0])
    root = TOC(doc_title)
    self.toc = root
    for navmap in xpath(ncx, 'ncx:navMap'):
        self._toc_from_navpoint(item, root, navmap)
    return True
|
||||||
|
|
||||||
def _toc_from_tour(self, opf):
|
def _toc_from_tour(self, opf):
    """Build ``self.toc`` from the first ``tour`` element of *opf*.

    Returns False when the package has no tour, True otherwise. Sites
    lacking a title or href are skipped silently; sites whose path (href
    without fragment) is not in the manifest are skipped with a warning.
    """
    result = xpath(opf, 'o2:tours/o2:tour')
    if not result:
        return False
    tour = result[0]
    self.toc = toc = TOC(tour.get('title'))
    for site in xpath(tour, 'o2:site'):
        title = site.get('title')
        href = site.get('href')
        if not title or not href:
            continue
        # There is no NCX item in this method, so the former
        # ``item.abshref(...)`` call raised NameError.
        # NOTE(review): assumes tour hrefs and manifest hrefs are both
        # relative to the OPF, so normalization alone suffices -- confirm.
        href = urlnormalize(href)
        path, _ = urldefrag(href)
        if path not in self.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            continue
        # The id belongs to the site element (``child`` was undefined here).
        toc.add(title, href, id=site.get('id'))
    return True
|
||||||
|
|
||||||
def _toc_from_html(self, opf):
|
def _toc_from_html(self, opf):
|
||||||
@ -1131,6 +1215,7 @@ class OEBBook(object):
|
|||||||
if not path:
|
if not path:
|
||||||
href = '#'.join((itempath, frag))
|
href = '#'.join((itempath, frag))
|
||||||
title = ' '.join(xpath(anchor, './/text()'))
|
title = ' '.join(xpath(anchor, './/text()'))
|
||||||
|
title = COLLAPSE_RE.sub(' ', title.strip())
|
||||||
href = urlnormalize(href)
|
href = urlnormalize(href)
|
||||||
if href not in titles:
|
if href not in titles:
|
||||||
order.append(href)
|
order.append(href)
|
||||||
@ -1146,15 +1231,17 @@ class OEBBook(object):
|
|||||||
for item in self.spine:
|
for item in self.spine:
|
||||||
if not item.linear: continue
|
if not item.linear: continue
|
||||||
html = item.data
|
html = item.data
|
||||||
title = xpath(html, '/h:html/h:head/h:title/text()')
|
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
|
||||||
title = title[0].strip() if title else None
|
title = COLLAPSE_RE(' ', title.strip())
|
||||||
if title: titles.append(title)
|
if title:
|
||||||
|
titles.append(title)
|
||||||
headers.append('(unlabled)')
|
headers.append('(unlabled)')
|
||||||
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
|
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
|
||||||
expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,)
|
expr = '/h:html/h:body//h:%s[position()=1]/text()'
|
||||||
header = xpath(html, expr)
|
header = ''.join(xpath(html % tag, expr))
|
||||||
|
header = COLLAPSE_RE.sub(' ', header.strip())
|
||||||
if header:
|
if header:
|
||||||
headers[-1] = header[0]
|
headers[-1] = header
|
||||||
break
|
break
|
||||||
use = titles
|
use = titles
|
||||||
if len(titles) > len(set(titles)):
|
if len(titles) > len(set(titles)):
|
||||||
@ -1164,12 +1251,71 @@ class OEBBook(object):
|
|||||||
toc.add(title, item.href)
|
toc.add(title, item.href)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _toc_from_opf(self, opf):
|
def _toc_from_opf(self, opf, item):
    """Populate the table of contents from the best available source.

    Sources are tried in order: the NCX item, the OPF tour, an HTML TOC and
    finally the spine. A warning is logged when neither of the first two
    yields a table of contents.
    """
    if self._toc_from_ncx(item) or self._toc_from_tour(opf):
        return
    self.logger.warn('No metadata table of contents found')
    if not self._toc_from_html(opf):
        self._toc_from_spine(opf)
|
||||||
|
|
||||||
|
def _pages_from_ncx(self, opf, item):
    """Build ``self.pages`` from the ``pageList`` of the NCX item *item*.

    Returns False, without touching ``self.pages``, when *item* is None or
    the NCX has no ``pageTarget`` elements; True otherwise. Targets without
    a ``src`` are skipped. *opf* is not used.
    """
    if item is None:
        return False
    targets = xpath(item.data, 'ncx:pageList/ncx:pageTarget')
    if not targets:
        return False
    self.pages = collected = PageList()
    for target in targets:
        label = ''.join(xpath(target, 'ncx:navLabel/ncx:text/text()'))
        label = COLLAPSE_RE.sub(' ', label.strip())
        sources = xpath(target, 'ncx:content/@src')
        if not sources:
            continue
        collected.add(
            label,
            item.abshref(urlnormalize(sources[0])),
            type=target.get('type', 'normal'),
            id=target.get('id'),
            klass=target.get('class'))
    return True
|
||||||
|
|
||||||
|
def _find_page_map(self, opf):
    """Find the page-map manifest item, remove it from the manifest, return it.

    The spine's ``page-map`` attribute is authoritative when present: if
    the id it names is not in the manifest the result is None, with no
    fallback. Without that attribute, the first manifest item whose media
    type is ``PAGE_MAP_MIME`` is used. Returns None when nothing is found.
    """
    map_ids = xpath(opf, '/o2:package/o2:spine/@page-map')
    if not map_ids:
        for candidate in self.manifest.values():
            if candidate.media_type == PAGE_MAP_MIME:
                self.manifest.remove(candidate)
                return candidate
        return None
    map_id = map_ids[0]
    if map_id not in self.manifest.ids:
        return None
    found = self.manifest.ids[map_id]
    self.manifest.remove(found)
    return found
|
||||||
|
|
||||||
|
def _pages_from_page_map(self, opf):
    """Build ``self.pages`` from the package's page-map document.

    Returns False when no page-map item can be found, True otherwise.
    ``page`` elements without an ``href`` are skipped; a missing ``name``
    is treated as the empty string.
    """
    item = self._find_page_map(opf)
    if item is None:
        return False
    self.pages = collected = PageList()
    for entry in xpath(item.data, 'o2:page'):
        raw_name = entry.get('name', '')
        target = entry.get('href')
        if not target:
            continue
        collected.add(COLLAPSE_RE.sub(' ', raw_name.strip()),
                      item.abshref(urlnormalize(target)))
    return True
|
||||||
|
|
||||||
|
def _pages_from_opf(self, opf, item):
    """Populate ``self.pages`` from the NCX page list or the page-map.

    The NCX item is tried first, then the page-map; when neither provides
    pages, ``self.pages`` is set to an empty ``PageList``.
    """
    if not (self._pages_from_ncx(opf, item)
            or self._pages_from_page_map(opf)):
        self.pages = PageList()
|
||||||
|
|
||||||
def _cover_from_html(self, hcover):
|
def _cover_from_html(self, hcover):
|
||||||
with TemporaryDirectory('_html_cover') as tdir:
|
with TemporaryDirectory('_html_cover') as tdir:
|
||||||
writer = DirWriter()
|
writer = DirWriter()
|
||||||
@ -1228,7 +1374,9 @@ class OEBBook(object):
|
|||||||
self._manifest_from_opf(opf)
|
self._manifest_from_opf(opf)
|
||||||
self._spine_from_opf(opf)
|
self._spine_from_opf(opf)
|
||||||
self._guide_from_opf(opf)
|
self._guide_from_opf(opf)
|
||||||
self._toc_from_opf(opf)
|
item = self._find_ncx(opf)
|
||||||
|
self._toc_from_opf(opf, item)
|
||||||
|
self._pages_from_opf(opf, item)
|
||||||
self._ensure_cover_image()
|
self._ensure_cover_image()
|
||||||
|
|
||||||
def translate(self, text):
|
def translate(self, text):
|
||||||
@ -1236,6 +1384,20 @@ class OEBBook(object):
|
|||||||
lang = lang.split('-', 1)[0].lower()
|
lang = lang.split('-', 1)[0].lower()
|
||||||
return translate(lang, text)
|
return translate(lang, text)
|
||||||
|
|
||||||
|
def decode(self, data):
    """Return *data* as unicode text.

    Unicode input is returned unchanged. Byte strings are tried as UTF-8,
    then UTF-16, then ``self.encoding`` when one is set; the first codec
    that does not raise UnicodeDecodeError wins. If all fail, the first
    element of ``xml_to_unicode(data)`` is returned.
    """
    if isinstance(data, unicode):
        return data
    candidates = ['utf-8', 'utf-16']
    if self.encoding is not None:
        candidates.append(self.encoding)
    for codec in candidates:
        try:
            decoded = data.decode(codec)
        except UnicodeDecodeError:
            continue
        else:
            return decoded
    decoded, _ = xml_to_unicode(data)
    return decoded
|
||||||
|
|
||||||
def to_opf1(self):
|
def to_opf1(self):
|
||||||
package = etree.Element('package',
|
package = etree.Element('package',
|
||||||
attrib={'unique-identifier': self.uid.id})
|
attrib={'unique-identifier': self.uid.id})
|
||||||
@ -1249,6 +1411,34 @@ class OEBBook(object):
|
|||||||
guide = self.guide.to_opf1(package)
|
guide = self.guide.to_opf1(package)
|
||||||
return {OPF_MIME: ('content.opf', package)}
|
return {OPF_MIME: ('content.opf', package)}
|
||||||
|
|
||||||
|
def _update_playorder(self, ncx):
    """Fill in the ``playOrder`` attributes of *ncx* from spine order.

    Every ``src`` referenced by an ``ncx:content`` element is numbered, from
    1, in the order its target is met while walking ``self.spine``: first
    the spine item's own href, then each body element whose ``id`` or
    ``name`` forms a referenced ``href#fragment``. Elements carrying a
    ``playOrder`` attribute and a ``content`` child then receive the number
    of their ``src``.

    A ``src`` that was never met (for example one pointing outside the
    spine) used to raise KeyError; it now keeps its existing ``playOrder``
    value and a warning is logged.
    """
    targets = set(xpath(ncx, '//ncx:content/@src'))
    positions = {}
    counter = 1  # named ``counter`` rather than shadowing builtin ``next``
    anchors = XPath('h:body//*[@id or @name]')
    for item in self.spine:
        base = item.href
        if base in targets:
            positions[base] = counter
            counter += 1
        for elem in anchors(item.data):
            matched = False
            for attr in ('id', 'name'):
                fragment = elem.get(attr)
                if not fragment:
                    continue
                href = '#'.join([base, fragment])
                if href in targets:
                    positions[href] = counter
                    matched = True
            # An element matched through both attributes still consumes a
            # single position.
            if matched:
                counter += 1
    content_src = XPath('ncx:content/@src')
    for elem in xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]'):
        src = content_src(elem)[0]
        if src not in positions:
            self.logger.warn('No play order position found for %r' % src)
            continue
        elem.attrib['playOrder'] = str(positions[src])
    return
|
||||||
|
|
||||||
def _to_ncx(self):
|
def _to_ncx(self):
|
||||||
lang = unicode(self.metadata.language[0])
|
lang = unicode(self.metadata.language[0])
|
||||||
ncx = etree.Element(NCX('ncx'),
|
ncx = etree.Element(NCX('ncx'),
|
||||||
@ -1256,35 +1446,50 @@ class OEBBook(object):
|
|||||||
nsmap={None: NCX_NS})
|
nsmap={None: NCX_NS})
|
||||||
head = etree.SubElement(ncx, NCX('head'))
|
head = etree.SubElement(ncx, NCX('head'))
|
||||||
etree.SubElement(head, NCX('meta'),
|
etree.SubElement(head, NCX('meta'),
|
||||||
attrib={'name': 'dtb:uid', 'content': unicode(self.uid)})
|
name='dtb:uid', content=unicode(self.uid))
|
||||||
etree.SubElement(head, NCX('meta'),
|
etree.SubElement(head, NCX('meta'),
|
||||||
attrib={'name': 'dtb:depth', 'content': str(self.toc.depth())})
|
name='dtb:depth', content=str(self.toc.depth()))
|
||||||
|
generator = ''.join(['calibre (', calibre.__version__, ')'])
|
||||||
etree.SubElement(head, NCX('meta'),
|
etree.SubElement(head, NCX('meta'),
|
||||||
attrib={'name': 'dtb:totalPageCount', 'content': '0'})
|
name='dtb:generator', content=generator)
|
||||||
etree.SubElement(head, NCX('meta'),
|
etree.SubElement(head, NCX('meta'),
|
||||||
attrib={'name': 'dtb:maxPageNumber', 'content': '0'})
|
name='dtb:totalPageCount', content=str(len(self.pages)))
|
||||||
|
maxpnum = etree.SubElement(head, NCX('meta'),
|
||||||
|
name='dtb:maxPageNumber', content='0')
|
||||||
title = etree.SubElement(ncx, NCX('docTitle'))
|
title = etree.SubElement(ncx, NCX('docTitle'))
|
||||||
text = etree.SubElement(title, NCX('text'))
|
text = etree.SubElement(title, NCX('text'))
|
||||||
text.text = unicode(self.metadata.title[0])
|
text.text = unicode(self.metadata.title[0])
|
||||||
navmap = etree.SubElement(ncx, NCX('navMap'))
|
navmap = etree.SubElement(ncx, NCX('navMap'))
|
||||||
self.toc.to_ncx(navmap)
|
self.toc.to_ncx(navmap)
|
||||||
|
if len(self.pages) > 0:
|
||||||
|
plist = self.pages.to_ncx(ncx)
|
||||||
|
value = max(int(x) for x in xpath(plist, '//@value'))
|
||||||
|
maxpnum.attrib['content'] = str(value)
|
||||||
|
self._update_playorder(ncx)
|
||||||
return ncx
|
return ncx
|
||||||
|
|
||||||
def to_opf2(self):
|
def to_opf2(self, page_map=False):
    """Serialize the book as OPF 2.0 documents.

    :param page_map: when true and the book has pages, also emit a
        page-map document and reference it from the spine.
    :return: dict mapping a MIME type to a ``(href, element)`` pair; it
        always holds the OPF package and the NCX, plus the page-map when
        one was requested and generated.
    """
    package = etree.Element(OPF('package'),
        attrib={'version': '2.0', 'unique-identifier': self.uid.id},
        nsmap={None: OPF2_NS})
    # Each section serializer is called in the original order.
    self.metadata.to_opf2(package)
    manifest = self.manifest.to_opf2(package)
    spine = self.spine.to_opf2(package)
    self.guide.to_opf2(package)
    results = {OPF_MIME: ('content.opf', package)}

    def register(id_hint, href_hint, mime, spine_attr):
        # Reserve an id/href pair, list it in the manifest element and point
        # the spine at it; return the href.
        id, href = self.manifest.generate(id_hint, href_hint)
        etree.SubElement(manifest, OPF('item'), id=id, href=href,
                         attrib={'media-type': mime})
        spine.attrib[spine_attr] = id
        return href

    ncx_href = register('ncx', 'toc.ncx', NCX_MIME, 'toc')
    results[NCX_MIME] = (ncx_href, self._to_ncx())
    if page_map and len(self.pages) > 0:
        pmap_href = register('page-map', 'page-map.xml',
                             PAGE_MAP_MIME, 'page-map')
        results[PAGE_MAP_MIME] = (pmap_href, self.pages.to_page_map())
    return results
|
||||||
|
|
||||||
|
|
||||||
def main(argv=sys.argv):
|
def main(argv=sys.argv):
|
||||||
@ -1292,7 +1497,7 @@ def main(argv=sys.argv):
|
|||||||
oeb = OEBBook(arg)
|
oeb = OEBBook(arg)
|
||||||
for name, doc in oeb.to_opf1().values():
|
for name, doc in oeb.to_opf1().values():
|
||||||
print etree.tostring(doc, pretty_print=True)
|
print etree.tostring(doc, pretty_print=True)
|
||||||
for name, doc in oeb.to_opf2().values():
|
for name, doc in oeb.to_opf2(page_map=True).values():
|
||||||
print etree.tostring(doc, pretty_print=True)
|
print etree.tostring(doc, pretty_print=True)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
@ -109,6 +109,7 @@ class Stylizer(object):
|
|||||||
STYLESHEETS = {}
|
STYLESHEETS = {}
|
||||||
|
|
||||||
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
|
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
|
||||||
|
self.oeb = oeb
|
||||||
self.profile = profile
|
self.profile = profile
|
||||||
self.logger = oeb.logger
|
self.logger = oeb.logger
|
||||||
item = oeb.manifest.hrefs[path]
|
item = oeb.manifest.hrefs[path]
|
||||||
@ -117,7 +118,7 @@ class Stylizer(object):
|
|||||||
stylesheets = [HTML_CSS_STYLESHEET]
|
stylesheets = [HTML_CSS_STYLESHEET]
|
||||||
head = xpath(tree, '/h:html/h:head')[0]
|
head = xpath(tree, '/h:html/h:head')[0]
|
||||||
parser = cssutils.CSSParser()
|
parser = cssutils.CSSParser()
|
||||||
parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
|
parser.setFetcher(self._fetch_css_file)
|
||||||
for elem in head:
|
for elem in head:
|
||||||
if elem.tag == XHTML('style') and elem.text \
|
if elem.tag == XHTML('style') and elem.text \
|
||||||
and elem.get('type', CSS_MIME) in OEB_STYLES:
|
and elem.get('type', CSS_MIME) in OEB_STYLES:
|
||||||
@ -138,8 +139,7 @@ class Stylizer(object):
|
|||||||
if path in self.STYLESHEETS:
|
if path in self.STYLESHEETS:
|
||||||
stylesheet = self.STYLESHEETS[path]
|
stylesheet = self.STYLESHEETS[path]
|
||||||
else:
|
else:
|
||||||
data = XHTML_CSS_NAMESPACE
|
data = self._fetch_css_file(path)[1]
|
||||||
data += oeb.manifest.hrefs[path].data
|
|
||||||
stylesheet = parser.parseString(data, href=path)
|
stylesheet = parser.parseString(data, href=path)
|
||||||
stylesheet.namespaces['h'] = XHTML_NS
|
stylesheet.namespaces['h'] = XHTML_NS
|
||||||
self.STYLESHEETS[path] = stylesheet
|
self.STYLESHEETS[path] = stylesheet
|
||||||
@ -167,6 +167,15 @@ class Stylizer(object):
|
|||||||
for elem in xpath(tree, '//h:*[@style]'):
|
for elem in xpath(tree, '//h:*[@style]'):
|
||||||
self.style(elem)._apply_style_attr()
|
self.style(elem)._apply_style_attr()
|
||||||
|
|
||||||
|
def _fetch_css_file(self, path):
    """Return ``(None, css_text)`` for the manifest item at *path*.

    The item's data is decoded with ``self.oeb.decode`` and prefixed with
    ``XHTML_CSS_NAMESPACE``. When *path* is not in the manifest the result
    is ``(None, None)``.
    """
    manifest_hrefs = self.oeb.manifest.hrefs
    if path in manifest_hrefs:
        text = self.oeb.decode(manifest_hrefs[path].data)
        return (None, XHTML_CSS_NAMESPACE + text)
    return (None, None)
|
||||||
|
|
||||||
def flatten_rule(self, rule, href, index):
|
def flatten_rule(self, rule, href, index):
|
||||||
results = []
|
results = []
|
||||||
if isinstance(rule, CSSStyleRule):
|
if isinstance(rule, CSSStyleRule):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user