From 7abdacff582945d9a9e04dc6a4aeffca036e02c2 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Tue, 20 Jan 2009 22:49:30 -0500 Subject: [PATCH] Forced default encoding: - Support for forced default encoding of HTML content of OEBs - oeb2mobi command-line option to set the encoding --- src/calibre/ebooks/mobi/writer.py | 2 +- src/calibre/ebooks/oeb/base.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 1d9152c446..fdafd2e08b 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -560,7 +560,7 @@ def oeb2mobi(opts, inpath): compression = PALMDOC if opts.compress else UNCOMPRESSED imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None context = Context(source, dest) - oeb = OEBBook(inpath, logger=logger) + oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding) tocadder = HTMLTOCAdder(title=opts.toc_title) tocadder.transform(oeb, context) mangler = CaseMangler() diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index c44ffcd8d9..c2d30eb2c3 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -90,6 +90,9 @@ def prefixname(name, nsrmap): return barename(name) return ':'.join((prefix, barename(name))) +def XPath(expr): + return etree.XPath(expr, namespaces=XPNSMAP) + def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) @@ -292,15 +295,19 @@ class Metadata(object): class Manifest(object): class Item(object): NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') + META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') - def __init__(self, id, href, media_type, + def __init__(self, oeb, id, href, media_type, fallback=None, loader=str, data=None): + self.oeb = oeb self.id = id self.href = self.path = urlnormalize(href) self.media_type = media_type self.fallback = fallback self.spine_position = None self.linear = True + if loader is None and data is None: + loader = oeb.container.read self._loader = loader self._data = data @@ -309,16 +316,20 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): + if self.oeb.encoding is not None: + data = data.decode(self.oeb.encoding, 'replace') try: data = etree.fromstring(data, parser=XML_PARSER) except etree.XMLSyntaxError: - data = html.fromstring(data, parser=XML_PARSER) + data = html.fromstring(data) data = etree.tostring(data, encoding=unicode) data = etree.fromstring(data, parser=XML_PARSER) if namespace(data.tag) != XHTML_NS: data.attrib['xmlns'] = XHTML_NS - data = etree.tostring(data) + data = etree.tostring(data, encoding=unicode) data = etree.fromstring(data, parser=XML_PARSER) + for meta in self.META_XP(data): + meta.getparent().remove(meta) return data def data(): @@ -395,9 +406,8 @@ class Manifest(object): self.hrefs = {} def add(self, id, href, media_type, fallback=None, loader=None, data=None): - loader = loader or self.oeb.container.read item = self.Item( - id, href, media_type, fallback, loader, data) + self.oeb, id, href, media_type, fallback, loader, data) self.ids[item.id] = item self.hrefs[item.href] = item return item @@ -607,9 +617,7 @@ class Guide(object): __iter__ = iterkeys def values(self): - values = list(self.refs.values()) - values.sort() - return values + return sorted(self.refs.values()) def items(self): for type, ref in self.refs.items(): @@ -713,11 +721,13 @@ class TOC(object): class OEBBook(object): - def __init__(self, opfpath=None, container=None, logger=FauxLogger()): + def __init__(self, opfpath=None, container=None, encoding=None, + logger=FauxLogger()): if opfpath and not container: container = DirContainer(os.path.dirname(opfpath)) opfpath = os.path.basename(opfpath) self.container = container + self.encoding = encoding self.logger = logger if opfpath or container: opf = self._read_opf(opfpath)