Forced default encoding:

- Support for forced default encoding of HTML content of OEBs
  - oeb2mobi command-line option to set the encoding
This commit is contained in:
Marshall T. Vandegrift 2009-01-20 22:49:30 -05:00
parent bf50a850f4
commit 7abdacff58
2 changed files with 20 additions and 10 deletions

View File

@ -560,7 +560,7 @@ def oeb2mobi(opts, inpath):
compression = PALMDOC if opts.compress else UNCOMPRESSED
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
context = Context(source, dest)
oeb = OEBBook(inpath, logger=logger)
oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding)
tocadder = HTMLTOCAdder(title=opts.toc_title)
tocadder.transform(oeb, context)
mangler = CaseMangler()

View File

@ -90,6 +90,9 @@ def prefixname(name, nsrmap):
return barename(name)
return ':'.join((prefix, barename(name)))
def XPath(expr):
return etree.XPath(expr, namespaces=XPNSMAP)
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
@ -292,15 +295,19 @@ class Metadata(object):
class Manifest(object):
class Item(object):
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
def __init__(self, id, href, media_type,
def __init__(self, oeb, id, href, media_type,
fallback=None, loader=str, data=None):
self.oeb = oeb
self.id = id
self.href = self.path = urlnormalize(href)
self.media_type = media_type
self.fallback = fallback
self.spine_position = None
self.linear = True
if loader is None and data is None:
loader = oeb.container.read
self._loader = loader
self._data = data
@ -309,16 +316,20 @@ class Manifest(object):
% (self.id, self.href, self.media_type)
def _force_xhtml(self, data):
if self.oeb.encoding is not None:
data = data.decode(self.oeb.encoding, 'replace')
try:
data = etree.fromstring(data, parser=XML_PARSER)
except etree.XMLSyntaxError:
data = html.fromstring(data, parser=XML_PARSER)
data = html.fromstring(data)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data, parser=XML_PARSER)
if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data, parser=XML_PARSER)
for meta in self.META_XP(data):
meta.getparent().remove(meta)
return data
def data():
@ -395,9 +406,8 @@ class Manifest(object):
self.hrefs = {}
def add(self, id, href, media_type, fallback=None, loader=None, data=None):
loader = loader or self.oeb.container.read
item = self.Item(
id, href, media_type, fallback, loader, data)
self.oeb, id, href, media_type, fallback, loader, data)
self.ids[item.id] = item
self.hrefs[item.href] = item
return item
@ -607,9 +617,7 @@ class Guide(object):
__iter__ = iterkeys
def values(self):
values = list(self.refs.values())
values.sort()
return values
return sorted(self.refs.values())
def items(self):
for type, ref in self.refs.items():
@ -713,11 +721,13 @@ class TOC(object):
class OEBBook(object):
def __init__(self, opfpath=None, container=None, logger=FauxLogger()):
def __init__(self, opfpath=None, container=None, encoding=None,
logger=FauxLogger()):
if opfpath and not container:
container = DirContainer(os.path.dirname(opfpath))
opfpath = os.path.basename(opfpath)
self.container = container
self.encoding = encoding
self.logger = logger
if opfpath or container:
opf = self._read_opf(opfpath)