Make a distinction between input and output encodings in the OEBBook framework. Add an encoding declaration to the XML files produced by the OEB output plugin. Create the OPF file in the same encoding as input_encoding in the HTML input plugin. This should fix all remaining issues with encoding handling for HTML/OPF input files.

This commit is contained in:
Kovid Goyal 2009-05-24 20:24:06 -07:00
parent ac1e73174a
commit 352b5d24ed
7 changed files with 37 additions and 22 deletions

View File

@ -679,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html) opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, encoding=encoding) pretty_print=opts.pretty_print, input_encoding=encoding)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook
log('Parsing all content...') log('Parsing all content...')
if reader is None: if reader is None:

View File

@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
from urllib import unquote from urllib import unquote
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path from calibre import unicode_path
@ -264,7 +264,7 @@ class HTMLInput(InputFormatPlugin):
def convert(self, stream, opts, file_ext, log, def convert(self, stream, opts, file_ext, log,
accelerators): accelerators):
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.html import get_metadata_
basedir = os.getcwd() basedir = os.getcwd()
self.opts = opts self.opts = opts
@ -275,18 +275,16 @@ class HTMLInput(InputFormatPlugin):
opfpath = stream.name opfpath = stream.name
else: else:
filelist = get_filelist(stream.name, basedir, opts, log) filelist = get_filelist(stream.name, basedir, opts, log)
mi = get_metadata(stream, 'html') mi = get_metadata_(stream.read(), opts.input_encoding)
mi = OPFCreator(os.getcwdu(), mi) mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in filelist] entries = [(f.path, 'application/xhtml+xml') for f in filelist]
mi.create_manifest(entries) mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist]) mi.create_spine([f.path for f in filelist])
mi.render(open('metadata.opf', 'wb')) mi.render(open('metadata.opf', 'wb'), encoding=opts.input_encoding)
opfpath = os.path.abspath('metadata.opf') opfpath = os.path.abspath('metadata.opf')
opf = OPF(opfpath, os.getcwdu())
if opts.dont_package: if opts.dont_package:
return opfpath return opfpath

View File

@ -12,9 +12,18 @@ import re
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
def get_metadata(stream): def get_metadata(stream):
src = xml_to_unicode(stream.read())[0] src = stream.read()
return get_metadata_(src)
def get_metadata_(src, encoding=None):
if not isinstance(src, unicode):
if not encoding:
src = xml_to_unicode(src)[0]
else:
src = src.decode(encoding, 'replace')
# Title # Title
title = None title = None
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL) pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
@ -26,29 +35,29 @@ def get_metadata(stream):
match = pat.search(src) match = pat.search(src)
if match: if match:
title = match.group(1) title = match.group(1)
# Author # Author
author = None author = None
pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL) pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
match = pat.search(src) match = pat.search(src)
if match: if match:
author = match.group(2).replace(',', ';') author = match.group(2).replace(',', ';')
mi = MetaInformation(title, [author] if author else None) mi = MetaInformation(title, [author] if author else None)
# Publisher # Publisher
pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL) pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
match = pat.search(src) match = pat.search(src)
if match: if match:
mi.publisher = match.group(2) mi.publisher = match.group(2)
# ISBN # ISBN
pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL) pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src) match = pat.search(src)
if match: if match:
isbn = match.group(1) isbn = match.group(1)
mi.isbn = re.sub(r'[^0-9xX]', '', isbn) mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
return mi return mi

View File

@ -1,4 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" <package version="2.0"
xmlns="http://www.idpf.org/2007/opf" xmlns="http://www.idpf.org/2007/opf"
xmlns:py="http://genshi.edgewall.org/" xmlns:py="http://genshi.edgewall.org/"

View File

@ -924,9 +924,11 @@ class OPFCreator(MetaInformation):
self.guide.set_basedir(self.base_path) self.guide.set_basedir(self.base_path)
def render(self, opf_stream=sys.stdout, ncx_stream=None, def render(self, opf_stream=sys.stdout, ncx_stream=None,
ncx_manifest_entry=None): ncx_manifest_entry=None, encoding=None):
from calibre.resources import opf_template from calibre.resources import opf_template
from calibre.utils.genshi.template import MarkupTemplate from calibre.utils.genshi.template import MarkupTemplate
if encoding is None:
encoding = 'utf-8'
template = MarkupTemplate(opf_template) template = MarkupTemplate(opf_template)
toc = getattr(self, 'toc', None) toc = getattr(self, 'toc', None)
if self.manifest: if self.manifest:
@ -948,7 +950,11 @@ class OPFCreator(MetaInformation):
cover = os.path.abspath(os.path.join(self.base_path, cover)) cover = os.path.abspath(os.path.join(self.base_path, cover))
self.guide.set_cover(cover) self.guide.set_cover(cover)
self.guide.set_basedir(self.base_path) self.guide.set_basedir(self.base_path)
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml') opf = template.generate(
__appname__=__appname__, mi=self,
__version__=__version__).render('xml', encoding=encoding)
opf_stream.write('<?xml version="1.0" encoding="%s" ?>\n'
%encoding.upper())
opf_stream.write(opf) opf_stream.write(opf)
opf_stream.flush() opf_stream.flush()
if toc is not None and ncx_stream is not None: if toc is not None and ncx_stream is not None:

View File

@ -1516,7 +1516,8 @@ class OEBBook(object):
def __init__(self, logger, def __init__(self, logger,
html_preprocessor, html_preprocessor,
css_preprocessor=CSSPreProcessor(), css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False): encoding='utf-8', pretty_print=False,
input_encoding='utf-8'):
"""Create empty book. Arguments: """Create empty book. Arguments:
:param:`encoding`: Default encoding for textual content read :param:`encoding`: Default encoding for textual content read
@ -1549,6 +1550,7 @@ class OEBBook(object):
""" """
_css_log_handler.log = logger _css_log_handler.log = logger
self.encoding = encoding self.encoding = encoding
self.input_encoding = input_encoding
self.html_preprocessor = html_preprocessor self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor self.css_preprocessor = css_preprocessor
self.pretty_print = pretty_print self.pretty_print = pretty_print
@ -1588,9 +1590,9 @@ class OEBBook(object):
return fix_data(data.decode('utf-16')) return fix_data(data.decode('utf-16'))
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
if self.encoding is not None: if self.input_encoding is not None:
try: try:
return fix_data(data.decode(self.encoding, 'replace')) return fix_data(data.decode(self.input_encoding, 'replace'))
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
try: try:

View File

@ -30,6 +30,7 @@ class OEBOutput(OutputFormatPlugin):
raw = etree.tostring(root, pretty_print=True, raw = etree.tostring(root, pretty_print=True,
encoding='utf-8') encoding='utf-8')
with open(href, 'wb') as f: with open(href, 'wb') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
f.write(raw) f.write(raw)
for item in oeb_book.manifest: for item in oeb_book.manifest: