Distinguish between input and output encodings in the OEBBook framework. Add an encoding declaration to the XML files produced by the OEB output plugin. In the HTML input plugin, create the OPF file in the same encoding as input_encoding. This should fix all remaining issues with encoding handling for HTML/OPF input files.

2025-07-08 18:54:09 -04:00 · 2009-05-24 20:24:06 -07:00 · 2009-05-24 20:24:06 -07:00 · 352b5d24ed
commit 352b5d24ed
parent ac1e73174a
7 changed files with 37 additions and 22 deletions
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -679,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
            opts.preprocess_html)
    oeb = OEBBook(log, html_preprocessor,
-            pretty_print=opts.pretty_print, encoding=encoding)
+            pretty_print=opts.pretty_print, input_encoding=encoding)
    # Read OEB Book into OEBBook
    log('Parsing all content...')
    if reader is None:
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
 from urllib import unquote

 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
+from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
@@ -264,7 +264,7 @@ class HTMLInput(InputFormatPlugin):

    def convert(self, stream, opts, file_ext, log,
                accelerators):
-        from calibre.ebooks.metadata.meta import get_metadata
+        from calibre.ebooks.metadata.html import get_metadata_

        basedir = os.getcwd()
        self.opts = opts
@@ -275,18 +275,16 @@ class HTMLInput(InputFormatPlugin):
            opfpath = stream.name
        else:
            filelist = get_filelist(stream.name, basedir, opts, log)
-            mi = get_metadata(stream, 'html')
+            mi = get_metadata_(stream.read(), opts.input_encoding)
            mi = OPFCreator(os.getcwdu(), mi)
            mi.guide = None
            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
            mi.create_manifest(entries)
            mi.create_spine([f.path for f in filelist])

-            mi.render(open('metadata.opf', 'wb'))
+            mi.render(open('metadata.opf', 'wb'), encoding=opts.input_encoding)
            opfpath = os.path.abspath('metadata.opf')

-        opf = OPF(opfpath, os.getcwdu())
-
        if opts.dont_package:
            return opfpath

--- a/src/calibre/ebooks/metadata/html.py
+++ b/src/calibre/ebooks/metadata/html.py
@@ -12,9 +12,18 @@ import re
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.chardet import xml_to_unicode

+
 def get_metadata(stream):
-    src = xml_to_unicode(stream.read())[0]
-    
+    src = stream.read()
+    return get_metadata_(src)
+
+def get_metadata_(src, encoding=None):
+    if not isinstance(src, unicode):
+        if not encoding:
+            src = xml_to_unicode(src)[0]
+        else:
+            src = src.decode(encoding, 'replace')
+
    # Title
    title = None
    pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
@@ -26,29 +35,29 @@ def get_metadata(stream):
        match = pat.search(src)
        if match:
            title = match.group(1)
-        
+
    # Author
    author = None
    pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        author = match.group(2).replace(',', ';')
-        
+
    mi = MetaInformation(title, [author] if author else None)
-    
+
    # Publisher
    pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        mi.publisher = match.group(2)
-        
+
    # ISBN
    pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        isbn = match.group(1)
        mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
-        
+
    return mi
-    
-    
+
+
--- a/src/calibre/ebooks/metadata/opf.xml
+++ b/src/calibre/ebooks/metadata/opf.xml
@@ -1,4 +1,3 @@
-<?xml version="1.0"  encoding="UTF-8"?>
 <package version="2.0" 
         xmlns="http://www.idpf.org/2007/opf" 
         xmlns:py="http://genshi.edgewall.org/" 
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -924,9 +924,11 @@ class OPFCreator(MetaInformation):
        self.guide.set_basedir(self.base_path)

    def render(self, opf_stream=sys.stdout, ncx_stream=None,
-               ncx_manifest_entry=None):
+               ncx_manifest_entry=None, encoding=None):
        from calibre.resources import opf_template
        from calibre.utils.genshi.template import MarkupTemplate
+        if encoding is None:
+            encoding = 'utf-8'
        template = MarkupTemplate(opf_template)
        toc = getattr(self, 'toc', None)
        if self.manifest:
@@ -948,7 +950,11 @@ class OPFCreator(MetaInformation):
                cover = os.path.abspath(os.path.join(self.base_path, cover))
            self.guide.set_cover(cover)
        self.guide.set_basedir(self.base_path)
-        opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
+        opf = template.generate(
+                __appname__=__appname__, mi=self,
+                __version__=__version__).render('xml', encoding=encoding)
+        opf_stream.write('<?xml version="1.0" encoding="%s" ?>\n'
+                %encoding.upper())
        opf_stream.write(opf)
        opf_stream.flush()
        if toc is not None and ncx_stream is not None:
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1516,7 +1516,8 @@ class OEBBook(object):
    def __init__(self, logger,
            html_preprocessor,
            css_preprocessor=CSSPreProcessor(),
-            encoding='utf-8', pretty_print=False):
+            encoding='utf-8', pretty_print=False,
+            input_encoding='utf-8'):
        """Create empty book.  Arguments:

        :param:`encoding`: Default encoding for textual content read
@@ -1549,6 +1550,7 @@ class OEBBook(object):
        """
        _css_log_handler.log = logger
        self.encoding = encoding
+        self.input_encoding = input_encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
        self.pretty_print = pretty_print
@@ -1588,9 +1590,9 @@ class OEBBook(object):
                return fix_data(data.decode('utf-16'))
            except UnicodeDecodeError:
                pass
-        if self.encoding is not None:
+        if self.input_encoding is not None:
            try:
-                return fix_data(data.decode(self.encoding, 'replace'))
+                return fix_data(data.decode(self.input_encoding, 'replace'))
            except UnicodeDecodeError:
                pass
        try:
--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@@ -30,6 +30,7 @@ class OEBOutput(OutputFormatPlugin):
                    raw = etree.tostring(root, pretty_print=True,
                            encoding='utf-8')
                    with open(href, 'wb') as f:
+                        f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
                        f.write(raw)

            for item in oeb_book.manifest: