mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make distinction between input and output encodings in OEBBook framework. Add encoding declaration to XML files produced by OEB plugin. Create OPF file in same encoding as input_encoding in HTML input plugin. This should fix all remaining issues with encoding handling for HTML/OPF input files.
This commit is contained in:
parent
ac1e73174a
commit
352b5d24ed
@ -679,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
|||||||
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
||||||
opts.preprocess_html)
|
opts.preprocess_html)
|
||||||
oeb = OEBBook(log, html_preprocessor,
|
oeb = OEBBook(log, html_preprocessor,
|
||||||
pretty_print=opts.pretty_print, encoding=encoding)
|
pretty_print=opts.pretty_print, input_encoding=encoding)
|
||||||
# Read OEB Book into OEBBook
|
# Read OEB Book into OEBBook
|
||||||
log('Parsing all content...')
|
log('Parsing all content...')
|
||||||
if reader is None:
|
if reader is None:
|
||||||
|
@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
|
|||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
from calibre.customize.conversion import OptionRecommendation
|
||||||
from calibre import unicode_path
|
from calibre import unicode_path
|
||||||
@ -264,7 +264,7 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
|
|
||||||
def convert(self, stream, opts, file_ext, log,
|
def convert(self, stream, opts, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from calibre.ebooks.metadata.meta import get_metadata
|
from calibre.ebooks.metadata.html import get_metadata_
|
||||||
|
|
||||||
basedir = os.getcwd()
|
basedir = os.getcwd()
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
@ -275,18 +275,16 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
opfpath = stream.name
|
opfpath = stream.name
|
||||||
else:
|
else:
|
||||||
filelist = get_filelist(stream.name, basedir, opts, log)
|
filelist = get_filelist(stream.name, basedir, opts, log)
|
||||||
mi = get_metadata(stream, 'html')
|
mi = get_metadata_(stream.read(), opts.input_encoding)
|
||||||
mi = OPFCreator(os.getcwdu(), mi)
|
mi = OPFCreator(os.getcwdu(), mi)
|
||||||
mi.guide = None
|
mi.guide = None
|
||||||
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
|
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
|
||||||
mi.create_manifest(entries)
|
mi.create_manifest(entries)
|
||||||
mi.create_spine([f.path for f in filelist])
|
mi.create_spine([f.path for f in filelist])
|
||||||
|
|
||||||
mi.render(open('metadata.opf', 'wb'))
|
mi.render(open('metadata.opf', 'wb'), encoding=opts.input_encoding)
|
||||||
opfpath = os.path.abspath('metadata.opf')
|
opfpath = os.path.abspath('metadata.opf')
|
||||||
|
|
||||||
opf = OPF(opfpath, os.getcwdu())
|
|
||||||
|
|
||||||
if opts.dont_package:
|
if opts.dont_package:
|
||||||
return opfpath
|
return opfpath
|
||||||
|
|
||||||
|
@ -12,9 +12,18 @@ import re
|
|||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
|
|
||||||
def get_metadata(stream):
|
def get_metadata(stream):
|
||||||
src = xml_to_unicode(stream.read())[0]
|
src = stream.read()
|
||||||
|
return get_metadata_(src)
|
||||||
|
|
||||||
|
def get_metadata_(src, encoding=None):
|
||||||
|
if not isinstance(src, unicode):
|
||||||
|
if not encoding:
|
||||||
|
src = xml_to_unicode(src)[0]
|
||||||
|
else:
|
||||||
|
src = src.decode(encoding, 'replace')
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
title = None
|
title = None
|
||||||
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||||
@ -26,29 +35,29 @@ def get_metadata(stream):
|
|||||||
match = pat.search(src)
|
match = pat.search(src)
|
||||||
if match:
|
if match:
|
||||||
title = match.group(1)
|
title = match.group(1)
|
||||||
|
|
||||||
# Author
|
# Author
|
||||||
author = None
|
author = None
|
||||||
pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||||
match = pat.search(src)
|
match = pat.search(src)
|
||||||
if match:
|
if match:
|
||||||
author = match.group(2).replace(',', ';')
|
author = match.group(2).replace(',', ';')
|
||||||
|
|
||||||
mi = MetaInformation(title, [author] if author else None)
|
mi = MetaInformation(title, [author] if author else None)
|
||||||
|
|
||||||
# Publisher
|
# Publisher
|
||||||
pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||||
match = pat.search(src)
|
match = pat.search(src)
|
||||||
if match:
|
if match:
|
||||||
mi.publisher = match.group(2)
|
mi.publisher = match.group(2)
|
||||||
|
|
||||||
# ISBN
|
# ISBN
|
||||||
pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||||
match = pat.search(src)
|
match = pat.search(src)
|
||||||
if match:
|
if match:
|
||||||
isbn = match.group(1)
|
isbn = match.group(1)
|
||||||
mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
|
mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
|
||||||
|
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<package version="2.0"
|
<package version="2.0"
|
||||||
xmlns="http://www.idpf.org/2007/opf"
|
xmlns="http://www.idpf.org/2007/opf"
|
||||||
xmlns:py="http://genshi.edgewall.org/"
|
xmlns:py="http://genshi.edgewall.org/"
|
||||||
|
@ -924,9 +924,11 @@ class OPFCreator(MetaInformation):
|
|||||||
self.guide.set_basedir(self.base_path)
|
self.guide.set_basedir(self.base_path)
|
||||||
|
|
||||||
def render(self, opf_stream=sys.stdout, ncx_stream=None,
|
def render(self, opf_stream=sys.stdout, ncx_stream=None,
|
||||||
ncx_manifest_entry=None):
|
ncx_manifest_entry=None, encoding=None):
|
||||||
from calibre.resources import opf_template
|
from calibre.resources import opf_template
|
||||||
from calibre.utils.genshi.template import MarkupTemplate
|
from calibre.utils.genshi.template import MarkupTemplate
|
||||||
|
if encoding is None:
|
||||||
|
encoding = 'utf-8'
|
||||||
template = MarkupTemplate(opf_template)
|
template = MarkupTemplate(opf_template)
|
||||||
toc = getattr(self, 'toc', None)
|
toc = getattr(self, 'toc', None)
|
||||||
if self.manifest:
|
if self.manifest:
|
||||||
@ -948,7 +950,11 @@ class OPFCreator(MetaInformation):
|
|||||||
cover = os.path.abspath(os.path.join(self.base_path, cover))
|
cover = os.path.abspath(os.path.join(self.base_path, cover))
|
||||||
self.guide.set_cover(cover)
|
self.guide.set_cover(cover)
|
||||||
self.guide.set_basedir(self.base_path)
|
self.guide.set_basedir(self.base_path)
|
||||||
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
|
opf = template.generate(
|
||||||
|
__appname__=__appname__, mi=self,
|
||||||
|
__version__=__version__).render('xml', encoding=encoding)
|
||||||
|
opf_stream.write('<?xml version="1.0" encoding="%s" ?>\n'
|
||||||
|
%encoding.upper())
|
||||||
opf_stream.write(opf)
|
opf_stream.write(opf)
|
||||||
opf_stream.flush()
|
opf_stream.flush()
|
||||||
if toc is not None and ncx_stream is not None:
|
if toc is not None and ncx_stream is not None:
|
||||||
|
@ -1516,7 +1516,8 @@ class OEBBook(object):
|
|||||||
def __init__(self, logger,
|
def __init__(self, logger,
|
||||||
html_preprocessor,
|
html_preprocessor,
|
||||||
css_preprocessor=CSSPreProcessor(),
|
css_preprocessor=CSSPreProcessor(),
|
||||||
encoding='utf-8', pretty_print=False):
|
encoding='utf-8', pretty_print=False,
|
||||||
|
input_encoding='utf-8'):
|
||||||
"""Create empty book. Arguments:
|
"""Create empty book. Arguments:
|
||||||
|
|
||||||
:param:`encoding`: Default encoding for textual content read
|
:param:`encoding`: Default encoding for textual content read
|
||||||
@ -1549,6 +1550,7 @@ class OEBBook(object):
|
|||||||
"""
|
"""
|
||||||
_css_log_handler.log = logger
|
_css_log_handler.log = logger
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
self.input_encoding = input_encoding
|
||||||
self.html_preprocessor = html_preprocessor
|
self.html_preprocessor = html_preprocessor
|
||||||
self.css_preprocessor = css_preprocessor
|
self.css_preprocessor = css_preprocessor
|
||||||
self.pretty_print = pretty_print
|
self.pretty_print = pretty_print
|
||||||
@ -1588,9 +1590,9 @@ class OEBBook(object):
|
|||||||
return fix_data(data.decode('utf-16'))
|
return fix_data(data.decode('utf-16'))
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
if self.encoding is not None:
|
if self.input_encoding is not None:
|
||||||
try:
|
try:
|
||||||
return fix_data(data.decode(self.encoding, 'replace'))
|
return fix_data(data.decode(self.input_encoding, 'replace'))
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
|
@ -30,6 +30,7 @@ class OEBOutput(OutputFormatPlugin):
|
|||||||
raw = etree.tostring(root, pretty_print=True,
|
raw = etree.tostring(root, pretty_print=True,
|
||||||
encoding='utf-8')
|
encoding='utf-8')
|
||||||
with open(href, 'wb') as f:
|
with open(href, 'wb') as f:
|
||||||
|
f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
|
||||||
f.write(raw)
|
f.write(raw)
|
||||||
|
|
||||||
for item in oeb_book.manifest:
|
for item in oeb_book.manifest:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user