diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index eb61e6d988..9bab5d6701 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -679,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html) oeb = OEBBook(log, html_preprocessor, - pretty_print=opts.pretty_print, encoding=encoding) + pretty_print=opts.pretty_print, input_encoding=encoding) # Read OEB Book into OEBBook log('Parsing all content...') if reader is None: diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index f880d8731c..f566714878 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse from urllib import unquote from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.metadata.opf2 import OPFCreator, OPF +from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.chardet import xml_to_unicode from calibre.customize.conversion import OptionRecommendation from calibre import unicode_path @@ -264,7 +264,7 @@ class HTMLInput(InputFormatPlugin): def convert(self, stream, opts, file_ext, log, accelerators): - from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.html import get_metadata_ basedir = os.getcwd() self.opts = opts @@ -275,18 +275,16 @@ class HTMLInput(InputFormatPlugin): opfpath = stream.name else: filelist = get_filelist(stream.name, basedir, opts, log) - mi = get_metadata(stream, 'html') + mi = get_metadata_(stream.read(), opts.input_encoding) mi = OPFCreator(os.getcwdu(), mi) mi.guide = None entries = [(f.path, 'application/xhtml+xml') for f in filelist] mi.create_manifest(entries) mi.create_spine([f.path for f in filelist]) - mi.render(open('metadata.opf', 'wb')) + mi.render(open('metadata.opf', 'wb'), encoding=opts.input_encoding) opfpath = os.path.abspath('metadata.opf') - opf = OPF(opfpath, os.getcwdu()) - if opts.dont_package: return opfpath diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 9ef578c858..d5aa9b8bef 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -12,9 +12,18 @@ import re from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.chardet import xml_to_unicode + def get_metadata(stream): - src = xml_to_unicode(stream.read())[0] - + src = stream.read() + return get_metadata_(src) + +def get_metadata_(src, encoding=None): + if not isinstance(src, unicode): + if not encoding: + src = xml_to_unicode(src)[0] + else: + src = src.decode(encoding, 'replace') + # Title title = None pat = re.compile(r'', re.DOTALL) @@ -26,29 +35,29 @@ def get_metadata(stream): match = pat.search(src) if match: title = match.group(1) - + # Author author = None pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: author = match.group(2).replace(',', ';') - + mi = MetaInformation(title, [author] if author else None) - + # Publisher pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: mi.publisher = match.group(2) - + # ISBN pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: isbn = match.group(1) mi.isbn = re.sub(r'[^0-9xX]', '', isbn) - + return mi - - \ No newline at end of file + + diff --git a/src/calibre/ebooks/metadata/opf.xml b/src/calibre/ebooks/metadata/opf.xml index 619fb3301c..027d560ffa 100644 --- a/src/calibre/ebooks/metadata/opf.xml +++ b/src/calibre/ebooks/metadata/opf.xml @@ -1,4 +1,3 @@ - \n' + %encoding.upper()) opf_stream.write(opf) opf_stream.flush() if toc is not None and ncx_stream is not None: diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 55cc2f926b..e2a4875399 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1516,7 +1516,8 @@ class OEBBook(object): def __init__(self, logger, html_preprocessor, css_preprocessor=CSSPreProcessor(), - encoding='utf-8', pretty_print=False): + encoding='utf-8', pretty_print=False, + input_encoding='utf-8'): """Create empty book. Arguments: :param:`encoding`: Default encoding for textual content read @@ -1549,6 +1550,7 @@ class OEBBook(object): """ _css_log_handler.log = logger self.encoding = encoding + self.input_encoding = input_encoding self.html_preprocessor = html_preprocessor self.css_preprocessor = css_preprocessor self.pretty_print = pretty_print @@ -1588,9 +1590,9 @@ class OEBBook(object): return fix_data(data.decode('utf-16')) except UnicodeDecodeError: pass - if self.encoding is not None: + if self.input_encoding is not None: try: - return fix_data(data.decode(self.encoding, 'replace')) + return fix_data(data.decode(self.input_encoding, 'replace')) except UnicodeDecodeError: pass try: diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 6f141f7e5e..2cb513293c 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -30,6 +30,7 @@ class OEBOutput(OutputFormatPlugin): raw = etree.tostring(root, pretty_print=True, encoding='utf-8') with open(href, 'wb') as f: + f.write('\n') f.write(raw) for item in oeb_book.manifest: