diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index eb61e6d988..9bab5d6701 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -679,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html) oeb = OEBBook(log, html_preprocessor, - pretty_print=opts.pretty_print, encoding=encoding) + pretty_print=opts.pretty_print, input_encoding=encoding) # Read OEB Book into OEBBook log('Parsing all content...') if reader is None: diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index f880d8731c..f566714878 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse from urllib import unquote from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.metadata.opf2 import OPFCreator, OPF +from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.chardet import xml_to_unicode from calibre.customize.conversion import OptionRecommendation from calibre import unicode_path @@ -264,7 +264,7 @@ class HTMLInput(InputFormatPlugin): def convert(self, stream, opts, file_ext, log, accelerators): - from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.html import get_metadata_ basedir = os.getcwd() self.opts = opts @@ -275,18 +275,16 @@ class HTMLInput(InputFormatPlugin): opfpath = stream.name else: filelist = get_filelist(stream.name, basedir, opts, log) - mi = get_metadata(stream, 'html') + mi = get_metadata_(stream.read(), opts.input_encoding) mi = OPFCreator(os.getcwdu(), mi) mi.guide = None entries = [(f.path, 'application/xhtml+xml') for f in filelist] mi.create_manifest(entries) mi.create_spine([f.path for f in filelist]) - mi.render(open('metadata.opf', 'wb')) + mi.render(open('metadata.opf', 'wb'), encoding=opts.input_encoding) opfpath = os.path.abspath('metadata.opf') - opf = OPF(opfpath, os.getcwdu()) - if opts.dont_package: return opfpath diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 9ef578c858..d5aa9b8bef 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -12,9 +12,18 @@ import re from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.chardet import xml_to_unicode + def get_metadata(stream): - src = xml_to_unicode(stream.read())[0] - + src = stream.read() + return get_metadata_(src) + +def get_metadata_(src, encoding=None): + if not isinstance(src, unicode): + if not encoding: + src = xml_to_unicode(src)[0] + else: + src = src.decode(encoding, 'replace') + # Title title = None pat = re.compile(r'', re.DOTALL) @@ -26,29 +35,29 @@ def get_metadata(stream): match = pat.search(src) if match: title = match.group(1) - + # Author author = None pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: author = match.group(2).replace(',', ';') - + mi = MetaInformation(title, [author] if author else None) - + # Publisher pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: mi.publisher = match.group(2) - + # ISBN pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: isbn = match.group(1) mi.isbn = re.sub(r'[^0-9xX]', '', isbn) - + return mi - - \ No newline at end of file + + diff --git a/src/calibre/ebooks/metadata/opf.xml b/src/calibre/ebooks/metadata/opf.xml index 619fb3301c..027d560ffa 100644 --- a/src/calibre/ebooks/metadata/opf.xml +++ b/src/calibre/ebooks/metadata/opf.xml @@ -1,4 +1,3 @@ - \n' + %encoding.upper()) opf_stream.write(opf) opf_stream.flush() if toc is not None and ncx_stream is not None: diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 55cc2f926b..e2a4875399 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1516,7 +1516,8 @@ class OEBBook(object): def __init__(self, logger, html_preprocessor, css_preprocessor=CSSPreProcessor(), - encoding='utf-8', pretty_print=False): + encoding='utf-8', pretty_print=False, + input_encoding='utf-8'): """Create empty book. Arguments: :param:`encoding`: Default encoding for textual content read @@ -1549,6 +1550,7 @@ class OEBBook(object): """ _css_log_handler.log = logger self.encoding = encoding + self.input_encoding = input_encoding self.html_preprocessor = html_preprocessor self.css_preprocessor = css_preprocessor self.pretty_print = pretty_print @@ -1588,9 +1590,9 @@ class OEBBook(object): return fix_data(data.decode('utf-16')) except UnicodeDecodeError: pass - if self.encoding is not None: + if self.input_encoding is not None: try: - return fix_data(data.decode(self.encoding, 'replace')) + return fix_data(data.decode(self.input_encoding, 'replace')) except UnicodeDecodeError: pass try: diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 46b3e64644..6653240629 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -130,6 +130,9 @@ class EbookIterator(object): plumber.opts.dont_package = True if hasattr(plumber.opts, 'no_process'): plumber.opts.no_process = True + if hasattr(plumber.input_plugin, '_preprocess_html_for_viewer'): + plumber.input_plugin._preprocess_html_for_viewer = True + self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), plumber.opts, plumber.input_fmt, self.log, {}, self.base) diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 6f141f7e5e..2cb513293c 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -30,6 +30,7 @@ class OEBOutput(OutputFormatPlugin): raw = etree.tostring(root, pretty_print=True, encoding='utf-8') with open(href, 'wb') as f: + f.write('\n') f.write(raw) for item in oeb_book.manifest: diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 6aa695c912..d6e66ebd74 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -17,10 +17,18 @@ class PDFInput(InputFormatPlugin): description = 'Convert PDF files to HTML' file_types = set(['pdf']) + _preprocess_html_for_viewer = False + def convert(self, stream, options, file_ext, log, accelerators): html = pdftohtml(stream.name) + if self._preprocess_html_for_viewer: + from calibre.ebooks.conversion.preprocess import HTMLPreProcessor + prepro = HTMLPreProcessor(lambda x:x, False) + html = prepro(html.decode('utf-8')).encode('utf-8') + + with open('index.html', 'wb') as index: index.write(html) diff --git a/src/calibre/utils/ipc/worker.py b/src/calibre/utils/ipc/worker.py index 8898d753a2..de220340db 100644 --- a/src/calibre/utils/ipc/worker.py +++ b/src/calibre/utils/ipc/worker.py @@ -30,6 +30,13 @@ PARALLEL_FUNCS = { 'read_metadata' : ('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'), + 'read_pdf_metadata' : + ('calibre.utils.podofo.__init__', 'get_metadata_', None), + + 'write_pdf_metadata' : + ('calibre.utils.podofo.__init__', 'set_metadata_', None), + + 'save_book' : ('calibre.ebooks.metadata.worker', 'save_book', 'notification'), } diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py index 3db6699dfe..8654a95c04 100644 --- a/src/calibre/utils/podofo/__init__.py +++ b/src/calibre/utils/podofo/__init__.py @@ -6,11 +6,14 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os +import os, time from calibre.constants import plugins, preferred_encoding from calibre.ebooks.metadata import MetaInformation, string_to_authors, \ authors_to_string +from calibre.utils.ipc.job import ParallelJob +from calibre.utils.ipc.server import Server +from calibre.ptempfile import PersistentTemporaryFile podofo, podofo_err = plugins['podofo'] @@ -19,22 +22,43 @@ class Unavailable(Exception): pass def get_metadata(stream): if not podofo: raise Unavailable(podofo_err) - raw = stream.read() - stream.seek(0) - p = podofo.PDFDoc() - p.load(raw) - title = p.title - if not title: + pt = PersistentTemporaryFile('_podofo.pdf') + pt.write(stream.read()) + pt.close() + server = Server(pool_size=1) + job = ParallelJob('read_pdf_metadata', 'Read pdf metadata', + lambda x,y:x, args=[pt.name]) + server.add_job(job) + while not job.is_finished: + time.sleep(0.1) + job.update() + + job.update() + server.close() + if job.result is None: + raise ValueError('Failed to read metadata: ' + job.details) + title, authors, creator = job.result + if title == '_': title = getattr(stream, 'name', _('Unknown')) - title = os.path.splitext(os.path.basename(title))[0] - author = p.author - authors = string_to_authors(author) if author else [_('Unknown')] + title = os.path.splitext(title)[0] + mi = MetaInformation(title, authors) - creator = p.creator if creator: mi.book_producer = creator + if os.path.exists(pt.name): os.remove(pt.name) return mi +def get_metadata_(path): + p = podofo.PDFDoc() + p.open(path) + title = p.title + if not title: + title = '_' + author = p.author + authors = string_to_authors(author) if author else [_('Unknown')] + creator = p.creator + return (title, authors, creator) + def prep(val): if not val: return u'' @@ -45,21 +69,43 @@ def prep(val): def set_metadata(stream, mi): if not podofo: raise Unavailable(podofo_err) - raw = stream.read() + pt = PersistentTemporaryFile('_podofo.pdf') + pt.write(stream.read()) + pt.close() + server = Server(pool_size=1) + job = ParallelJob('write_pdf_metadata', 'Write pdf metadata', + lambda x,y:x, args=[pt.name, mi.title, mi.authors, mi.book_producer]) + server.add_job(job) + while not job.is_finished: + time.sleep(0.1) + job.update() + + job.update() + server.close() + if job.result is not None: + stream.seek(0) + stream.truncate() + stream.write(job.result) + stream.flush() + stream.seek(0) + + + +def set_metadata_(path, title, authors, bkp): p = podofo.PDFDoc() - p.load(raw) - title = prep(mi.title) + p.open(path) + title = prep(title) touched = False if title: p.title = title touched = True - author = prep(authors_to_string(mi.authors)) + author = prep(authors_to_string(authors)) if author: p.author = author touched = True - bkp = prep(mi.book_producer) + bkp = prep(bkp) if bkp: p.creator = bkp touched = True @@ -68,12 +114,7 @@ def set_metadata(stream, mi): from calibre.ptempfile import TemporaryFile with TemporaryFile('_pdf_set_metadata.pdf') as f: p.save(f) - raw = open(f, 'rb').read() - stream.seek(0) - stream.truncate() - stream.write(raw) - stream.flush() - stream.seek(0) + return open(f, 'rb').read() if __name__ == '__main__': f = '/tmp/t.pdf' diff --git a/src/calibre/utils/podofo/podofo.cpp b/src/calibre/utils/podofo/podofo.cpp index fd5cc6bc32..e81cf0b475 100644 --- a/src/calibre/utils/podofo/podofo.cpp +++ b/src/calibre/utils/podofo/podofo.cpp @@ -64,6 +64,24 @@ podofo_PDFDoc_load(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { return Py_None; } +static PyObject * +podofo_PDFDoc_open(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { + char *fname; + + if (PyArg_ParseTuple(args, "s", &fname)) { + try { + self->doc->Load(fname); + } catch(const PdfError & err) { + podofo_set_exception(err); + return NULL; + } +} else return NULL; + + + Py_INCREF(Py_None); + return Py_None; +} + static PyObject * podofo_PDFDoc_save(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { char *buffer; @@ -232,6 +250,9 @@ static PyMethodDef podofo_PDFDoc_methods[] = { {"load", (PyCFunction)podofo_PDFDoc_load, METH_VARARGS, "Load a PDF document from a byte buffer (string)" }, + {"open", (PyCFunction)podofo_PDFDoc_open, METH_VARARGS, + "Load a PDF document from a file path (string)" + }, {"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS, "Save the PDF document to a path on disk" },