mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to pluginize
This commit is contained in:
commit
3432735e48
@ -679,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
||||
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
||||
opts.preprocess_html)
|
||||
oeb = OEBBook(log, html_preprocessor,
|
||||
pretty_print=opts.pretty_print, encoding=encoding)
|
||||
pretty_print=opts.pretty_print, input_encoding=encoding)
|
||||
# Read OEB Book into OEBBook
|
||||
log('Parsing all content...')
|
||||
if reader is None:
|
||||
|
@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
|
||||
from urllib import unquote
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre import unicode_path
|
||||
@ -264,7 +264,7 @@ class HTMLInput(InputFormatPlugin):
|
||||
|
||||
def convert(self, stream, opts, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.html import get_metadata_
|
||||
|
||||
basedir = os.getcwd()
|
||||
self.opts = opts
|
||||
@ -275,18 +275,16 @@ class HTMLInput(InputFormatPlugin):
|
||||
opfpath = stream.name
|
||||
else:
|
||||
filelist = get_filelist(stream.name, basedir, opts, log)
|
||||
mi = get_metadata(stream, 'html')
|
||||
mi = get_metadata_(stream.read(), opts.input_encoding)
|
||||
mi = OPFCreator(os.getcwdu(), mi)
|
||||
mi.guide = None
|
||||
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
|
||||
mi.create_manifest(entries)
|
||||
mi.create_spine([f.path for f in filelist])
|
||||
|
||||
mi.render(open('metadata.opf', 'wb'))
|
||||
mi.render(open('metadata.opf', 'wb'), encoding=opts.input_encoding)
|
||||
opfpath = os.path.abspath('metadata.opf')
|
||||
|
||||
opf = OPF(opfpath, os.getcwdu())
|
||||
|
||||
if opts.dont_package:
|
||||
return opfpath
|
||||
|
||||
|
@ -12,9 +12,18 @@ import re
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
|
||||
|
||||
def get_metadata(stream):
    """Read HTML metadata from an open file-like object.

    The contents of ``stream`` are read exactly once and handed to
    :func:`get_metadata_` unchanged. No decoding happens here:
    ``get_metadata_`` converts non-unicode input itself, so decoding in this
    function as well (or reading the stream a second time, which would
    yield empty data) must be avoided.

    :param stream: file-like object exposing ``read()``.
    :return: the metadata object built by :func:`get_metadata_`.
    """
    return get_metadata_(stream.read())
|
||||
|
||||
def get_metadata_(src, encoding=None):
|
||||
if not isinstance(src, unicode):
|
||||
if not encoding:
|
||||
src = xml_to_unicode(src)[0]
|
||||
else:
|
||||
src = src.decode(encoding, 'replace')
|
||||
|
||||
# Title
|
||||
title = None
|
||||
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||
@ -26,29 +35,29 @@ def get_metadata(stream):
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
title = match.group(1)
|
||||
|
||||
|
||||
# Author
|
||||
author = None
|
||||
pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
author = match.group(2).replace(',', ';')
|
||||
|
||||
|
||||
mi = MetaInformation(title, [author] if author else None)
|
||||
|
||||
|
||||
# Publisher
|
||||
pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
mi.publisher = match.group(2)
|
||||
|
||||
|
||||
# ISBN
|
||||
pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
isbn = match.group(1)
|
||||
mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
|
||||
|
||||
|
||||
return mi
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<package version="2.0"
|
||||
xmlns="http://www.idpf.org/2007/opf"
|
||||
xmlns:py="http://genshi.edgewall.org/"
|
||||
|
@ -924,9 +924,11 @@ class OPFCreator(MetaInformation):
|
||||
self.guide.set_basedir(self.base_path)
|
||||
|
||||
def render(self, opf_stream=sys.stdout, ncx_stream=None,
|
||||
ncx_manifest_entry=None):
|
||||
ncx_manifest_entry=None, encoding=None):
|
||||
from calibre.resources import opf_template
|
||||
from calibre.utils.genshi.template import MarkupTemplate
|
||||
if encoding is None:
|
||||
encoding = 'utf-8'
|
||||
template = MarkupTemplate(opf_template)
|
||||
toc = getattr(self, 'toc', None)
|
||||
if self.manifest:
|
||||
@ -948,7 +950,11 @@ class OPFCreator(MetaInformation):
|
||||
cover = os.path.abspath(os.path.join(self.base_path, cover))
|
||||
self.guide.set_cover(cover)
|
||||
self.guide.set_basedir(self.base_path)
|
||||
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
|
||||
opf = template.generate(
|
||||
__appname__=__appname__, mi=self,
|
||||
__version__=__version__).render('xml', encoding=encoding)
|
||||
opf_stream.write('<?xml version="1.0" encoding="%s" ?>\n'
|
||||
%encoding.upper())
|
||||
opf_stream.write(opf)
|
||||
opf_stream.flush()
|
||||
if toc is not None and ncx_stream is not None:
|
||||
|
@ -1516,7 +1516,8 @@ class OEBBook(object):
|
||||
def __init__(self, logger,
|
||||
html_preprocessor,
|
||||
css_preprocessor=CSSPreProcessor(),
|
||||
encoding='utf-8', pretty_print=False):
|
||||
encoding='utf-8', pretty_print=False,
|
||||
input_encoding='utf-8'):
|
||||
"""Create empty book. Arguments:
|
||||
|
||||
:param:`encoding`: Default encoding for textual content read
|
||||
@ -1549,6 +1550,7 @@ class OEBBook(object):
|
||||
"""
|
||||
_css_log_handler.log = logger
|
||||
self.encoding = encoding
|
||||
self.input_encoding = input_encoding
|
||||
self.html_preprocessor = html_preprocessor
|
||||
self.css_preprocessor = css_preprocessor
|
||||
self.pretty_print = pretty_print
|
||||
@ -1588,9 +1590,9 @@ class OEBBook(object):
|
||||
return fix_data(data.decode('utf-16'))
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
if self.encoding is not None:
|
||||
if self.input_encoding is not None:
|
||||
try:
|
||||
return fix_data(data.decode(self.encoding, 'replace'))
|
||||
return fix_data(data.decode(self.input_encoding, 'replace'))
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
try:
|
||||
|
@ -130,6 +130,9 @@ class EbookIterator(object):
|
||||
plumber.opts.dont_package = True
|
||||
if hasattr(plumber.opts, 'no_process'):
|
||||
plumber.opts.no_process = True
|
||||
if hasattr(plumber.input_plugin, '_preprocess_html_for_viewer'):
|
||||
plumber.input_plugin._preprocess_html_for_viewer = True
|
||||
|
||||
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
|
||||
plumber.opts, plumber.input_fmt, self.log,
|
||||
{}, self.base)
|
||||
|
@ -30,6 +30,7 @@ class OEBOutput(OutputFormatPlugin):
|
||||
raw = etree.tostring(root, pretty_print=True,
|
||||
encoding='utf-8')
|
||||
with open(href, 'wb') as f:
|
||||
f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
|
||||
f.write(raw)
|
||||
|
||||
for item in oeb_book.manifest:
|
||||
|
@ -17,10 +17,18 @@ class PDFInput(InputFormatPlugin):
|
||||
description = 'Convert PDF files to HTML'
|
||||
file_types = set(['pdf'])
|
||||
|
||||
_preprocess_html_for_viewer = False
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
html = pdftohtml(stream.name)
|
||||
|
||||
if self._preprocess_html_for_viewer:
|
||||
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
||||
prepro = HTMLPreProcessor(lambda x:x, False)
|
||||
html = prepro(html.decode('utf-8')).encode('utf-8')
|
||||
|
||||
|
||||
with open('index.html', 'wb') as index:
|
||||
index.write(html)
|
||||
|
||||
|
@ -30,6 +30,13 @@ PARALLEL_FUNCS = {
|
||||
'read_metadata' :
|
||||
('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),
|
||||
|
||||
'read_pdf_metadata' :
|
||||
('calibre.utils.podofo.__init__', 'get_metadata_', None),
|
||||
|
||||
'write_pdf_metadata' :
|
||||
('calibre.utils.podofo.__init__', 'set_metadata_', None),
|
||||
|
||||
|
||||
'save_book' :
|
||||
('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
|
||||
}
|
||||
|
@ -6,11 +6,14 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import os, time
|
||||
|
||||
from calibre.constants import plugins, preferred_encoding
|
||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
|
||||
authors_to_string
|
||||
from calibre.utils.ipc.job import ParallelJob
|
||||
from calibre.utils.ipc.server import Server
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
podofo, podofo_err = plugins['podofo']
|
||||
|
||||
@ -19,22 +22,43 @@ class Unavailable(Exception): pass
|
||||
def get_metadata(stream):
|
||||
if not podofo:
|
||||
raise Unavailable(podofo_err)
|
||||
raw = stream.read()
|
||||
stream.seek(0)
|
||||
p = podofo.PDFDoc()
|
||||
p.load(raw)
|
||||
title = p.title
|
||||
if not title:
|
||||
pt = PersistentTemporaryFile('_podofo.pdf')
|
||||
pt.write(stream.read())
|
||||
pt.close()
|
||||
server = Server(pool_size=1)
|
||||
job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
|
||||
lambda x,y:x, args=[pt.name])
|
||||
server.add_job(job)
|
||||
while not job.is_finished:
|
||||
time.sleep(0.1)
|
||||
job.update()
|
||||
|
||||
job.update()
|
||||
server.close()
|
||||
if job.result is None:
|
||||
raise ValueError('Failed to read metadata: ' + job.details)
|
||||
title, authors, creator = job.result
|
||||
if title == '_':
|
||||
title = getattr(stream, 'name', _('Unknown'))
|
||||
title = os.path.splitext(os.path.basename(title))[0]
|
||||
author = p.author
|
||||
authors = string_to_authors(author) if author else [_('Unknown')]
|
||||
title = os.path.splitext(title)[0]
|
||||
|
||||
mi = MetaInformation(title, authors)
|
||||
creator = p.creator
|
||||
if creator:
|
||||
mi.book_producer = creator
|
||||
if os.path.exists(pt.name): os.remove(pt.name)
|
||||
return mi
|
||||
|
||||
def get_metadata_(path):
    """Extract basic metadata from the PDF file at ``path`` using podofo.

    Returns a plain ``(title, authors, creator)`` tuple:

    * ``title`` is the document title, or the placeholder ``'_'`` when the
      document has none (callers test for this sentinel).
    * ``authors`` is ``string_to_authors`` applied to the author string, or
      ``[_('Unknown')]`` when the document has no author.
    * ``creator`` is the document's creator value, passed through as-is.
    """
    doc = podofo.PDFDoc()
    doc.open(path)

    # '_' marks "no title found".
    title = doc.title or '_'

    raw_author = doc.author
    if raw_author:
        authors = string_to_authors(raw_author)
    else:
        authors = [_('Unknown')]

    return (title, authors, doc.creator)
|
||||
|
||||
def prep(val):
|
||||
if not val:
|
||||
return u''
|
||||
@ -45,21 +69,43 @@ def prep(val):
|
||||
def set_metadata(stream, mi):
|
||||
if not podofo:
|
||||
raise Unavailable(podofo_err)
|
||||
raw = stream.read()
|
||||
pt = PersistentTemporaryFile('_podofo.pdf')
|
||||
pt.write(stream.read())
|
||||
pt.close()
|
||||
server = Server(pool_size=1)
|
||||
job = ParallelJob('write_pdf_metadata', 'Write pdf metadata',
|
||||
lambda x,y:x, args=[pt.name, mi.title, mi.authors, mi.book_producer])
|
||||
server.add_job(job)
|
||||
while not job.is_finished:
|
||||
time.sleep(0.1)
|
||||
job.update()
|
||||
|
||||
job.update()
|
||||
server.close()
|
||||
if job.result is not None:
|
||||
stream.seek(0)
|
||||
stream.truncate()
|
||||
stream.write(job.result)
|
||||
stream.flush()
|
||||
stream.seek(0)
|
||||
|
||||
|
||||
|
||||
def set_metadata_(path, title, authors, bkp):
|
||||
p = podofo.PDFDoc()
|
||||
p.load(raw)
|
||||
title = prep(mi.title)
|
||||
p.open(path)
|
||||
title = prep(title)
|
||||
touched = False
|
||||
if title:
|
||||
p.title = title
|
||||
touched = True
|
||||
|
||||
author = prep(authors_to_string(mi.authors))
|
||||
author = prep(authors_to_string(authors))
|
||||
if author:
|
||||
p.author = author
|
||||
touched = True
|
||||
|
||||
bkp = prep(mi.book_producer)
|
||||
bkp = prep(bkp)
|
||||
if bkp:
|
||||
p.creator = bkp
|
||||
touched = True
|
||||
@ -68,12 +114,7 @@ def set_metadata(stream, mi):
|
||||
from calibre.ptempfile import TemporaryFile
|
||||
with TemporaryFile('_pdf_set_metadata.pdf') as f:
|
||||
p.save(f)
|
||||
raw = open(f, 'rb').read()
|
||||
stream.seek(0)
|
||||
stream.truncate()
|
||||
stream.write(raw)
|
||||
stream.flush()
|
||||
stream.seek(0)
|
||||
return open(f, 'rb').read()
|
||||
|
||||
if __name__ == '__main__':
|
||||
f = '/tmp/t.pdf'
|
||||
|
@ -64,6 +64,24 @@ podofo_PDFDoc_load(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||
return Py_None;
|
||||
}
|
||||
|
||||
/*
 * PDFDoc.open(path): load a PDF document from a file path.
 *
 * Expects a single string argument. Returns None on success. Returns NULL
 * (with a Python exception set) if the argument cannot be parsed or if
 * PoDoFo raises a PdfError while loading the file.
 */
static PyObject *
podofo_PDFDoc_open(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    char *path;

    /* Argument parsing failure: PyArg_ParseTuple has already set the error. */
    if (!PyArg_ParseTuple(args, "s", &path))
        return NULL;

    try {
        self->doc->Load(path);
    } catch(const PdfError & err) {
        /* Translate the PoDoFo error into a Python exception. */
        podofo_set_exception(err);
        return NULL;
    }

    Py_RETURN_NONE;
}
|
||||
|
||||
static PyObject *
|
||||
podofo_PDFDoc_save(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||
char *buffer;
|
||||
@ -232,6 +250,9 @@ static PyMethodDef podofo_PDFDoc_methods[] = {
|
||||
{"load", (PyCFunction)podofo_PDFDoc_load, METH_VARARGS,
|
||||
"Load a PDF document from a byte buffer (string)"
|
||||
},
|
||||
{"open", (PyCFunction)podofo_PDFDoc_open, METH_VARARGS,
|
||||
"Load a PDF document from a file path (string)"
|
||||
},
|
||||
{"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS,
|
||||
"Save the PDF document to a path on disk"
|
||||
},
|
||||
|
Loading…
x
Reference in New Issue
Block a user