mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	More podofo refactoring
This commit is contained in:
		
							parent
							
								
									1136f26186
								
							
						
					
					
						commit
						238bc24cf2
					
				@ -148,7 +148,6 @@ extensions = [
 | 
			
		||||
                    libraries=['podofo'],
 | 
			
		||||
                    lib_dirs=[podofo_lib],
 | 
			
		||||
                    inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)],
 | 
			
		||||
                    optional=True,
 | 
			
		||||
                    error=podofo_error),
 | 
			
		||||
 | 
			
		||||
    Extension('pictureflow',
 | 
			
		||||
 | 
			
		||||
@ -43,12 +43,6 @@ PARALLEL_FUNCS = {
 | 
			
		||||
      'read_metadata' :
 | 
			
		||||
      ('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),
 | 
			
		||||
 | 
			
		||||
      'read_pdf_metadata' :
 | 
			
		||||
      ('calibre.utils.podofo.__init__', 'get_metadata_', None),
 | 
			
		||||
 | 
			
		||||
      'write_pdf_metadata' :
 | 
			
		||||
      ('calibre.utils.podofo.__init__', 'set_metadata_', None),
 | 
			
		||||
 | 
			
		||||
      'save_book' :
 | 
			
		||||
      ('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -6,109 +6,12 @@ __license__   = 'GPL v3'
 | 
			
		||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 | 
			
		||||
__docformat__ = 'restructuredtext en'
 | 
			
		||||
 | 
			
		||||
import os, time, shutil
 | 
			
		||||
import os, shutil
 | 
			
		||||
 | 
			
		||||
from calibre.constants import plugins, preferred_encoding
 | 
			
		||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
 | 
			
		||||
    authors_to_string
 | 
			
		||||
from calibre.utils.ipc.job import ParallelJob
 | 
			
		||||
from calibre.utils.ipc.server import Server
 | 
			
		||||
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
 | 
			
		||||
from calibre import prints
 | 
			
		||||
 | 
			
		||||
podofo, podofo_err = plugins['podofo']
 | 
			
		||||
 | 
			
		||||
class Unavailable(Exception): pass
 | 
			
		||||
 | 
			
		||||
def get_metadata(stream, cpath=None):
 | 
			
		||||
    if not podofo:
 | 
			
		||||
        raise Unavailable(podofo_err)
 | 
			
		||||
    pt = PersistentTemporaryFile('_podofo.pdf')
 | 
			
		||||
    pt.write(stream.read())
 | 
			
		||||
    pt.close()
 | 
			
		||||
    server = Server(pool_size=1)
 | 
			
		||||
    job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
 | 
			
		||||
        lambda x,y:x,  args=[pt.name, cpath])
 | 
			
		||||
    server.add_job(job)
 | 
			
		||||
    while not job.is_finished:
 | 
			
		||||
        time.sleep(0.1)
 | 
			
		||||
        job.update()
 | 
			
		||||
 | 
			
		||||
    job.update()
 | 
			
		||||
    server.close()
 | 
			
		||||
    if job.result is None:
 | 
			
		||||
        raise ValueError('Failed to read metadata: ' + job.details)
 | 
			
		||||
    title, authors, creator, tags, ok = job.result
 | 
			
		||||
    if not ok:
 | 
			
		||||
        print 'Failed to extract cover:'
 | 
			
		||||
        print job.details
 | 
			
		||||
    if title == '_':
 | 
			
		||||
        title = getattr(stream, 'name', _('Unknown'))
 | 
			
		||||
        title = os.path.splitext(title)[0]
 | 
			
		||||
 | 
			
		||||
    mi = MetaInformation(title, authors)
 | 
			
		||||
    if creator:
 | 
			
		||||
        mi.book_producer = creator
 | 
			
		||||
    if tags:
 | 
			
		||||
        mi.tags = tags
 | 
			
		||||
    if os.path.exists(pt.name): os.remove(pt.name)
 | 
			
		||||
    if ok:
 | 
			
		||||
        mi.cover = cpath
 | 
			
		||||
    return mi
 | 
			
		||||
 | 
			
		||||
def get_metadata_quick(raw):
 | 
			
		||||
    p = podofo.PDFDoc()
 | 
			
		||||
    p.load(raw)
 | 
			
		||||
    title = p.title
 | 
			
		||||
    if not title:
 | 
			
		||||
        title = '_'
 | 
			
		||||
    author = p.author
 | 
			
		||||
    authors = string_to_authors(author) if author else  [_('Unknown')]
 | 
			
		||||
    creator = p.creator
 | 
			
		||||
    try:
 | 
			
		||||
        tags = [x.strip() for x in p.keywords.split(u',')]
 | 
			
		||||
        tags = [x for x in tags if x]
 | 
			
		||||
    except:
 | 
			
		||||
        tags = []
 | 
			
		||||
 | 
			
		||||
    mi = MetaInformation(title, authors)
 | 
			
		||||
    if creator:
 | 
			
		||||
        mi.book_producer = creator
 | 
			
		||||
    if tags:
 | 
			
		||||
        mi.tags = tags
 | 
			
		||||
    return mi
 | 
			
		||||
 | 
			
		||||
def get_metadata_(path, cpath=None):
 | 
			
		||||
    p = podofo.PDFDoc()
 | 
			
		||||
    p.open(path)
 | 
			
		||||
    title = p.title
 | 
			
		||||
    if not title:
 | 
			
		||||
        title = '_'
 | 
			
		||||
    author = p.author
 | 
			
		||||
    authors = string_to_authors(author) if author else  [_('Unknown')]
 | 
			
		||||
    creator = p.creator
 | 
			
		||||
    try:
 | 
			
		||||
        tags = [x.strip() for x in p.keywords.split(u',')]
 | 
			
		||||
        tags = [x for x in tags if x]
 | 
			
		||||
    except:
 | 
			
		||||
        tags = []
 | 
			
		||||
    ok = True
 | 
			
		||||
    try:
 | 
			
		||||
        if cpath is not None:
 | 
			
		||||
            pages = p.pages
 | 
			
		||||
            if pages < 1:
 | 
			
		||||
                raise ValueError('PDF has no pages')
 | 
			
		||||
            if True or pages == 1:
 | 
			
		||||
                shutil.copyfile(path, cpath)
 | 
			
		||||
            else:
 | 
			
		||||
                p.extract_first_page()
 | 
			
		||||
                p.save(cpath)
 | 
			
		||||
    except:
 | 
			
		||||
        import traceback
 | 
			
		||||
        traceback.print_exc()
 | 
			
		||||
        ok = False
 | 
			
		||||
 | 
			
		||||
    return (title, authors, creator, tags, ok)
 | 
			
		||||
from calibre.ebooks.metadata import authors_to_string
 | 
			
		||||
from calibre.ptempfile import TemporaryDirectory
 | 
			
		||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
 | 
			
		||||
 | 
			
		||||
def prep(val):
 | 
			
		||||
    if not val:
 | 
			
		||||
@ -118,27 +21,16 @@ def prep(val):
 | 
			
		||||
    return val.strip()
 | 
			
		||||
 | 
			
		||||
def set_metadata(stream, mi):
 | 
			
		||||
    if not podofo:
 | 
			
		||||
        raise Unavailable(podofo_err)
 | 
			
		||||
    with TemporaryFile('_podofo_read.pdf') as inputf, \
 | 
			
		||||
            TemporaryFile('_podofo_write.pdf') as outputf:
 | 
			
		||||
        server = Server(pool_size=1)
 | 
			
		||||
        with open(inputf, 'wb') as f:
 | 
			
		||||
    with TemporaryDirectory(u'_podofo_set_metadata') as tdir:
 | 
			
		||||
        with open(os.path.join(tdir, u'input.pdf'), 'wb') as f:
 | 
			
		||||
            shutil.copyfileobj(stream, f)
 | 
			
		||||
        job = ParallelJob('write_pdf_metadata', 'Write pdf metadata',
 | 
			
		||||
            lambda x,y:x,  args=[inputf, outputf, mi.title, mi.authors,
 | 
			
		||||
                mi.book_producer, mi.tags])
 | 
			
		||||
        server.add_job(job)
 | 
			
		||||
        while not job.is_finished:
 | 
			
		||||
            time.sleep(0.1)
 | 
			
		||||
            job.update()
 | 
			
		||||
 | 
			
		||||
        job.update()
 | 
			
		||||
        server.close()
 | 
			
		||||
        if job.failed:
 | 
			
		||||
            prints(job.details)
 | 
			
		||||
        elif job.result:
 | 
			
		||||
            with open(outputf, 'rb') as f:
 | 
			
		||||
        try:
 | 
			
		||||
            touched = fork_job('calibre.utils.podofo', 'set_metadata_', (tdir,
 | 
			
		||||
                mi.title, mi.authors, mi.book_producer, mi.tags))
 | 
			
		||||
        except WorkerError as e:
 | 
			
		||||
            raise Exception('Failed to set PDF metadata: %s'%e.orig_tb)
 | 
			
		||||
        if touched:
 | 
			
		||||
            with open(os.path.join(tdir, u'output.pdf'), 'rb') as f:
 | 
			
		||||
                f.seek(0, 2)
 | 
			
		||||
                if f.tell() > 100:
 | 
			
		||||
                    f.seek(0)
 | 
			
		||||
@ -148,10 +40,14 @@ def set_metadata(stream, mi):
 | 
			
		||||
                    stream.flush()
 | 
			
		||||
    stream.seek(0)
 | 
			
		||||
 | 
			
		||||
def set_metadata_(tdir, title, authors, bkp, tags):
 | 
			
		||||
    podofo, podofo_err = plugins['podofo']
 | 
			
		||||
    if podofo is None:
 | 
			
		||||
        raise RuntimeError('Failed to load podofo: %s'%podofo_err)
 | 
			
		||||
 | 
			
		||||
def set_metadata_(path, opath, title, authors, bkp, tags):
 | 
			
		||||
    os.chdir(tdir)
 | 
			
		||||
    p = podofo.PDFDoc()
 | 
			
		||||
    p.open(path)
 | 
			
		||||
    p.open(u'input.pdf')
 | 
			
		||||
    title = prep(title)
 | 
			
		||||
    touched = False
 | 
			
		||||
    if title and title != p.title:
 | 
			
		||||
@ -177,27 +73,32 @@ def set_metadata_(path, opath, title, authors, bkp, tags):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    if touched:
 | 
			
		||||
        p.save(opath)
 | 
			
		||||
        return True
 | 
			
		||||
    return False
 | 
			
		||||
        p.save(u'output.pdf')
 | 
			
		||||
 | 
			
		||||
    return touched
 | 
			
		||||
 | 
			
		||||
def delete_all_but(path, pages):
 | 
			
		||||
    ''' Delete all the pages in the pdf except for the specified ones. Negative
 | 
			
		||||
    numbers are counted from the end of the PDF. '''
 | 
			
		||||
    with TemporaryFile('_podofo_in.pdf') as of:
 | 
			
		||||
        shutil.copyfile(path, of)
 | 
			
		||||
    podofo, podofo_err = plugins['podofo']
 | 
			
		||||
    if podofo is None:
 | 
			
		||||
        raise RuntimeError('Failed to load podofo: %s'%podofo_err)
 | 
			
		||||
 | 
			
		||||
    p = podofo.PDFDoc()
 | 
			
		||||
        p.open(of)
 | 
			
		||||
    with open(path, 'rb') as f:
 | 
			
		||||
        raw = f.read()
 | 
			
		||||
    p.load(raw)
 | 
			
		||||
    total = p.page_count()
 | 
			
		||||
    pages = { total + x if x < 0 else x for x in pages }
 | 
			
		||||
    for page in xrange(total-1, -1, -1):
 | 
			
		||||
        if page not in pages:
 | 
			
		||||
            p.delete_page(page)
 | 
			
		||||
        os.remove(path)
 | 
			
		||||
        p.save(path)
 | 
			
		||||
 | 
			
		||||
    raw = p.write()
 | 
			
		||||
    with open(path, 'wb') as f:
 | 
			
		||||
        f.write(raw)
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    f = '/tmp/t.pdf'
 | 
			
		||||
    f = u'/tmp/t.pdf'
 | 
			
		||||
    delete_all_but(f, [0, 1, -2, -1])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -77,11 +77,72 @@ PDFDoc_save(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
        }
 | 
			
		||||
    } else return NULL;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    Py_INCREF(Py_None);
 | 
			
		||||
    return Py_None;
 | 
			
		||||
    Py_RETURN_NONE;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_write(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    PyObject *ans;
 | 
			
		||||
    PdfRefCountedBuffer buffer(1*1024*1024);
 | 
			
		||||
    PdfOutputDevice out(&buffer);
 | 
			
		||||
    
 | 
			
		||||
    try {
 | 
			
		||||
        self->doc->Write(&out);
 | 
			
		||||
    } catch(const PdfError &err) {
 | 
			
		||||
        podofo_set_exception(err);
 | 
			
		||||
        return NULL;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    ans = PyBytes_FromStringAndSize(buffer.GetBuffer(), out.Tell());
 | 
			
		||||
    if (ans == NULL) PyErr_NoMemory();
 | 
			
		||||
    return ans;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    try {
 | 
			
		||||
        while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
 | 
			
		||||
    } catch(const PdfError & err) {
 | 
			
		||||
        podofo_set_exception(err);
 | 
			
		||||
        return NULL;
 | 
			
		||||
    }
 | 
			
		||||
    Py_RETURN_NONE;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    int count;
 | 
			
		||||
    try {
 | 
			
		||||
        count = self->doc->GetPageCount();
 | 
			
		||||
    } catch(const PdfError & err) {
 | 
			
		||||
        podofo_set_exception(err);
 | 
			
		||||
        return NULL;
 | 
			
		||||
    }
 | 
			
		||||
    return Py_BuildValue("i", count);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    int num = 0;
 | 
			
		||||
    if (PyArg_ParseTuple(args, "i", &num)) {
 | 
			
		||||
        try {
 | 
			
		||||
            self->doc->DeletePages(num, 1);
 | 
			
		||||
        } catch(const PdfError & err) {
 | 
			
		||||
            podofo_set_exception(err);
 | 
			
		||||
            return NULL;
 | 
			
		||||
        }
 | 
			
		||||
    } else return NULL;
 | 
			
		||||
 | 
			
		||||
    Py_RETURN_NONE;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_append(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    Py_RETURN_NONE;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Properties {{{
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_pages_getter(PDFDoc *self, void *closure) {
 | 
			
		||||
    int pages = self->doc->GetPageCount();
 | 
			
		||||
@ -123,46 +184,6 @@ PDFDoc_version_getter(PDFDoc *self, void *closure) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    try {
 | 
			
		||||
        while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
 | 
			
		||||
    } catch(const PdfError & err) {
 | 
			
		||||
        podofo_set_exception(err);
 | 
			
		||||
        return NULL;
 | 
			
		||||
    }
 | 
			
		||||
    Py_RETURN_NONE;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    int count;
 | 
			
		||||
    try {
 | 
			
		||||
        count = self->doc->GetPageCount();
 | 
			
		||||
    } catch(const PdfError & err) {
 | 
			
		||||
        podofo_set_exception(err);
 | 
			
		||||
        return NULL;
 | 
			
		||||
    }
 | 
			
		||||
    return Py_BuildValue("i", count);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
 | 
			
		||||
    int num = 0;
 | 
			
		||||
    if (PyArg_ParseTuple(args, "i", &num)) {
 | 
			
		||||
        try {
 | 
			
		||||
            self->doc->DeletePages(num, 1);
 | 
			
		||||
        } catch(const PdfError & err) {
 | 
			
		||||
            podofo_set_exception(err);
 | 
			
		||||
            return NULL;
 | 
			
		||||
        }
 | 
			
		||||
    } else return NULL;
 | 
			
		||||
 | 
			
		||||
    Py_RETURN_NONE;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
static PyObject *
 | 
			
		||||
PDFDoc_getter(PDFDoc *self, int field)
 | 
			
		||||
{
 | 
			
		||||
@ -288,31 +309,6 @@ PDFDoc_producer_setter(PDFDoc *self, PyObject *val, void *closure) {
 | 
			
		||||
    return  PDFDoc_setter(self, val, 5);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
static PyMethodDef PDFDoc_methods[] = {
 | 
			
		||||
    {"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
 | 
			
		||||
     "Load a PDF document from a byte buffer (string)"
 | 
			
		||||
    },
 | 
			
		||||
    {"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
 | 
			
		||||
     "Load a PDF document from a file path (string)"
 | 
			
		||||
    },
 | 
			
		||||
    {"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
 | 
			
		||||
     "Save the PDF document to a path on disk"
 | 
			
		||||
    },
 | 
			
		||||
    {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
 | 
			
		||||
     "extract_first_page() -> Remove all but the first page."
 | 
			
		||||
    },
 | 
			
		||||
    {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
 | 
			
		||||
     "page_count() -> Number of pages in the PDF."
 | 
			
		||||
    },
 | 
			
		||||
    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
 | 
			
		||||
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    {NULL}  /* Sentinel */
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static PyGetSetDef PDFDoc_getsetters[] = {
 | 
			
		||||
    {(char *)"title", 
 | 
			
		||||
     (getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter,
 | 
			
		||||
@ -350,6 +346,39 @@ static PyGetSetDef PDFDoc_getsetters[] = {
 | 
			
		||||
    {NULL}  /* Sentinel */
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// }}}
 | 
			
		||||
 | 
			
		||||
static PyMethodDef PDFDoc_methods[] = {
 | 
			
		||||
    {"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
 | 
			
		||||
     "Load a PDF document from a byte buffer (string)"
 | 
			
		||||
    },
 | 
			
		||||
    {"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
 | 
			
		||||
     "Load a PDF document from a file path (string)"
 | 
			
		||||
    },
 | 
			
		||||
    {"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
 | 
			
		||||
     "Save the PDF document to a path on disk"
 | 
			
		||||
    },
 | 
			
		||||
    {"write", (PyCFunction)PDFDoc_write, METH_VARARGS,
 | 
			
		||||
     "Return the PDF document as a bytestring."
 | 
			
		||||
    },
 | 
			
		||||
    {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
 | 
			
		||||
     "extract_first_page() -> Remove all but the first page."
 | 
			
		||||
    },
 | 
			
		||||
    {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
 | 
			
		||||
     "page_count() -> Number of pages in the PDF."
 | 
			
		||||
    },
 | 
			
		||||
    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
 | 
			
		||||
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
 | 
			
		||||
    },
 | 
			
		||||
    {"append", (PyCFunction)PDFDoc_append, METH_VARARGS,
 | 
			
		||||
     "append(doc) -> Append doc (which must be a PDFDoc) to this document."
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    {NULL}  /* Sentinel */
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
PyTypeObject pdf::PDFDocType = {
 | 
			
		||||
    PyObject_HEAD_INIT(NULL)
 | 
			
		||||
    0,                         /*ob_size*/
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user