New PDF Engine: Add parameters to convert only a specified range of pages (first page to last page) to XML. Useful when trying to extract an ISBN efficiently.

This commit is contained in:
Kovid Goyal 2011-03-30 11:52:38 -06:00
parent ec571a7a19
commit b854aecf12
4 changed files with 13 additions and 9 deletions

View File

@ -34,7 +34,7 @@ class PDFInput(InputFormatPlugin):
from calibre.ebooks.pdf.reflow import PDFDocument from calibre.ebooks.pdf.reflow import PDFDocument
if pdfreflow_err: if pdfreflow_err:
raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err) raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err)
pdfreflow.reflow(stream.read()) pdfreflow.reflow(stream.read(), 1, -1)
xml = open('index.xml', 'rb').read() xml = open('index.xml', 'rb').read()
PDFDocument(xml, self.opts, self.log) PDFDocument(xml, self.opts, self.log)
return os.path.join(os.getcwd(), 'metadata.opf') return os.path.join(os.getcwd(), 'metadata.opf')

View File

@ -24,13 +24,14 @@ extern "C" {
pdfreflow_reflow(PyObject *self, PyObject *args) { pdfreflow_reflow(PyObject *self, PyObject *args) {
char *pdfdata; char *pdfdata;
Py_ssize_t size; Py_ssize_t size;
int first_page, last_page;
if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size)) if (!PyArg_ParseTuple(args, "s#ii", &pdfdata, &size, &first_page, &last_page))
return NULL; return NULL;
try { try {
Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size)); Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
reflow.render(); reflow.render(first_page, last_page);
} catch (std::exception &e) { } catch (std::exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL;
} catch (...) { } catch (...) {
@ -166,7 +167,7 @@ extern "C" {
static static
PyMethodDef pdfreflow_methods[] = { PyMethodDef pdfreflow_methods[] = {
{"reflow", pdfreflow_reflow, METH_VARARGS, {"reflow", pdfreflow_reflow, METH_VARARGS,
"reflow(pdf_data)\n\n" "reflow(pdf_data, first_page, last_page)\n\n"
"Reflow the specified PDF." "Reflow the specified PDF."
}, },
{"get_metadata", pdfreflow_get_metadata, METH_VARARGS, {"get_metadata", pdfreflow_get_metadata, METH_VARARGS,

View File

@ -713,15 +713,17 @@ Reflow::Reflow(char *pdfdata, size_t sz) :
} }
void void
Reflow::render() { Reflow::render(int first_page, int last_page) {
if (!this->doc->okToCopy()) if (!this->doc->okToCopy())
cout << "Warning, this document has the copy protection flag set, ignoring." << endl; cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
globalParams->setTextEncoding(encoding); globalParams->setTextEncoding(encoding);
int first_page = 1; int doc_pages = doc->getNumPages();
int last_page = doc->getNumPages(); if (last_page < 1 or last_page > doc_pages) last_page = doc_pages;
if (first_page < 1) first_page = 1;
if (first_page > last_page) first_page = last_page;
XMLOutputDev *xml_out = new XMLOutputDev(this->doc); XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
doc->displayPages(xml_out, first_page, last_page, doc->displayPages(xml_out, first_page, last_page,
@ -733,7 +735,8 @@ Reflow::render() {
false //Printing false //Printing
); );
this->dump_outline(); if (last_page - first_page == doc_pages - 1)
this->dump_outline();
delete xml_out; delete xml_out;
} }

View File

@ -66,7 +66,7 @@ class Reflow {
~Reflow(); ~Reflow();
/* Convert the PDF to XML. All files are output to the current directory */ /* Convert the PDF to XML. All files are output to the current directory */
void render(); void render(int first_page, int last_page);
/* Get the PDF Info Dictionary */ /* Get the PDF Info Dictionary */
map<string, string> get_info(); map<string, string> get_info();