From b854aecf1228e706311c7a9d5c2e72e2cb528827 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 30 Mar 2011 11:52:38 -0600 Subject: [PATCH] New PDF Engine: Add parameters to only convert a specified set of pages to XML. Useful when trying to extract an ISBN efficiently --- src/calibre/ebooks/pdf/input.py | 2 +- src/calibre/ebooks/pdf/main.cpp | 7 ++++--- src/calibre/ebooks/pdf/reflow.cpp | 11 +++++++---- src/calibre/ebooks/pdf/reflow.h | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 14b3552b04..8de3f44d36 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -34,7 +34,7 @@ class PDFInput(InputFormatPlugin): from calibre.ebooks.pdf.reflow import PDFDocument if pdfreflow_err: raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err) - pdfreflow.reflow(stream.read()) + pdfreflow.reflow(stream.read(), 1, -1) xml = open('index.xml', 'rb').read() PDFDocument(xml, self.opts, self.log) return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/pdf/main.cpp b/src/calibre/ebooks/pdf/main.cpp index 4e6ec60388..4ec1e2fe44 100644 --- a/src/calibre/ebooks/pdf/main.cpp +++ b/src/calibre/ebooks/pdf/main.cpp @@ -24,13 +24,14 @@ extern "C" { pdfreflow_reflow(PyObject *self, PyObject *args) { char *pdfdata; Py_ssize_t size; + int first_page, last_page; - if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size)) + if (!PyArg_ParseTuple(args, "s#ii", &pdfdata, &size, &first_page, &last_page)) return NULL; try { Reflow reflow(pdfdata, static_cast(size)); - reflow.render(); + reflow.render(first_page, last_page); } catch (std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; } catch (...) { @@ -166,7 +167,7 @@ extern "C" { static PyMethodDef pdfreflow_methods[] = { {"reflow", pdfreflow_reflow, METH_VARARGS, - "reflow(pdf_data)\n\n" + "reflow(pdf_data, first_page, last_page)\n\n" "Reflow the specified PDF." }, {"get_metadata", pdfreflow_get_metadata, METH_VARARGS, diff --git a/src/calibre/ebooks/pdf/reflow.cpp b/src/calibre/ebooks/pdf/reflow.cpp index e444c126ab..921f8f67cd 100644 --- a/src/calibre/ebooks/pdf/reflow.cpp +++ b/src/calibre/ebooks/pdf/reflow.cpp @@ -713,15 +713,17 @@ Reflow::Reflow(char *pdfdata, size_t sz) : } void -Reflow::render() { +Reflow::render(int first_page, int last_page) { if (!this->doc->okToCopy()) cout << "Warning, this document has the copy protection flag set, ignoring." << endl; globalParams->setTextEncoding(encoding); - int first_page = 1; - int last_page = doc->getNumPages(); + int doc_pages = doc->getNumPages(); + if (last_page < 1 or last_page > doc_pages) last_page = doc_pages; + if (first_page < 1) first_page = 1; + if (first_page > last_page) first_page = last_page; XMLOutputDev *xml_out = new XMLOutputDev(this->doc); doc->displayPages(xml_out, first_page, last_page, @@ -733,7 +735,8 @@ Reflow::render() { false //Printing ); - this->dump_outline(); + if (last_page - first_page == doc_pages - 1) + this->dump_outline(); delete xml_out; } diff --git a/src/calibre/ebooks/pdf/reflow.h b/src/calibre/ebooks/pdf/reflow.h index ad4b79929d..a4281d16ed 100644 --- a/src/calibre/ebooks/pdf/reflow.h +++ b/src/calibre/ebooks/pdf/reflow.h @@ -66,7 +66,7 @@ class Reflow { ~Reflow(); /* Convert the PDF to XML. All files are output to the current directory */ - void render(); + void render(int first_page, int last_page); /* Get the PDF Info Dictionary */ map get_info();