New PDF Engine: Add first_page/last_page parameters to convert only a specified range of pages to XML. Useful when trying to extract an ISBN efficiently.

This commit is contained in:
Kovid Goyal 2011-03-30 11:52:38 -06:00
parent ec571a7a19
commit b854aecf12
4 changed files with 13 additions and 9 deletions

View File

@ -34,7 +34,7 @@ class PDFInput(InputFormatPlugin):
from calibre.ebooks.pdf.reflow import PDFDocument
if pdfreflow_err:
raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err)
pdfreflow.reflow(stream.read())
pdfreflow.reflow(stream.read(), 1, -1)
xml = open('index.xml', 'rb').read()
PDFDocument(xml, self.opts, self.log)
return os.path.join(os.getcwd(), 'metadata.opf')

View File

@ -24,13 +24,14 @@ extern "C" {
pdfreflow_reflow(PyObject *self, PyObject *args) {
char *pdfdata;
Py_ssize_t size;
int first_page, last_page;
if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size))
if (!PyArg_ParseTuple(args, "s#ii", &pdfdata, &size, &first_page, &last_page))
return NULL;
try {
Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
reflow.render();
reflow.render(first_page, last_page);
} catch (std::exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL;
} catch (...) {
@ -166,7 +167,7 @@ extern "C" {
static
PyMethodDef pdfreflow_methods[] = {
{"reflow", pdfreflow_reflow, METH_VARARGS,
"reflow(pdf_data)\n\n"
"reflow(pdf_data, first_page, last_page)\n\n"
"Reflow the specified PDF."
},
{"get_metadata", pdfreflow_get_metadata, METH_VARARGS,

View File

@ -713,15 +713,17 @@ Reflow::Reflow(char *pdfdata, size_t sz) :
}
void
Reflow::render() {
Reflow::render(int first_page, int last_page) {
if (!this->doc->okToCopy())
cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
globalParams->setTextEncoding(encoding);
int first_page = 1;
int last_page = doc->getNumPages();
int doc_pages = doc->getNumPages();
if (last_page < 1 or last_page > doc_pages) last_page = doc_pages;
if (first_page < 1) first_page = 1;
if (first_page > last_page) first_page = last_page;
XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
doc->displayPages(xml_out, first_page, last_page,
@ -733,7 +735,8 @@ Reflow::render() {
false //Printing
);
this->dump_outline();
if (last_page - first_page == doc_pages - 1)
this->dump_outline();
delete xml_out;
}

View File

@ -66,7 +66,7 @@ class Reflow {
~Reflow();
/* Convert the PDF to XML. All files are output to the current directory */
void render();
void render(int first_page, int last_page);
/* Get the PDF Info Dictionary */
map<string, string> get_info();