mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New PDF Engine: Add parameters to convert only a specified set of pages to XML. Useful when trying to extract an ISBN efficiently.
This commit is contained in:
parent
ec571a7a19
commit
b854aecf12
@ -34,7 +34,7 @@ class PDFInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||||
if pdfreflow_err:
|
if pdfreflow_err:
|
||||||
raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err)
|
raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err)
|
||||||
pdfreflow.reflow(stream.read())
|
pdfreflow.reflow(stream.read(), 1, -1)
|
||||||
xml = open('index.xml', 'rb').read()
|
xml = open('index.xml', 'rb').read()
|
||||||
PDFDocument(xml, self.opts, self.log)
|
PDFDocument(xml, self.opts, self.log)
|
||||||
return os.path.join(os.getcwd(), 'metadata.opf')
|
return os.path.join(os.getcwd(), 'metadata.opf')
|
||||||
|
@ -24,13 +24,14 @@ extern "C" {
|
|||||||
pdfreflow_reflow(PyObject *self, PyObject *args) {
|
pdfreflow_reflow(PyObject *self, PyObject *args) {
|
||||||
char *pdfdata;
|
char *pdfdata;
|
||||||
Py_ssize_t size;
|
Py_ssize_t size;
|
||||||
|
int first_page, last_page;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size))
|
if (!PyArg_ParseTuple(args, "s#ii", &pdfdata, &size, &first_page, &last_page))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
|
Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
|
||||||
reflow.render();
|
reflow.render(first_page, last_page);
|
||||||
} catch (std::exception &e) {
|
} catch (std::exception &e) {
|
||||||
PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL;
|
PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
@ -166,7 +167,7 @@ extern "C" {
|
|||||||
static
|
static
|
||||||
PyMethodDef pdfreflow_methods[] = {
|
PyMethodDef pdfreflow_methods[] = {
|
||||||
{"reflow", pdfreflow_reflow, METH_VARARGS,
|
{"reflow", pdfreflow_reflow, METH_VARARGS,
|
||||||
"reflow(pdf_data)\n\n"
|
"reflow(pdf_data, first_page, last_page)\n\n"
|
||||||
"Reflow the specified PDF."
|
"Reflow the specified PDF."
|
||||||
},
|
},
|
||||||
{"get_metadata", pdfreflow_get_metadata, METH_VARARGS,
|
{"get_metadata", pdfreflow_get_metadata, METH_VARARGS,
|
||||||
|
@ -713,15 +713,17 @@ Reflow::Reflow(char *pdfdata, size_t sz) :
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
Reflow::render() {
|
Reflow::render(int first_page, int last_page) {
|
||||||
|
|
||||||
if (!this->doc->okToCopy())
|
if (!this->doc->okToCopy())
|
||||||
cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
|
cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
|
||||||
|
|
||||||
globalParams->setTextEncoding(encoding);
|
globalParams->setTextEncoding(encoding);
|
||||||
|
|
||||||
int first_page = 1;
|
int doc_pages = doc->getNumPages();
|
||||||
int last_page = doc->getNumPages();
|
if (last_page < 1 or last_page > doc_pages) last_page = doc_pages;
|
||||||
|
if (first_page < 1) first_page = 1;
|
||||||
|
if (first_page > last_page) first_page = last_page;
|
||||||
|
|
||||||
XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
|
XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
|
||||||
doc->displayPages(xml_out, first_page, last_page,
|
doc->displayPages(xml_out, first_page, last_page,
|
||||||
@ -733,7 +735,8 @@ Reflow::render() {
|
|||||||
false //Printing
|
false //Printing
|
||||||
);
|
);
|
||||||
|
|
||||||
this->dump_outline();
|
if (last_page - first_page == doc_pages - 1)
|
||||||
|
this->dump_outline();
|
||||||
|
|
||||||
delete xml_out;
|
delete xml_out;
|
||||||
}
|
}
|
||||||
|
@ -66,7 +66,7 @@ class Reflow {
|
|||||||
~Reflow();
|
~Reflow();
|
||||||
|
|
||||||
/* Convert the PDF to XML. All files are output to the current directory */
|
/* Convert the PDF to XML. All files are output to the current directory */
|
||||||
void render();
|
void render(int first_page, int last_page);
|
||||||
|
|
||||||
/* Get the PDF Info Dictionary */
|
/* Get the PDF Info Dictionary */
|
||||||
map<string, string> get_info();
|
map<string, string> get_info();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user