mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New PDF Engine: Add parameters to convert only a specified range of pages to XML. Useful when trying to extract an ISBN efficiently.
This commit is contained in:
parent
ec571a7a19
commit
b854aecf12
@ -34,7 +34,7 @@ class PDFInput(InputFormatPlugin):
|
||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||
if pdfreflow_err:
|
||||
raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err)
|
||||
pdfreflow.reflow(stream.read())
|
||||
pdfreflow.reflow(stream.read(), 1, -1)
|
||||
xml = open('index.xml', 'rb').read()
|
||||
PDFDocument(xml, self.opts, self.log)
|
||||
return os.path.join(os.getcwd(), 'metadata.opf')
|
||||
|
@ -24,13 +24,14 @@ extern "C" {
|
||||
pdfreflow_reflow(PyObject *self, PyObject *args) {
|
||||
char *pdfdata;
|
||||
Py_ssize_t size;
|
||||
int first_page, last_page;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size))
|
||||
if (!PyArg_ParseTuple(args, "s#ii", &pdfdata, &size, &first_page, &last_page))
|
||||
return NULL;
|
||||
|
||||
try {
|
||||
Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
|
||||
reflow.render();
|
||||
reflow.render(first_page, last_page);
|
||||
} catch (std::exception &e) {
|
||||
PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL;
|
||||
} catch (...) {
|
||||
@ -166,7 +167,7 @@ extern "C" {
|
||||
static
|
||||
PyMethodDef pdfreflow_methods[] = {
|
||||
{"reflow", pdfreflow_reflow, METH_VARARGS,
|
||||
"reflow(pdf_data)\n\n"
|
||||
"reflow(pdf_data, first_page, last_page)\n\n"
|
||||
"Reflow the specified PDF."
|
||||
},
|
||||
{"get_metadata", pdfreflow_get_metadata, METH_VARARGS,
|
||||
|
@ -713,15 +713,17 @@ Reflow::Reflow(char *pdfdata, size_t sz) :
|
||||
}
|
||||
|
||||
void
|
||||
Reflow::render() {
|
||||
Reflow::render(int first_page, int last_page) {
|
||||
|
||||
if (!this->doc->okToCopy())
|
||||
cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
|
||||
|
||||
globalParams->setTextEncoding(encoding);
|
||||
|
||||
int first_page = 1;
|
||||
int last_page = doc->getNumPages();
|
||||
int doc_pages = doc->getNumPages();
|
||||
if (last_page < 1 or last_page > doc_pages) last_page = doc_pages;
|
||||
if (first_page < 1) first_page = 1;
|
||||
if (first_page > last_page) first_page = last_page;
|
||||
|
||||
XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
|
||||
doc->displayPages(xml_out, first_page, last_page,
|
||||
@ -733,7 +735,8 @@ Reflow::render() {
|
||||
false //Printing
|
||||
);
|
||||
|
||||
this->dump_outline();
|
||||
if (last_page - first_page == doc_pages - 1)
|
||||
this->dump_outline();
|
||||
|
||||
delete xml_out;
|
||||
}
|
||||
|
@ -66,7 +66,7 @@ class Reflow {
|
||||
~Reflow();
|
||||
|
||||
/* Convert the PDF to XML. All files are output to the current directory */
|
||||
void render();
|
||||
void render(int first_page, int last_page);
|
||||
|
||||
/* Get the PDF Info Dictionary */
|
||||
map<string, string> get_info();
|
||||
|
Loading…
x
Reference in New Issue
Block a user