PDF Input: Support image rotation commands in PDF files. Fixes the long-standing problem of some images being flipped when converting from PDF in calibre.

This commit is contained in:
Kovid Goyal 2012-05-19 13:55:17 +05:30
parent aca782e169
commit 48ce2b54f8
7 changed files with 58 additions and 8 deletions

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, socket, struct, subprocess
import os, socket, struct, subprocess, glob
from distutils.spawn import find_executable
from PyQt4 import pyqtconfig
@ -128,8 +128,9 @@ if iswindows:
elif isosx:
fc_inc = '/sw/include/fontconfig'
fc_lib = '/sw/lib'
poppler = glob.glob('/sw/build/poppler-*')[-1]
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
'{0}/poppler:{0}'.format(poppler))
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib')
poppler_libs = ['poppler']

View File

@ -32,7 +32,7 @@ binary_includes = [
'/lib/libz.so.1',
'/usr/lib/libtiff.so.5',
'/lib/libbz2.so.1',
'/usr/lib/libpoppler.so.7',
'/usr/lib/libpoppler.so.25',
'/usr/lib/libxml2.so.2',
'/usr/lib/libopenjpeg.so.2',
'/usr/lib/libxslt.so.1',

View File

@ -385,7 +385,7 @@ class Py2App(object):
@flush
def add_poppler(self):
info('\nAdding poppler')
for x in ('libpoppler.7.dylib',):
for x in ('libpoppler.25.dylib',):
self.install_dylib(os.path.join(SW, 'lib', x))
self.install_dylib(os.path.join(SW, 'bin', 'pdftohtml'), False)

View File

@ -295,7 +295,7 @@ NOTE: poppler must be built as a static library, unless you build the qt4 bindin
Now do the same for the pdftohtml project
cp poppler/*.h ~/sw/include/poppler && cp goo/*.h ~/sw/include/poppler/goo && cp splash/*.h ~/sw/include/poppler/splash && cp build/Release/poppler.lib ../../lib/ && cp build/utils/Release/*.exe ../../bin/
cp poppler/*.h ~/sw/include/poppler && cp goo/*.h ~/sw/include/poppler/goo && cp splash/*.h ~/sw/include/poppler/splash && cp build/Release/poppler.lib ../../lib/ && cp build/utils/Release/pdftohtml.exe ../../bin/
podofo

View File

@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
'2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import errno, os, sys, subprocess, shutil
import errno, os, sys, subprocess, shutil, re
from functools import partial
from calibre.ebooks import ConversionError, DRMError
@ -96,8 +96,36 @@ def pdftohtml(output_dir, pdf_path, no_images):
with open(index, 'r+b') as i:
raw = i.read()
raw = flip_images(raw)
raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
i.seek(0)
i.truncate()
i.write(raw)
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
# breaks the pdf heuristics regexps, so replace them
i.write(raw.replace(b'<br/>', b'<br>'))
def flip_image(img, flip):
    """Flip the image file at ``img`` in place.

    :param img: Path of the image file; it is opened, flipped and then
                saved back to the same path.
    :param flip: Bytestring naming the axes to flip. If it contains
                 ``b'x'`` the image gets ``Image.flip(True)``; if it
                 contains ``b'y'`` it gets ``Image.flip()``. Both are
                 applied, in that order, when both letters are present.
    """
    from calibre.utils.magick import Image

    image = Image()
    image.open(img)
    # (axis letter, positional args for Image.flip) -- x is handled
    # before y, matching the order of the individual checks.
    for axis, flip_args in ((b'x', (True,)), (b'y', ())):
        if axis in flip:
            image.flip(*flip_args)
    image.save(img)
def flip_images(raw):
    """Apply the flips requested by pdftohtml to the image files on disk.

    Every ``<IMG>`` tag in ``raw`` that carries a ``class="xflip"``,
    ``class="yflip"`` or ``class="xyflip"`` attribute and whose ``src``
    points to an existing file has that file flipped in place via
    :func:`flip_image`. Afterwards all ``<STYLE>...</STYLE>`` blocks are
    removed from the markup.

    :param raw: The HTML as a bytestring.
    :return: ``raw`` with all ``<STYLE>`` blocks (and the whitespace that
             follows them) stripped.
    """
    # Match the tag name case-insensitively, exactly like the <STYLE>
    # removal below: otherwise a lower-case <img> would lose the
    # stylesheet without the image file being flipped.
    for match in re.finditer(br'<IMG[^>]+/?>', raw, flags=re.I):
        tag = match.group()
        flip_match = re.search(br'class="(x|y|xy)flip"', tag)
        if flip_match is None:
            continue
        src_match = re.search(br'src="([^"]+)"', tag)
        if src_match is None:
            continue
        path = src_match.group(1)
        # The src is used as-is, so a relative path is resolved against
        # the current working directory.
        if not os.path.exists(path):
            continue
        flip = flip_match.group(1)
        print('Flipping image %s: %s' % (path, flip))
        flip_image(path, flip)
    return re.sub(br'<STYLE.+?</STYLE>\s*', b'', raw, flags=re.I | re.DOTALL)

View File

@ -669,7 +669,6 @@ Some limitations of PDF input are:
* Complex, multi-column, and image based documents are not supported.
* Extraction of vector images and tables from within the document is also not supported.
* Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
* Some PDFs store their images upside down with a rotation instruction, |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
* Links and Tables of Contents are not supported
* PDFs that use embedded non-unicode fonts to represent non-English characters will result in garbled output for those characters
* Some PDFs are made up of photographs of the page with OCRed text behind them. In such cases |app| uses the OCRed text, which can be very different from what you see when you view the PDF file

View File

@ -909,6 +909,24 @@ magick_Image_rotate(magick_Image *self, PyObject *args, PyObject *kwargs) {
}
// }}}
// Image.flip {{{

/*
 * Image.flip([horizontal])
 *
 * Mirror the image held in self->wand. With no argument, or a falsy one,
 * MagickFlipImage() is used; with a truthy argument MagickFlopImage() is
 * used instead.
 *
 * Returns None on success. Returns NULL with ValueError set when the
 * ImageMagick call reports failure, or with whatever exception the
 * truth-test of the argument raised.
 */
static PyObject *
magick_Image_flip(magick_Image *self, PyObject *args, PyObject *kwargs) {
    PyObject *obj = NULL;
    MagickBooleanType ret = 0;
    int horizontal = 0;

    /* Macro defined elsewhere in this file; presumably bails out with the
     * given value when the wand is not usable. */
    NULL_CHECK(NULL)

    if (!PyArg_ParseTuple(args, "|O", &obj)) return NULL;

    if (obj != NULL) {
        /* PyObject_IsTrue() returns -1 with an exception set on failure;
         * propagate it rather than treating -1 as "true". */
        horizontal = PyObject_IsTrue(obj);
        if (horizontal < 0) return NULL;
    }

    ret = horizontal ? MagickFlopImage(self->wand) : MagickFlipImage(self->wand);
    if (!ret) { PyErr_SetString(PyExc_ValueError, "Failed to flip image"); return NULL; }

    Py_RETURN_NONE;
}

// }}}
// Image.set_page {{{
static PyObject *
@ -1174,6 +1192,10 @@ static PyMethodDef magick_Image_methods[] = {
{"rotate", (PyCFunction)magick_Image_rotate, METH_VARARGS,
"rotate(background_pixel_wand, degrees) \n\n Rotate image by specified degrees."
},
{"flip", (PyCFunction)magick_Image_flip, METH_VARARGS,
"flip(horizontal=False) \n\n Flip image about a vertical axis. If horizontal is True, flip about horizontal axis instead."
},
{"normalize", (PyCFunction)magick_Image_normalize, METH_VARARGS,
"normalize() \n\n enhances the contrast of a color image by adjusting the pixels color to span the entire range of colors available."