PDF Input: Support image rotation commands in PDF files. Fixes the long-standing problem of some images being flipped when converting from PDF in calibre.

2025-07-07 10:14:46 -04:00 · 2012-05-19 13:55:17 +05:30 · 2012-05-19 13:55:17 +05:30 · 48ce2b54f8
commit 48ce2b54f8
parent aca782e169
7 changed files with 58 additions and 8 deletions
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@ -6,7 +6,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, socket, struct, subprocess
+import os, socket, struct, subprocess, glob
 from distutils.spawn import find_executable

 from PyQt4 import pyqtconfig
@ -128,8 +128,9 @@ if iswindows:
 elif isosx:
    fc_inc = '/sw/include/fontconfig'
    fc_lib = '/sw/lib'
+    poppler = glob.glob('/sw/build/poppler-*')[-1]
    poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
-            '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
+            '{0}/poppler:{0}'.format(poppler))
    poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
            '/sw/lib')
    poppler_libs = ['poppler']
--- a/setup/installer/linux/freeze2.py
+++ b/setup/installer/linux/freeze2.py
@ -32,7 +32,7 @@ binary_includes = [
                '/lib/libz.so.1',
                '/usr/lib/libtiff.so.5',
                '/lib/libbz2.so.1',
-                '/usr/lib/libpoppler.so.7',
+                '/usr/lib/libpoppler.so.25',
                '/usr/lib/libxml2.so.2',
                '/usr/lib/libopenjpeg.so.2',
                '/usr/lib/libxslt.so.1',
--- a/setup/installer/osx/app/main.py
+++ b/setup/installer/osx/app/main.py
@ -385,7 +385,7 @@ class Py2App(object):
    @flush
    def add_poppler(self):
        info('\nAdding poppler')
-        for x in ('libpoppler.7.dylib',):
+        for x in ('libpoppler.25.dylib',):
            self.install_dylib(os.path.join(SW, 'lib', x))
        self.install_dylib(os.path.join(SW, 'bin', 'pdftohtml'), False)

--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@ -295,7 +295,7 @@ NOTE: poppler must be built as a static library, unless you build the qt4 bindin

 Now do the same for the pdftohtml project

-cp poppler/*.h ~/sw/include/poppler && cp goo/*.h ~/sw/include/poppler/goo && cp splash/*.h ~/sw/include/poppler/splash && cp build/Release/poppler.lib ../../lib/ && cp build/utils/Release/*.exe ../../bin/
+cp poppler/*.h ~/sw/include/poppler && cp goo/*.h ~/sw/include/poppler/goo && cp splash/*.h ~/sw/include/poppler/splash && cp build/Release/poppler.lib ../../lib/ && cp build/utils/Release/pdftohtml.exe ../../bin/


 podofo
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
                '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import errno, os, sys, subprocess, shutil
+import errno, os, sys, subprocess, shutil, re
 from functools import partial

 from calibre.ebooks import ConversionError, DRMError
@ -96,8 +96,36 @@ def pdftohtml(output_dir, pdf_path, no_images):

        with open(index, 'r+b') as i:
            raw = i.read()
+            raw = flip_images(raw)
            raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
            i.seek(0)
            i.truncate()
-            i.write(raw)
+            # Versions of pdftohtml >= 0.20 output self-closing <br/> tags, which
+            # break the PDF heuristics regexps, so convert them back to <br>
+            i.write(raw.replace(b'<br/>', b'<br>'))
+
+def flip_image(img, flip):  # Flip the image file at path `img` and save it back to the same path; `flip` is a bytestring that may contain b'x' and/or b'y'
+    from calibre.utils.magick import Image  # imported inside the function rather than at module level
+    im = Image()
+    im.open(img)
+    if b'x' in flip:
+        im.flip(True)  # truthy argument: Image.flip calls MagickFlopImage (see magick_Image_flip in magick.c)
+    if b'y' in flip:
+        im.flip()  # no argument: Image.flip calls MagickFlipImage; b'xy' triggers both calls
+    im.save(img)  # saved to the same path the image was opened from
+
+def flip_images(raw):  # For each <IMG> tag in the bytes `raw` that has an x/y/xy "flip" class, flip the referenced image file on disk; returns `raw` with its <STYLE> blocks removed
+    for match in re.finditer(b'<IMG[^>]+/?>', raw):  # case-sensitive: only upper-case <IMG ...> tags are examined
+        img = match.group()
+        m = re.search(br'class="(x|y|xy)flip"', img)
+        if m is None: continue  # this tag carries no flip class
+        flip = m.group(1)  # b'x', b'y' or b'xy'
+        src = re.search(br'src="([^"]+)"', img)
+        if src is None: continue  # this tag has no src attribute
+        img = src.group(1)  # `img` is rebound from the tag text to the image path
+        if not os.path.exists(img): continue  # a relative src is resolved against the current working directory
+        print ('Flipping image %s: %s'%(img, flip))
+        flip_image(img, flip)
+    raw = re.sub(br'<STYLE.+?</STYLE>\s*', b'', raw, flags=re.I|re.DOTALL)  # unlike the <IMG> scan above, this match is case-insensitive
+    return raw

--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@ -669,7 +669,6 @@ Some limitations of PDF input are:
    * Complex, multi-column, and image based documents are not supported.
    * Extraction of vector images and tables from within the document is also not supported.
    * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
-    * Some PDFs store their images upside down with a rotation instruction, |app| currently doesn't support that instruction, so the images will be rotated in the output as well. 
    * Links and Tables of Contents are not supported
    * PDFs that use embedded non-unicode fonts to represent non-English characters will result in garbled output for those characters
    * Some PDFs are made up of photographs of the page with OCRed text behind them. In such cases |app| uses the OCRed text, which can be very different from what you see when you view the PDF file
--- a/src/calibre/utils/magick/magick.c
+++ b/src/calibre/utils/magick/magick.c
@ -909,6 +909,24 @@ magick_Image_rotate(magick_Image *self, PyObject *args, PyObject *kwargs) {
 }
 // }}}

+// Image.flip {{{
+
+static PyObject *
+magick_Image_flip(magick_Image *self, PyObject *args, PyObject *kwargs) { // Implements Image.flip([flag]): mirrors self->wand and returns None; kwargs is not read here
+    PyObject *obj = NULL; // optional positional argument; stays NULL when the caller passes nothing
+    MagickBooleanType ret = 0;
+    
+    NULL_CHECK(NULL) // macro defined elsewhere in this file; presumably returns NULL if the wand is unset - TODO confirm
+
+    if (!PyArg_ParseTuple(args, "|O", &obj)) return NULL; // "|O": at most one argument, of any type
+    ret = (obj != NULL && PyObject_IsTrue(obj)) ? MagickFlopImage(self->wand) : MagickFlipImage(self->wand); // truthy flag -> MagickFlopImage (left-right mirror per MagickWand docs), otherwise MagickFlipImage (top-bottom mirror); NOTE(review): the "flip" help string in the method table states the axes the other way round, and a -1 error return from PyObject_IsTrue is treated as true here
+    if (!ret) { PyErr_SetString(PyExc_ValueError, "Failed to flip image"); return NULL; } // a failed MagickWand call surfaces in Python as ValueError
+
+    Py_RETURN_NONE;
+}
+// }}}
+
+
 // Image.set_page {{{

 static PyObject *
@ -1174,6 +1192,10 @@ static PyMethodDef magick_Image_methods[] = {
    {"rotate", (PyCFunction)magick_Image_rotate, METH_VARARGS,
     "rotate(background_pixel_wand, degrees) \n\n Rotate image by specified degrees."
    },
+    {"flip", (PyCFunction)magick_Image_flip, METH_VARARGS,
+     "flip(horizontal=False) \n\n Flip image about a vertical axis. If horizontal is True, flip about horizontal axis instead."
+    },
+

    {"normalize", (PyCFunction)magick_Image_normalize, METH_VARARGS,
     "normalize() \n\n enhances the contrast of a color image by adjusting the pixels color to span the entire range of colors available."