Merge pull request #100 from danielquinn/issue/98

Issue/98
2025-12-14 00:55:08 -05:00 · 2016-03-27 20:32:44 +01:00 · 2016-03-27 20:32:44 +01:00 · 66a81cf6ac
commit 66a81cf6ac
parent 396ff98b41 23aa79f307
5 changed files with 94 additions and 19 deletions
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -3,6 +3,10 @@ Changelog
 * 0.2.0
  * `#98`_: Added optional environment variables for ImageMagick so that it
    doesn't explode when handling Very Large Documents or when it's just
    running on a low-memory system.  Thanks to `Florian Harr`_ for his help on
    this one.
  * Added support for guessing the date from the file name along with the
    correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull
    request that I took forever to merge and to `Pit`_ for his efforts on the
@ -97,6 +101,7 @@ Changelog
 .. _zedster: https://github.com/zedster
 .. _Martin Honermeyer: https://github.com/djmaze
 .. _Tim White: https://github.com/timwhite
 .. _Florian Harr: https://github.com/evils
 .. _#20: https://github.com/danielquinn/paperless/issues/20
 .. _#44: https://github.com/danielquinn/paperless/issues/44
@ -111,3 +116,4 @@ Changelog
 .. _#68: https://github.com/danielquinn/paperless/issues/68
 .. _#71: https://github.com/danielquinn/paperless/issues/71
 .. _#94: https://github.com/danielquinn/paperless/issues/94
 .. _#98: https://github.com/danielquinn/paperless/issues/98
--- a/docs/troubleshooting.rst
+++ b/docs/troubleshooting.rst
@ -3,17 +3,47 @@
 Troubleshooting
 ===============
-.. _troubleshooting_ocr_language_files_missing:
+.. _troubleshooting-languagemissing:
 Consumer warns ``OCR for XX failed``
 ------------------------------------
-If you find the OCR accuracy to be too low, and/or the document consumer warns that ``OCR for
+If you find the OCR accuracy to be too low, and/or the document consumer warns
-XX failed, but we're going to stick with what we've got since FORGIVING_OCR is enabled``, then you
+that ``OCR for XX failed, but we're going to stick with what we've got since
-might need to install the `Tesseract language files
+FORGIVING_OCR is enabled``, then you might need to install the
-<http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_ marching your documents languages.
+`Tesseract language files <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_
 matching your documents' languages.
-As an example, if you are running Paperless from the Vagrant setup provided (or from any Ubuntu or Debian
+As an example, if you are running Paperless from the Vagrant setup provided
-box), and your documents are written in Spanish you may need to run::
+(or from any Ubuntu or Debian box), and your documents are written in Spanish
 you may need to run::
    apt-get install -y tesseract-ocr-spa
 .. _troubleshooting-convertpixelcache:
 Consumer dies with ``convert: unable to extend pixel cache``
 ------------------------------------------------------------
 During the consumption process, Paperless invokes ImageMagick's ``convert``
 program to translate the source document into something that the OCR engine can
 understand and this can burn a Very Large amount of memory if the original
 document is rather long.  Similarly, if your system doesn't have a lot of
 memory to begin with (e.g. a Raspberry Pi), then this can happen for even
 medium-sized documents.
 The solution is to tell ImageMagick *not* to Use All The RAM, as is its
 default, and instead tell it to use a fixed amount.  ``convert`` will then
 break up the job into hundreds of individual files and use them to slowly
 compile the finished image.  Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in
 ``/etc/paperless.conf`` to something like ``32000000`` and you'll limit
 ``convert`` to 32MB.  Fiddle with this value as you like.
 **HOWEVER**: Simply setting this value may not be enough on systems where
 ``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its
 temporary files.  In these cases (most Systemd machines), you need to tell
 ImageMagick to use a different space for its scratch work.  You do this by
 setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere
 that's actually on a physical disk (and writable by the user running
 Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch.
--- a/paperless.conf.example
+++ b/paperless.conf.example
@ -32,7 +32,32 @@ PAPERLESS_PASSPHRASE="secret"
 # have a shared secret here.
 PAPERLESS_SHARED_SECRET=""
 #
 # The following values use sensible defaults for modern systems, but if you're
 # running Paperless on a low-resource machine (like a Raspberry Pi), modifying
 # some of these values may be necessary.
 #
 # By default, Paperless will attempt to use all available CPU cores to process
 # a document, but if you would like to limit that, you can set this value to
 # an integer:
 #PAPERLESS_OCR_THREADS=1
 # On smaller systems, or even in the case of Very Large Documents, the consumer
 # may explode, complaining about how it's "unable to extend pixel cache".  In
 # such cases, try setting this to a reasonably low value, like 32000000.  The
 # default is to use whatever is necessary to do everything without writing to
 # disk, and units are in bytes.
 #
 # For more information on how to use this value, you should probably search
 # the web for "MAGICK_MEMORY_LIMIT".
 #PAPERLESS_CONVERT_MEMORY_LIMIT=0
 # Similar to the memory limit, if you've got a small system and your OS mounts
 # /tmp as tmpfs, you should set this to a path that's on a physical disk, like
 # /home/your_user/tmp or something.  ImageMagick will use this as scratch space
 # when crunching through very large documents.
 #
 # For more information on how to use this value, you should probably search
 # the web for "MAGICK_TMPDIR".
 #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -129,10 +129,13 @@ class Consumer(object):
        # Convert PDF to multiple PNMs
        pnm = os.path.join(tempdir, "convert-%04d.pnm")
-        subprocess.Popen((
+        run_convert(
-            self.CONVERT, "-density", "300", "-depth", "8",
+            self.CONVERT,
-            "-type", "grayscale", doc, pnm
+            "-density", "300",
-        )).wait()
+            "-depth", "8",
            "-type", "grayscale",
            doc, pnm,
        )
        # Get a list of converted images
        pnms = []
@ -159,13 +162,12 @@ class Consumer(object):
        self.log("info", "Generating the thumbnail")
-        subprocess.Popen((
+        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
-            doc,
+            doc, os.path.join(tempdir, "convert-%04d.png")
-            os.path.join(tempdir, "convert-%04d.png")
+        )
        )).wait()
        return os.path.join(tempdir, "convert-0000.png")
@ -334,6 +336,16 @@ def image_to_string(args):
 def run_unpaper(args):
    unpaper, pnm = args
-    subprocess.Popen((
+    subprocess.Popen(
-        unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
+        (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
-    )).wait()
+
 def run_convert(*args):
     """
     Run ImageMagick's ``convert`` with the given command line and wait for it
     to finish.

     ``args`` is the full argument vector, starting with the path to the
     ``convert`` binary.  The exit status of the child is not inspected and
     nothing is returned.

     The child inherits the current process environment.  If
     ``settings.CONVERT_MEMORY_LIMIT`` and/or ``settings.CONVERT_TMPDIR`` are
     set, they are passed along to the child as ``MAGICK_MEMORY_LIMIT`` and
     ``MAGICK_TMPDIR`` respectively.
     """
     # Start from a copy of our own environment rather than an empty dict:
     # passing a bare dict as ``env`` would strip PATH, HOME, locale settings,
     # etc. from the child process.
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
     if settings.CONVERT_TMPDIR:
         environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
     subprocess.Popen(args, env=environment).wait()
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -189,6 +189,8 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
 # Convert is part of the ImageMagick package
 CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
 CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
 # Unpaper
 UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
@ -226,7 +228,7 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
 SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
 #
-# TODO: Remove after 1.2
+# TODO: Remove after 0.2
 #
 # This logic is here to address issue #44, wherein we were using inconsistent
 # constant names vs. environment variables.  If you're using Paperless for the