mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-25 07:49:06 -04:00 
			
		
		
		
	
						commit
						66a81cf6ac
					
				| @ -3,6 +3,10 @@ Changelog | ||||
| 
 | ||||
| * 0.2.0 | ||||
| 
 | ||||
|   * `#98`_: Added optional environment variables for ImageMagick so that it | ||||
|     doesn't explode when handling Very Large Documents or when it's just | ||||
|     running on a low-memory system.  Thanks to `Florian Harr`_ for his help on | ||||
|     this one. | ||||
|   * Added support for guessing the date from the file name along with the | ||||
|     correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull | ||||
|     request that I took forever to merge and to `Pit`_ for his efforts on the | ||||
| @ -97,6 +101,7 @@ Changelog | ||||
| .. _zedster: https://github.com/zedster | ||||
| .. _Martin Honermeyer: https://github.com/djmaze | ||||
| .. _Tim White: https://github.com/timwhite | ||||
| .. _Florian Harr: https://github.com/evils | ||||
| 
 | ||||
| .. _#20: https://github.com/danielquinn/paperless/issues/20 | ||||
| .. _#44: https://github.com/danielquinn/paperless/issues/44 | ||||
| @ -111,3 +116,4 @@ Changelog | ||||
| .. _#68: https://github.com/danielquinn/paperless/issues/68 | ||||
| .. _#71: https://github.com/danielquinn/paperless/issues/71 | ||||
| .. _#94: https://github.com/danielquinn/paperless/issues/94 | ||||
| .. _#98: https://github.com/danielquinn/paperless/issues/98 | ||||
|  | ||||
| @ -3,17 +3,47 @@ | ||||
| Troubleshooting | ||||
| =============== | ||||
| 
 | ||||
| .. _troubleshooting_ocr_language_files_missing: | ||||
| .. _troubleshooting-languagemissing: | ||||
| 
 | ||||
| Consumer warns ``OCR for XX failed`` | ||||
| ------------------------------------ | ||||
| 
 | ||||
| If you find the OCR accuracy to be too low, and/or the document consumer warns that ``OCR for | ||||
| XX failed, but we're going to stick with what we've got since FORGIVING_OCR is enabled``, then you | ||||
| might need to install the `Tesseract language files | ||||
| <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_ matching your documents' languages. | ||||
| If you find the OCR accuracy to be too low, and/or the document consumer warns | ||||
| that ``OCR for XX failed, but we're going to stick with what we've got since | ||||
| FORGIVING_OCR is enabled``, then you might need to install the | ||||
| `Tesseract language files <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_ | ||||
| matching your documents' languages. | ||||
| 
 | ||||
| As an example, if you are running Paperless from the Vagrant setup provided (or from any Ubuntu or Debian | ||||
| box), and your documents are written in Spanish you may need to run:: | ||||
| As an example, if you are running Paperless from the Vagrant setup provided | ||||
| (or from any Ubuntu or Debian box), and your documents are written in Spanish | ||||
| you may need to run:: | ||||
| 
 | ||||
|     apt-get install -y tesseract-ocr-spa | ||||
| 
 | ||||
| 
 | ||||
| .. _troubleshooting-convertpixelcache: | ||||
| 
 | ||||
| Consumer dies with ``convert: unable to extend pixel cache`` | ||||
| ------------------------------------------------------------ | ||||
| 
 | ||||
| During the consumption process, Paperless invokes ImageMagick's ``convert`` | ||||
| program to translate the source document into something that the OCR engine can | ||||
| understand and this can burn a Very Large amount of memory if the original | ||||
| document is rather long.  Similarly, if your system doesn't have a lot of | ||||
| memory to begin with (e.g. a Raspberry Pi), then this can happen for even | ||||
| medium-sized documents. | ||||
| 
 | ||||
| The solution is to tell ImageMagick *not* to Use All The RAM, as is its | ||||
| default, and instead tell it to use a fixed amount.  ``convert`` will then | ||||
| break up the job into hundreds of individual files and use them to slowly | ||||
| compile the finished image.  Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in | ||||
| ``/etc/paperless.conf`` to something like ``32000000`` and you'll limit | ||||
| ``convert`` to 32MB.  Fiddle with this value as you like. | ||||
| 
 | ||||
| **HOWEVER**: Simply setting this value may not be enough on systems where | ||||
| ``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its | ||||
| temporary files.  In these cases (most Systemd machines), you need to tell | ||||
| ImageMagick to use a different space for its scratch work.  You do this by | ||||
| setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere | ||||
| that's actually on a physical disk (and writable by the user running | ||||
| Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch. | ||||
|  | ||||
| @ -32,7 +32,32 @@ PAPERLESS_PASSPHRASE="secret" | ||||
| # have a shared secret here. | ||||
| PAPERLESS_SHARED_SECRET="" | ||||
| 
 | ||||
| # | ||||
| # The following values use sensible defaults for modern systems, but if you're | ||||
| # running Paperless on a low-resource machine (like a Raspberry Pi), modifying | ||||
| # some of these values may be necessary. | ||||
| # | ||||
| 
 | ||||
| # By default, Paperless will attempt to use all available CPU cores to process | ||||
| # a document, but if you would like to limit that, you can set this value to | ||||
| # an integer: | ||||
| #PAPERLESS_OCR_THREADS=1 | ||||
| 
 | ||||
| # On smaller systems, or even in the case of Very Large Documents, the consumer | ||||
| # may explode, complaining about how it's "unable to extend pixel cache".  In | ||||
| # such cases, try setting this to a reasonably low value, like 32000000.  The | ||||
| # default is to use whatever is necessary to do everything without writing to | ||||
| # disk, and units are in bytes. | ||||
| # | ||||
| # For more information on how to use this value, you should probably search | ||||
| # the web for "MAGICK_MEMORY_LIMIT". | ||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||
| 
 | ||||
| # Similar to the memory limit, if you've got a small system and your OS mounts | ||||
| # /tmp as tmpfs, you should set this to a path that's on a physical disk, like | ||||
| # /home/your_user/tmp or something.  ImageMagick will use this as scratch space | ||||
| # when crunching through very large documents. | ||||
| # | ||||
| # For more information on how to use this value, you should probably search | ||||
| # the web for "MAGICK_TMPDIR". | ||||
| #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||
|  | ||||
| @ -129,10 +129,13 @@ class Consumer(object): | ||||
| 
 | ||||
|         # Convert PDF to multiple PNMs | ||||
|         pnm = os.path.join(tempdir, "convert-%04d.pnm") | ||||
|         subprocess.Popen(( | ||||
|             self.CONVERT, "-density", "300", "-depth", "8", | ||||
|             "-type", "grayscale", doc, pnm | ||||
|         )).wait() | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-density", "300", | ||||
|             "-depth", "8", | ||||
|             "-type", "grayscale", | ||||
|             doc, pnm, | ||||
|         ) | ||||
| 
 | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
| @ -159,13 +162,12 @@ class Consumer(object): | ||||
| 
 | ||||
|         self.log("info", "Generating the thumbnail") | ||||
| 
 | ||||
|         subprocess.Popen(( | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-scale", "500x5000", | ||||
|             "-alpha", "remove", | ||||
|             doc, | ||||
|             os.path.join(tempdir, "convert-%04d.png") | ||||
|         )).wait() | ||||
|             doc, os.path.join(tempdir, "convert-%04d.png") | ||||
|         ) | ||||
| 
 | ||||
|         return os.path.join(tempdir, "convert-0000.png") | ||||
| 
 | ||||
| @ -334,6 +336,16 @@ def image_to_string(args): | ||||
| 
 | ||||
| def run_unpaper(args): | ||||
|     unpaper, pnm = args | ||||
|     subprocess.Popen(( | ||||
|         unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm") | ||||
|     )).wait() | ||||
|     subprocess.Popen( | ||||
|         (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() | ||||
| 
 | ||||
| 
 | ||||
| def run_convert(*args): | ||||
| 
 | ||||
|     environment = {} | ||||
|     if settings.CONVERT_MEMORY_LIMIT: | ||||
|         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT | ||||
|     if settings.CONVERT_TMPDIR: | ||||
|         environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR | ||||
| 
 | ||||
|     subprocess.Popen(args, env=environment).wait() | ||||
|  | ||||
| @ -189,6 +189,8 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| 
 | ||||
| # Convert is part of the ImageMagick package | ||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") | ||||
| CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||
| 
 | ||||
| # Unpaper | ||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||
| @ -226,7 +228,7 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") | ||||
| SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") | ||||
| 
 | ||||
| # | ||||
| # TODO: Remove after 1.2 | ||||
| # TODO: Remove after 0.2 | ||||
| # | ||||
| # This logic is here to address issue #44, wherein we were using inconsistent | ||||
| # constant names vs. environment variables.  If you're using Paperless for the | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user