mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	
						commit
						66a81cf6ac
					
				| @ -3,6 +3,10 @@ Changelog | |||||||
| 
 | 
 | ||||||
| * 0.2.0 | * 0.2.0 | ||||||
| 
 | 
 | ||||||
|  |   * `#98`_: Added optional environment variables for ImageMagick so that it | ||||||
|  |     doesn't explode when handling Very Large Documents or when it's just | ||||||
|  |     running on a low-memory system.  Thanks to `Florian Harr`_ for his help on | ||||||
|  |     this one. | ||||||
|   * Added support for guessing the date from the file name along with the |   * Added support for guessing the date from the file name along with the | ||||||
|     correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull |     correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull | ||||||
|     request that I took forever to merge and to `Pit`_ for his efforts on the |     request that I took forever to merge and to `Pit`_ for his efforts on the | ||||||
| @ -97,6 +101,7 @@ Changelog | |||||||
| .. _zedster: https://github.com/zedster | .. _zedster: https://github.com/zedster | ||||||
| .. _Martin Honermeyer: https://github.com/djmaze | .. _Martin Honermeyer: https://github.com/djmaze | ||||||
| .. _Tim White: https://github.com/timwhite | .. _Tim White: https://github.com/timwhite | ||||||
|  | .. _Florian Harr: https://github.com/evils | ||||||
| 
 | 
 | ||||||
| .. _#20: https://github.com/danielquinn/paperless/issues/20 | .. _#20: https://github.com/danielquinn/paperless/issues/20 | ||||||
| .. _#44: https://github.com/danielquinn/paperless/issues/44 | .. _#44: https://github.com/danielquinn/paperless/issues/44 | ||||||
| @ -111,3 +116,4 @@ Changelog | |||||||
| .. _#68: https://github.com/danielquinn/paperless/issues/68 | .. _#68: https://github.com/danielquinn/paperless/issues/68 | ||||||
| .. _#71: https://github.com/danielquinn/paperless/issues/71 | .. _#71: https://github.com/danielquinn/paperless/issues/71 | ||||||
| .. _#94: https://github.com/danielquinn/paperless/issues/94 | .. _#94: https://github.com/danielquinn/paperless/issues/94 | ||||||
|  | .. _#98: https://github.com/danielquinn/paperless/issues/98 | ||||||
|  | |||||||
| @ -3,17 +3,47 @@ | |||||||
| Troubleshooting | Troubleshooting | ||||||
| =============== | =============== | ||||||
| 
 | 
 | ||||||
| .. _troubleshooting_ocr_language_files_missing: | .. _troubleshooting-languagemissing: | ||||||
| 
 | 
 | ||||||
| Consumer warns ``OCR for XX failed`` | Consumer warns ``OCR for XX failed`` | ||||||
| ------------------------------------ | ------------------------------------ | ||||||
| 
 | 
 | ||||||
| If you find the OCR accuracy to be too low, and/or the document consumer warns that ``OCR for | If you find the OCR accuracy to be too low, and/or the document consumer warns | ||||||
| XX failed, but we're going to stick with what we've got since FORGIVING_OCR is enabled``, then you | that ``OCR for XX failed, but we're going to stick with what we've got since | ||||||
| might need to install the `Tesseract language files | FORGIVING_OCR is enabled``, then you might need to install the | ||||||
| <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_ matching your documents' languages. | `Tesseract language files <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_ | ||||||
|  | matching your documents' languages. | ||||||
| 
 | 
 | ||||||
| As an example, if you are running Paperless from the Vagrant setup provided (or from any Ubuntu or Debian | As an example, if you are running Paperless from the Vagrant setup provided | ||||||
| box), and your documents are written in Spanish you may need to run:: | (or from any Ubuntu or Debian box), and your documents are written in Spanish | ||||||
|  | you may need to run:: | ||||||
| 
 | 
 | ||||||
|     apt-get install -y tesseract-ocr-spa |     apt-get install -y tesseract-ocr-spa | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _troubleshooting-convertpixelcache: | ||||||
|  | 
 | ||||||
|  | Consumer dies with ``convert: unable to extend pixel cache`` | ||||||
|  | ------------------------------------------------------------ | ||||||
|  | 
 | ||||||
|  | During the consumption process, Paperless invokes ImageMagick's ``convert`` | ||||||
|  | program to translate the source document into something that the OCR engine can | ||||||
|  | understand and this can burn a Very Large amount of memory if the original | ||||||
|  | document is rather long.  Similarly, if your system doesn't have a lot of | ||||||
|  | memory to begin with (i.e. a Raspberry Pi), then this can happen for even | ||||||
|  | medium-sized documents. | ||||||
|  | 
 | ||||||
|  | The solution is to tell ImageMagick *not* to Use All The RAM, as is its | ||||||
|  | default, and instead tell it to use a fixed amount.  ``convert`` will then | ||||||
|  | break up the job into hundreds of individual files and use them to slowly | ||||||
|  | compile the finished image.  Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in | ||||||
|  | ``/etc/paperless.conf`` to something like ``32000000`` and you'll limit | ||||||
|  | ``convert`` to 32MB.  Fiddle with this value as you like. | ||||||
|  | 
 | ||||||
|  | **HOWEVER**: Simply setting this value may not be enough on systems where | ||||||
|  | ``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its | ||||||
|  | temporary files.  In these cases (most Systemd machines), you need to tell | ||||||
|  | ImageMagick to use a different space for its scratch work.  You do this by | ||||||
|  | setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere | ||||||
|  | that's actually on a physical disk (and writable by the user running | ||||||
|  | Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch. | ||||||
|  | |||||||
| @ -32,7 +32,32 @@ PAPERLESS_PASSPHRASE="secret" | |||||||
| # have a shared secret here. | # have a shared secret here. | ||||||
| PAPERLESS_SHARED_SECRET="" | PAPERLESS_SHARED_SECRET="" | ||||||
| 
 | 
 | ||||||
|  | # | ||||||
|  | # The following values use sensible defaults for modern systems, but if you're | ||||||
|  | # running Paperless on a low-resource machine (like a Raspberry Pi), modifying | ||||||
|  | # some of these values may be necessary. | ||||||
|  | # | ||||||
|  | 
 | ||||||
| # By default, Paperless will attempt to use all available CPU cores to process | # By default, Paperless will attempt to use all available CPU cores to process | ||||||
| # a document, but if you would like to limit that, you can set this value to | # a document, but if you would like to limit that, you can set this value to | ||||||
| # an integer: | # an integer: | ||||||
| #PAPERLESS_OCR_THREADS=1 | #PAPERLESS_OCR_THREADS=1 | ||||||
|  | 
 | ||||||
|  | # On smaller systems, or even in the case of Very Large Documents, the consumer | ||||||
|  | # may explode, complaining about how it's "unable to extend pixel cache".  In | ||||||
|  | # such cases, try setting this to a reasonably low value, like 32000000.  The | ||||||
|  | # default is to use whatever is necessary to do everything without writing to | ||||||
|  | # disk, and units are in bytes. | ||||||
|  | # | ||||||
|  | # For more information on how to use this value, you should probably search | ||||||
|  | # the web for "MAGICK_MEMORY_LIMIT". | ||||||
|  | #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||||
|  | 
 | ||||||
|  | # Similar to the memory limit, if you've got a small system and your OS mounts | ||||||
|  | # /tmp as tmpfs, you should set this to a path that's on a physical disk, like | ||||||
|  | # /home/your_user/tmp or something.  ImageMagick will use this as scratch space | ||||||
|  | # when crunching through very large documents. | ||||||
|  | # | ||||||
|  | # For more information on how to use this value, you should probably search | ||||||
|  | # the web for "MAGICK_TMPDIR". | ||||||
|  | #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||||
|  | |||||||
| @ -129,10 +129,13 @@ class Consumer(object): | |||||||
| 
 | 
 | ||||||
|         # Convert PDF to multiple PNMs |         # Convert PDF to multiple PNMs | ||||||
|         pnm = os.path.join(tempdir, "convert-%04d.pnm") |         pnm = os.path.join(tempdir, "convert-%04d.pnm") | ||||||
|         subprocess.Popen(( |         run_convert( | ||||||
|             self.CONVERT, "-density", "300", "-depth", "8", |             self.CONVERT, | ||||||
|             "-type", "grayscale", doc, pnm |             "-density", "300", | ||||||
|         )).wait() |             "-depth", "8", | ||||||
|  |             "-type", "grayscale", | ||||||
|  |             doc, pnm, | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         # Get a list of converted images |         # Get a list of converted images | ||||||
|         pnms = [] |         pnms = [] | ||||||
| @ -159,13 +162,12 @@ class Consumer(object): | |||||||
| 
 | 
 | ||||||
|         self.log("info", "Generating the thumbnail") |         self.log("info", "Generating the thumbnail") | ||||||
| 
 | 
 | ||||||
|         subprocess.Popen(( |         run_convert( | ||||||
|             self.CONVERT, |             self.CONVERT, | ||||||
|             "-scale", "500x5000", |             "-scale", "500x5000", | ||||||
|             "-alpha", "remove", |             "-alpha", "remove", | ||||||
|             doc, |             doc, os.path.join(tempdir, "convert-%04d.png") | ||||||
|             os.path.join(tempdir, "convert-%04d.png") |         ) | ||||||
|         )).wait() |  | ||||||
| 
 | 
 | ||||||
|         return os.path.join(tempdir, "convert-0000.png") |         return os.path.join(tempdir, "convert-0000.png") | ||||||
| 
 | 
 | ||||||
| @ -334,6 +336,16 @@ def image_to_string(args): | |||||||
| 
 | 
 | ||||||
| def run_unpaper(args): | def run_unpaper(args): | ||||||
|     unpaper, pnm = args |     unpaper, pnm = args | ||||||
|     subprocess.Popen(( |     subprocess.Popen( | ||||||
|         unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm") |         (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() | ||||||
|     )).wait() | 
 | ||||||
|  | 
 | ||||||
|  | def run_convert(*args): | ||||||
|  | 
 | ||||||
|  |     environment = {} | ||||||
|  |     if settings.CONVERT_MEMORY_LIMIT: | ||||||
|  |         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT | ||||||
|  |     if settings.CONVERT_TMPDIR: | ||||||
|  |         environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR | ||||||
|  | 
 | ||||||
|  |     subprocess.Popen(args, env=environment).wait() | ||||||
|  | |||||||
| @ -189,6 +189,8 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | |||||||
| 
 | 
 | ||||||
| # Convert is part of the ImageMagick package | # Convert is part of the ImageMagick package | ||||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") | CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") | ||||||
|  | CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||||
|  | CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||||
| 
 | 
 | ||||||
| # Unpaper | # Unpaper | ||||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||||
| @ -226,7 +228,7 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") | |||||||
| SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") | SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") | ||||||
| 
 | 
 | ||||||
| # | # | ||||||
| # TODO: Remove after 1.2 | # TODO: Remove after 0.2 | ||||||
| # | # | ||||||
| # This logic is here to address issue #44, wherein we were using inconsistent | # This logic is here to address issue #44, wherein we were using inconsistent | ||||||
| # constant names vs. environment variables.  If you're using Paperless for the | # constant names vs. environment variables.  If you're using Paperless for the | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user