mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	new setting: PAPERLESS_OCR_PAGES
This commit is contained in:
		
							parent
							
								
									ea089de3b3
								
							
						
					
					
						commit
						fec9e54049
					
				| @ -26,6 +26,9 @@ next | |||||||
| 
 | 
 | ||||||
| * Much better admin for mail rule editing. | * Much better admin for mail rule editing. | ||||||
| 
 | 
 | ||||||
|  | * New setting ``PAPERLESS_OCR_PAGES`` limits the tesseract parser | ||||||
|  |   to the first n pages of scanned documents. | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| paperless-ng 0.9.1 | paperless-ng 0.9.1 | ||||||
| ################## | ################## | ||||||
|  | |||||||
| @ -184,6 +184,16 @@ PAPERLESS_TIME_ZONE=<timezone> | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | PAPERLESS_OCR_PAGES=<num> | ||||||
|  |     Tells paperless to use only the specified number of pages for OCR. Documents | ||||||
|  |     with fewer than the specified number of pages get OCR'ed completely. | ||||||
|  | 
 | ||||||
|  |     Specifying 1 here will only use the first page. | ||||||
|  | 
 | ||||||
|  |     Defaults to 0, which disables this feature and always uses all pages. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| PAPERLESS_OCR_LANGUAGE=<lang> | PAPERLESS_OCR_LANGUAGE=<lang> | ||||||
|     Customize the default language that tesseract will attempt to use when |     Customize the default language that tesseract will attempt to use when | ||||||
|     parsing documents. The default language is used whenever |     parsing documents. The default language is used whenever | ||||||
|  | |||||||
| @ -358,4 +358,30 @@ management commands as below. | |||||||
| 7.  Start paperless. | 7.  Start paperless. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | Considerations for less powerful devices | ||||||
|  | ######################################## | ||||||
|  | 
 | ||||||
|  | Paperless runs on Raspberry Pi. However, some things are rather slow on the Pi and  | ||||||
|  | configuring some options in paperless can help improve performance immensely: | ||||||
|  | 
 | ||||||
|  | *   Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR | ||||||
|  |     the first page of your documents. | ||||||
|  | *   ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured | ||||||
|  |     to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that | ||||||
|  |     paperless will use 2 workers and 2 threads per worker. This may result in | ||||||
|  |     sluggish response times during consumption, so you might want to lower these | ||||||
|  |     settings (example: 2 workers and 1 thread to always have some computing power | ||||||
|  |     left for other tasks). | ||||||
|  | *   Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing | ||||||
|  |     your documents before feeding them into paperless. Some scanners are able to | ||||||
|  |     do this! | ||||||
|  | *   Lower ``PAPERLESS_CONVERT_DENSITY`` from its default value 300 to 200. This | ||||||
|  |     will still result in rather accurate OCR, but will decrease consumption time | ||||||
|  |     by quite a bit. | ||||||
|  | *   Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption | ||||||
|  |     times. Thumbnails will be about 20% larger. | ||||||
|  | 
 | ||||||
|  | For details, refer to :ref:`configuration`. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| .. _redis: https://redis.io/ | .. _redis: https://redis.io/ | ||||||
|  | |||||||
| @ -35,6 +35,7 @@ | |||||||
| #PAPERLESS_TASK_WORKERS=1 | #PAPERLESS_TASK_WORKERS=1 | ||||||
| #PAPERLESS_THREADS_PER_WORKER=1 | #PAPERLESS_THREADS_PER_WORKER=1 | ||||||
| #PAPERLESS_TIME_ZONE=UTC | #PAPERLESS_TIME_ZONE=UTC | ||||||
|  | #PAPERLESS_OCR_PAGES=1 | ||||||
| #PAPERLESS_OCR_LANGUAGE=eng | #PAPERLESS_OCR_LANGUAGE=eng | ||||||
| #PAPERLESS_OCR_ALWAYS=false | #PAPERLESS_OCR_ALWAYS=false | ||||||
| #PAPERLESS_CONSUMER_POLLING=10 | #PAPERLESS_CONSUMER_POLLING=10 | ||||||
|  | |||||||
| @ -322,6 +322,8 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES | |||||||
| 
 | 
 | ||||||
| OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | ||||||
| 
 | 
 | ||||||
|  | OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | ||||||
|  | 
 | ||||||
| # The default language that tesseract will attempt to use when parsing | # The default language that tesseract will attempt to use when parsing | ||||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||||
|  | |||||||
| @ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         Greyscale images are easier for Tesseract to OCR |         Greyscale images are easier for Tesseract to OCR | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|  |         # Convert PDF to multiple PNMs | ||||||
|  |         input_file = self.document_path | ||||||
|  | 
 | ||||||
|  |         if settings.OCR_PAGES == 1: | ||||||
|  |             input_file += "[0]" | ||||||
|  |         elif settings.OCR_PAGES > 1: | ||||||
|  |             input_file += f"[0-{settings.OCR_PAGES - 1}]" | ||||||
|  | 
 | ||||||
|         self.log( |         self.log( | ||||||
|             "debug", |             "debug", | ||||||
|             f"Converting document {self.document_path} into greyscale images") |             f"Converting document {input_file} into greyscale images") | ||||||
| 
 | 
 | ||||||
|         # Convert PDF to multiple PNMs |         output_files = os.path.join(self.tempdir, "convert-%04d.pnm") | ||||||
|         pnm = os.path.join(self.tempdir, "convert-%04d.pnm") |  | ||||||
| 
 | 
 | ||||||
|         run_convert(density=settings.CONVERT_DENSITY, |         run_convert(density=settings.CONVERT_DENSITY, | ||||||
|                     depth="8", |                     depth="8", | ||||||
|                     type="grayscale", |                     type="grayscale", | ||||||
|                     input_file=self.document_path, |                     input_file=input_file, | ||||||
|                     output_file=pnm, |                     output_file=output_files, | ||||||
|                     logging_group=self.logging_group) |                     logging_group=self.logging_group) | ||||||
| 
 | 
 | ||||||
|         # Get a list of converted images |         # Get a list of converted images | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user