mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	Merge pull request #1008 from paperless-ngx/bugfix-max-pixel-setting
Bugfix: Corrects the setting of max pixel size for OCR
This commit is contained in:
		
						commit
						a4927477fb
					
				| @ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num> | |||||||
|     the produced PDF documents are A4 sized. |     the produced PDF documents are A4 sized. | ||||||
| 
 | 
 | ||||||
| PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num> | PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num> | ||||||
|     Paperless will not OCR images that have more pixels than this limit. |     Paperless will raise a warning when OCRing images which are over this limit and | ||||||
|     This is intended to prevent decompression bombs from overloading paperless. |     will not OCR images which are more than twice this limit.  Note this does not | ||||||
|     Increasing this limit is desired if you face a DecompressionBombError despite |     prevent the document from being consumed, but could result in missing text content. | ||||||
|     the concerning file not being malicious; this could e.g. be caused by invalidly | 
 | ||||||
|     recognized metadata. |     If unset, will default to the value determined by | ||||||
|     If you have enough resources or if you are certain that your uploaded files |     `Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_. | ||||||
|     are not malicious you can increase this value to your needs. | 
 | ||||||
|     The default value is 256000000, an image with more pixels than that would not be parsed. |     .. note:: | ||||||
|  | 
 | ||||||
|  |         Increasing this limit could cause Paperless to consume additional resources | ||||||
|  |         when consuming a file.  Be sure you have sufficient system resources. | ||||||
|  | 
 | ||||||
|  |     .. caution:: | ||||||
|  | 
 | ||||||
|  |         The limit is intended to prevent malicious files from consuming system resources | ||||||
|  |         and causing crashes and other errors.  Only increase this value if you are certain | ||||||
|  |         your documents are not malicious and you need the text which was not OCRed. | ||||||
| 
 | 
 | ||||||
| PAPERLESS_OCR_USER_ARGS=<json> | PAPERLESS_OCR_USER_ARGS=<json> | ||||||
|     OCRmyPDF offers many more options. Use this parameter to specify any |     OCRmyPDF offers many more options. Use this parameter to specify any | ||||||
|  | |||||||
| @ -5,6 +5,7 @@ import multiprocessing | |||||||
| import os | import os | ||||||
| import re | import re | ||||||
| from typing import Final | from typing import Final | ||||||
|  | from typing import Optional | ||||||
| from typing import Set | from typing import Set | ||||||
| from urllib.parse import urlparse | from urllib.parse import urlparse | ||||||
| 
 | 
 | ||||||
| @ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float( | |||||||
|     os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), |     os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| OCR_MAX_IMAGE_PIXELS = os.environ.get( | OCR_MAX_IMAGE_PIXELS: Optional[int] = None | ||||||
|     "PAPERLESS_OCR_MAX_IMAGE_PIXELS", | if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None: | ||||||
|     256000000, |     OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")) | ||||||
| ) |  | ||||||
| 
 | 
 | ||||||
| OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf | |||||||
| from documents.parsers import ParseError | from documents.parsers import ParseError | ||||||
| from PIL import Image | from PIL import Image | ||||||
| 
 | 
 | ||||||
| Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| class NoTextFoundException(Exception): | class NoTextFoundException(Exception): | ||||||
|     pass |     pass | ||||||
| @ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                     f"they will not be used. Error: {e}", |                     f"they will not be used. Error: {e}", | ||||||
|                 ) |                 ) | ||||||
| 
 | 
 | ||||||
|  |         if settings.OCR_MAX_IMAGE_PIXELS is not None: | ||||||
|  |             # Convert pixels to mega-pixels and provide to ocrmypdf | ||||||
|  |             max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 | ||||||
|  |             if max_pixels_mpixels > 0: | ||||||
|  | 
 | ||||||
|  |                 self.log( | ||||||
|  |                     "debug", | ||||||
|  |                     f"Calculated {max_pixels_mpixels} megapixels for OCR", | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |                 ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels | ||||||
|  |             else: | ||||||
|  |                 self.log( | ||||||
|  |                     "warning", | ||||||
|  |                     "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, " | ||||||
|  |                     "this value must be at least 1 megapixel if set", | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|         return ocrmypdf_args |         return ocrmypdf_args | ||||||
| 
 | 
 | ||||||
|     def parse(self, document_path, mime_type, file_name=None): |     def parse(self, document_path, mime_type, file_name=None): | ||||||
|  | |||||||
| @ -6,8 +6,6 @@ from PIL import Image | |||||||
| from PIL import ImageDraw | from PIL import ImageDraw | ||||||
| from PIL import ImageFont | from PIL import ImageFont | ||||||
| 
 | 
 | ||||||
| Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| class TextDocumentParser(DocumentParser): | class TextDocumentParser(DocumentParser): | ||||||
|     """ |     """ | ||||||
| @ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|         font = ImageFont.truetype( |         font = ImageFont.truetype( | ||||||
|             font=settings.THUMBNAIL_FONT_NAME, |             font=settings.THUMBNAIL_FONT_NAME, | ||||||
|             size=20, |             size=20, | ||||||
|             layout_engine=ImageFont.LAYOUT_BASIC, |             layout_engine=ImageFont.Layout.BASIC, | ||||||
|         ) |         ) | ||||||
|         draw.text((5, 5), read_text(), font=font, fill="black") |         draw.text((5, 5), read_text(), font=font, fill="black") | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user