Merge pull request #1008 from paperless-ngx/bugfix-max-pixel-setting

Bugfix: Corrects the setting of max pixel size for OCR
2025-12-13 16:45:07 -05:00 · 2022-05-26 09:12:24 -07:00 · 2022-05-26 09:12:24 -07:00 · a4927477fb
commit a4927477fb
parent d0a6c6a2f3 985b774378
4 changed files with 40 additions and 17 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
    the produced PDF documents are A4 sized.
 PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
-    Paperless will not OCR images that have more pixels than this limit.
+    Paperless will raise a warning when OCRing images which are over this limit and
-    This is intended to prevent decompression bombs from overloading paperless.
+    will not OCR images which are more than twice this limit.  Note this does not
-    Increasing this limit is desired if you face a DecompressionBombError despite
+    prevent the document from being consumed, but could result in missing text content.
-    the concerning file not being malicious; this could e.g. be caused by invalidly
+
-    recognized metadata.
+    If unset, will default to the value determined by
-    If you have enough resources or if you are certain that your uploaded files
+    `Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.
-    are not malicious you can increase this value to your needs.
+
-    The default value is 256000000, an image with more pixels than that would not be parsed.
+    .. note::
        Increasing this limit could cause Paperless to consume additional resources
        when consuming a file.  Be sure you have sufficient system resources.
    .. caution::
        The limit is intended to prevent malicious files from consuming system resources
        and causing crashes and other errors.  Only increase this value if you are certain
        your documents are not malicious and you need the text which was not OCRed
 PAPERLESS_OCR_USER_ARGS=<json>
    OCRmyPDF offers many more options. Use this parameter to specify any
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -5,6 +5,7 @@ import multiprocessing
 import os
 import re
 from typing import Final
 from typing import Optional
 from typing import Set
 from urllib.parse import urlparse
@@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
    os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
 )
-OCR_MAX_IMAGE_PIXELS = os.environ.get(
+OCR_MAX_IMAGE_PIXELS: Optional[int] = None
-    "PAPERLESS_OCR_MAX_IMAGE_PIXELS",
+if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
-    256000000,
+    OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
 )
 OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image
 Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
 class NoTextFoundException(Exception):
    pass
@@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
                    f"they will not be used. Error: {e}",
                )
        if settings.OCR_MAX_IMAGE_PIXELS is not None:
            # Convert pixels to mega-pixels and provide to ocrmypdf
            max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
            if max_pixels_mpixels > 0:
                self.log(
                    "debug",
                    f"Calculated {max_pixels_mpixels} megapixels for OCR",
                )
                ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
            else:
                self.log(
                    "warning",
                    "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
                    "this value must be at least 1 megapixel if set",
                )
        return ocrmypdf_args
    def parse(self, document_path, mime_type, file_name=None):
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -6,8 +6,6 @@ from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont
 Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
 class TextDocumentParser(DocumentParser):
    """
@@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
-            layout_engine=ImageFont.LAYOUT_BASIC,
+            layout_engine=ImageFont.Layout.BASIC,
        )
        draw.text((5, 5), read_text(), font=font, fill="black")