Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError
2025-12-21 20:37:24 -05:00 · 2016-11-27 15:06:45 +00:00 · 2016-11-27 15:06:45 +00:00 · 18495ce9da
commit 18495ce9da
parent b88e0fd902
3 changed files with 65 additions and 21 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -1,33 +1,31 @@
-import datetime
-import hashlib
-import logging
-import tempfile
-import uuid
-
-from multiprocessing.pool import Pool
-
-import itertools
-
-import langdetect
 import os
 import re
+import uuid
+import shutil
+import hashlib
+import logging
+import datetime
+import tempfile
+import itertools
 import subprocess
+from multiprocessing.pool import Pool

 import pyocr
-import shutil
-
+import langdetect
 from PIL import Image
-
 from django.conf import settings
 from django.utils import timezone
-from pyocr.tesseract import TesseractError
-
 from paperless.db import GnuPG
+from pyocr.tesseract import TesseractError
+from pyocr.libtesseract.tesseract_raw import \
+    TesseractError as OtherTesseractError

 from .models import Tag, Document, FileInfo
-from .languages import ISO639
 from .signals import (
-    document_consumption_started, document_consumption_finished)
+    document_consumption_started,
+    document_consumption_finished
+)
+from .languages import ISO639


 class OCRError(Exception):
@ -381,7 +379,7 @@ def image_to_string(args):
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
-            except TesseractError:
+            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)

--- a/src/documents/tests/samples/no-text.png
+++ b/src/documents/tests/samples/no-text.png
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@ -1,7 +1,13 @@
-from django.test import TestCase
+import os
+from unittest import mock, skipIf
+
+import pyocr
+from django.test import TestCase
+from pyocr.libtesseract.tesseract_raw import \
+    TesseractError as OtherTesseractError

-from ..consumer import strip_excess_whitespace
 from ..models import FileInfo
+from ..consumer import image_to_string, strip_excess_whitespace


 class TestAttributes(TestCase):
@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase):
                            template.format(**spec), **spec)


+class FakeTesseract(object):
+    """
+    Test double for a pyocr OCR tool.  Orientation detection always fails
+    with the libtesseract variant of TesseractError, while text extraction
+    returns a fixed string.
+    """
+
+    @staticmethod
+    def can_detect_orientation():
+        # Always claims support; presumably this is what makes the consumer
+        # attempt detect_orientation() -- confirm against image_to_string().
+        return True
+
+    @staticmethod
+    def detect_orientation(file_handle, lang):
+        # Unconditionally raise the exception imported from
+        # pyocr.libtesseract.tesseract_raw; the arguments are placeholders.
+        raise OtherTesseractError("arbitrary status", "message")
+
+    @staticmethod
+    def image_to_string(file_handle, lang):
+        # Canned OCR output; both arguments are ignored.
+        return "This is test text"
+
+
+class FakePyOcr(object):
+    """
+    Test double patched in place of ``documents.consumer.pyocr``.  The only
+    tool it reports as available is FakeTesseract.
+    """
+
+    @staticmethod
+    def get_available_tools():
+        return [FakeTesseract]
+
+
 class TestOCR(TestCase):

    text_cases = [
@ -317,6 +345,9 @@ class TestOCR(TestCase):
            "utf-8 строка с пробелами в конце"
        )
    ]
+    
+    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
+    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
@ -330,3 +361,18 @@ class TestOCR(TestCase):
                    actual_result
                )
            )
+
+    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
+    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
+    @mock.patch("documents.consumer.pyocr", FakePyOcr)
+    def test_image_to_string_with_text_free_page(self):
+        """
+        This test is sort of silly, since it's really just reproducing an odd
+        exception thrown by pyocr when it encounters a page with no text.
+        Actually running this test against an installation of Tesseract results
+        in a segmentation fault rooted somewhere deep inside pyocr where I
+        don't care to dig.  Regardless, if you run the consumer normally,
+        text-free pages are now handled correctly so long as we work around
+        this weird exception.
+        """
+        # Use the text-free sample shipped in the samples directory
+        # (samples/no-text.png); SCRATCH is patched to point there.
+        image_to_string(["no-text.png", "en"])