mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	Try a new way of extracting text from a given PDF file
This commit is contained in:
		
							parent
							
								
									da38efebdf
								
							
						
					
					
						commit
						7be9ae9c02
					
				@ -2,6 +2,7 @@ import json
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import subprocess
 | 
					import subprocess
 | 
				
			||||||
 | 
					import tempfile
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from typing import Optional
 | 
					from typing import Optional
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        if not os.path.isfile(pdf_file):
 | 
					        if not os.path.isfile(pdf_file):
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        from pdfminer.high_level import extract_text as pdfminer_extract_text
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            stripped = post_process_text(pdfminer_extract_text(pdf_file))
 | 
					            text = None
 | 
				
			||||||
 | 
					            with tempfile.NamedTemporaryFile(
 | 
				
			||||||
 | 
					                mode="w+",
 | 
				
			||||||
 | 
					                dir=settings.SCRATCH_DIR,
 | 
				
			||||||
 | 
					            ) as tmp:
 | 
				
			||||||
 | 
					                subprocess.run(
 | 
				
			||||||
 | 
					                    [
 | 
				
			||||||
 | 
					                        "pdftotext",
 | 
				
			||||||
 | 
					                        "-q",
 | 
				
			||||||
 | 
					                        "-layout",
 | 
				
			||||||
 | 
					                        "-enc",
 | 
				
			||||||
 | 
					                        "UTF-8",
 | 
				
			||||||
 | 
					                        pdf_file,
 | 
				
			||||||
 | 
					                        tmp.name,
 | 
				
			||||||
 | 
					                    ],
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                text = tmp.read()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.log("debug", f"Extracted text from PDF file {pdf_file}")
 | 
					            return post_process_text(text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # pdfminer.six does not handle RTL text
 | 
					 | 
				
			||||||
            # as a hack, for some languages, return no text, to force
 | 
					 | 
				
			||||||
            # OCRMyPdf/Tesseract do handle this correctly
 | 
					 | 
				
			||||||
            from langdetect import detect
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            lang = detect(stripped)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            self.log("debug", f"Detected language {lang}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if (
 | 
					 | 
				
			||||||
                lang
 | 
					 | 
				
			||||||
                in {
 | 
					 | 
				
			||||||
                    "ar",  # Arabic
 | 
					 | 
				
			||||||
                    "he",  # Hebrew,
 | 
					 | 
				
			||||||
                    "fa",  # Persian
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
                and pdf_file.name != "archive-fallback.pdf"
 | 
					 | 
				
			||||||
            ):
 | 
					 | 
				
			||||||
                raise RtlLanguageException()
 | 
					 | 
				
			||||||
            return stripped
 | 
					 | 
				
			||||||
        except RtlLanguageException:
 | 
					 | 
				
			||||||
            self.log("warning", f"Detected RTL language {lang}")
 | 
					 | 
				
			||||||
            return None
 | 
					 | 
				
			||||||
        except Exception:
 | 
					        except Exception:
 | 
				
			||||||
            # TODO catch all for various issues with PDFminer.six.
 | 
					            # TODO catch all for various issues with PDFminer.six.
 | 
				
			||||||
            #  If PDFminer fails, fall back to OCR.
 | 
					            #  If PDFminer fails, fall back to OCR.
 | 
				
			||||||
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            )
 | 
					            )
 | 
				
			||||||
            if original_has_text:
 | 
					            if original_has_text:
 | 
				
			||||||
                self.text = text_original
 | 
					                self.text = text_original
 | 
				
			||||||
        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
 | 
					        except (NoTextFoundException, InputFileError) as e:
 | 
				
			||||||
            self.log(
 | 
					            self.log(
 | 
				
			||||||
                "warning",
 | 
					                "warning",
 | 
				
			||||||
                f"Encountered an error while running OCR: {str(e)}. "
 | 
					                f"Encountered an error while running OCR: {str(e)}. "
 | 
				
			||||||
 | 
				
			|||||||
@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
            - Text from the document is extracted
 | 
					            - Text from the document is extracted
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
        with mock.patch.object(
 | 
					 | 
				
			||||||
            parser,
 | 
					 | 
				
			||||||
            "construct_ocrmypdf_parameters",
 | 
					 | 
				
			||||||
            wraps=parser.construct_ocrmypdf_parameters,
 | 
					 | 
				
			||||||
        ) as wrapped:
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            parser.parse(
 | 
					        parser.parse(
 | 
				
			||||||
                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
 | 
					            os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
 | 
				
			||||||
                "application/pdf",
 | 
					            "application/pdf",
 | 
				
			||||||
            )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # There isn't a good way to actually check this working, with RTL correctly return
 | 
					        # Copied from the PDF to here.  Don't even look at it
 | 
				
			||||||
            #  as it would require tesseract-ocr-ara installed for everyone running the
 | 
					        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
 | 
				
			||||||
            #  test suite.  This test does provide the coverage though and attempts to ensure
 | 
					 | 
				
			||||||
            # the force OCR happens
 | 
					 | 
				
			||||||
            self.assertIsNotNone(parser.get_text())
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
 | 
					 | 
				
			||||||
            # Check the last call kwargs
 | 
					 | 
				
			||||||
            self.assertTrue(
 | 
					 | 
				
			||||||
                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestParserFileTypes(DirectoriesMixin, TestCase):
 | 
					class TestParserFileTypes(DirectoriesMixin, TestCase):
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user