mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	Import ocrmypdf locally so that the webserver does not load it
This commit is contained in:
		
							parent
							
								
									416101d557
								
							
						
					
					
						commit
						56bd966c02
					
				@ -2,12 +2,8 @@ import json
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import ocrmypdf
 | 
					 | 
				
			||||||
import pdftotext
 | 
					 | 
				
			||||||
import pikepdf
 | 
					 | 
				
			||||||
from PIL import Image
 | 
					from PIL import Image
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
from ocrmypdf import InputFileError, EncryptedPdfError
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from documents.parsers import DocumentParser, ParseError, \
 | 
					from documents.parsers import DocumentParser, ParseError, \
 | 
				
			||||||
    make_thumbnail_from_pdf
 | 
					    make_thumbnail_from_pdf
 | 
				
			||||||
@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
    logging_name = "paperless.parsing.tesseract"
 | 
					    logging_name = "paperless.parsing.tesseract"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def extract_metadata(self, document_path, mime_type):
 | 
					    def extract_metadata(self, document_path, mime_type):
 | 
				
			||||||
 | 
					        import pikepdf
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
 | 
					        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        result = []
 | 
					        result = []
 | 
				
			||||||
@ -91,6 +89,9 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            return None
 | 
					            return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def parse(self, document_path, mime_type, file_name=None):
 | 
					    def parse(self, document_path, mime_type, file_name=None):
 | 
				
			||||||
 | 
					        import ocrmypdf
 | 
				
			||||||
 | 
					        from ocrmypdf import InputFileError, EncryptedPdfError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        mode = settings.OCR_MODE
 | 
					        mode = settings.OCR_MODE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        text_original = get_text_from_pdf(document_path)
 | 
					        text_original = get_text_from_pdf(document_path)
 | 
				
			||||||
@ -223,6 +224,7 @@ def strip_excess_whitespace(text):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_text_from_pdf(pdf_file):
 | 
					def get_text_from_pdf(pdf_file):
 | 
				
			||||||
 | 
					    import pdftotext
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not os.path.isfile(pdf_file):
 | 
					    if not os.path.isfile(pdf_file):
 | 
				
			||||||
        return None
 | 
					        return None
 | 
				
			||||||
 | 
				
			|||||||
@ -164,17 +164,12 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self.assertRaises(ParseError, f)
 | 
					        self.assertRaises(ParseError, f)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
 | 
					    def test_image_calc_a4_dpi(self):
 | 
				
			||||||
    def test_image_calc_a4_dpi(self, m):
 | 
					 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
 | 
					        dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        m.assert_called_once()
 | 
					        self.assertEqual(dpi, 62)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        args, kwargs = m.call_args
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self.assertEqual(kwargs['image_dpi'], 62)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
 | 
					    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
 | 
				
			||||||
    def test_image_dpi_fail(self, m):
 | 
					    def test_image_dpi_fail(self, m):
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user