Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-11-03 19:17:13 -05:00)
			
		
		
		
	Style cleanups and removal of Python 2.7 compatibility leftovers
This commit is contained in:
		
parent 9cef689106
commit fb1da4834c
					
				@ -22,7 +22,7 @@ class ConsumerError(Exception):
 | 
				
			|||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Consumer(object):
 | 
					class Consumer:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Loop over every file found in CONSUMPTION_DIR and:
 | 
					    Loop over every file found in CONSUMPTION_DIR and:
 | 
				
			||||||
      1. Convert it to a greyscale pnm
 | 
					      1. Convert it to a greyscale pnm
 | 
				
			||||||
 | 
				
			|||||||
@ -52,15 +52,13 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        return os.path.join(self.tempdir, "convert-0000.png")
 | 
					        return os.path.join(self.tempdir, "convert-0000.png")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _is_ocred(self):
 | 
					    def _is_ocred(self):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Extract text from PDF using pdftotext
 | 
					        # Extract text from PDF using pdftotext
 | 
				
			||||||
        text = get_text_from_pdf(self.document_path)
 | 
					        text = get_text_from_pdf(self.document_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # We assume that a PDF with more than 50 characters contains text
 | 
					        # We assume that a PDF with more than 50 characters contains text
 | 
				
			||||||
        # (so no OCR required)
 | 
					        # (so no OCR required)
 | 
				
			||||||
        if len(text) > 50:
 | 
					        return len(text) > 50
 | 
				
			||||||
            return True
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        return False
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_text(self):
 | 
					    def get_text(self):
 | 
				
			||||||
        if self.TEXT_CACHE is not None:
 | 
					        if self.TEXT_CACHE is not None:
 | 
				
			||||||
@ -74,7 +72,6 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        images = self._get_greyscale()
 | 
					        images = self._get_greyscale()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
 | 
					 | 
				
			||||||
            self.TEXT_CACHE = self._get_ocr(images)
 | 
					            self.TEXT_CACHE = self._get_ocr(images)
 | 
				
			||||||
            return self.TEXT_CACHE
 | 
					            return self.TEXT_CACHE
 | 
				
			||||||
        except OCRError as e:
 | 
					        except OCRError as e:
 | 
				
			||||||
@ -262,6 +259,7 @@ def image_to_string(args):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_text_from_pdf(pdf_file):
 | 
					def get_text_from_pdf(pdf_file):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with open(pdf_file, "rb") as f:
 | 
					    with open(pdf_file, "rb") as f:
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            pdf = pdftotext.PDF(f)
 | 
					            pdf = pdftotext.PDF(f)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user