Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-11-04 03:27:12 -05:00)

Commit f4cebda085 (parent 6f3d25d7b1): A handy script to redo OCR on all documents.

src/documents/consumer.py
@@ -12,9 +12,8 @@ from django.utils import timezone
 from paperless.db import GnuPG
 from .classifier import DocumentClassifier
 from .models import Document, FileInfo
-from .parsers import ParseError
+from .parsers import ParseError, get_parser_class
 from .signals import (
-    document_consumer_declaration,
     document_consumption_finished,
     document_consumption_started
 )
@@ -61,15 +60,6 @@ class Consumer:
             raise ConsumerError(
                 "Consumption directory {} does not exist".format(self.consume))
 
-        self.parsers = []
-        for response in document_consumer_declaration.send(self):
-            self.parsers.append(response[1])
-
-        if not self.parsers:
-            raise ConsumerError(
-                "No parsers could be found, not even the default.  "
-                "This is a problem."
-            )
 
     def log(self, level, message):
         getattr(self.logger, level)(message, extra={
@@ -82,6 +72,8 @@ class Consumer:
         Return True if file was consumed
         """
 
+        self.logging_group = uuid.uuid4()
+
         if not re.match(FileInfo.REGEXES["title"], file):
             return False
 
@@ -96,13 +88,13 @@ class Consumer:
 
         self.log("info", "Consuming {}".format(doc))
 
-        parser_class = self._get_parser_class(doc)
+        parser_class = get_parser_class(doc)
         if not parser_class:
             self.log(
                 "error", "No parsers could be found for {}".format(doc))
             return False
-
-        self.logging_group = uuid.uuid4()
+        else:
+            self.log("info", "Parser: {}".format(parser_class.__name__))
 
 
         document_consumption_started.send(
@@ -114,6 +106,7 @@ class Consumer:
         document_parser = parser_class(doc, self.logging_group)
 
         try:
+            self.log("info", "Generating thumbnail for {}...".format(doc))
             thumbnail = document_parser.get_optimised_thumbnail()
             date = document_parser.get_date()
             document = self._store(
@@ -154,31 +147,6 @@ class Consumer:
             )
             return True
 
-    def _get_parser_class(self, doc):
-        """
-        Determine the appropriate parser class based on the file
-        """
-
-        options = []
-        for parser in self.parsers:
-            result = parser(doc)
-            if result:
-                options.append(result)
-
-        self.log(
-            "info",
-            "Parsers available: {}".format(
-                ", ".join([str(o["parser"].__name__) for o in options])
-            )
-        )
-
-        if not options:
-            return None
-
-        # Return the parser with the highest weight.
-        return sorted(
-            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
-
     def _store(self, text, doc, thumbnail, date):
 
         file_info = FileInfo.from_path(doc)
@@ -211,10 +179,9 @@ class Consumer:
         self._write(document, doc, document.source_path)
         self._write(document, thumbnail, document.thumbnail_path)
 
+        #TODO: why do we need to save the document again?
         document.save()
 
-        self.log("debug", "Completed")
-
         return document
 
     def _write(self, document, source, target):

src/documents/management/commands/document_rerun_ocr.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+import argparse
+import threading
+from multiprocessing import Pool
+from multiprocessing.pool import ThreadPool
+
+from django.core.management.base import BaseCommand
+
+from documents.consumer import Consumer
+from documents.models import Log, Document
+from documents.parsers import get_parser_class
+
+
+def process_document(doc):
+    parser_class = get_parser_class(doc.file_name)
+    if not parser_class:
+        print("no parser available")
+    else:
+        print("Parser: {}".format(parser_class.__name__))
+        parser = parser_class(doc.source_path, None)
+        try:
+            text = parser.get_text()
+            doc.content = text
+            doc.save()
+        finally:
+            parser.cleanup()
+
+
+def document_index(value):
+    ivalue = int(value)
+    if not (1 <= ivalue <= Document.objects.count()):
+        raise argparse.ArgumentTypeError(
+            "{} is not a valid document index (out of range)".format(value))
+
+    return ivalue
+
+
+class Command(BaseCommand):
+
+    help = "Performs OCR on all documents again!"
+
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-s", "--start_index",
+            default=None,
+            type=document_index
+        )
+
+    def handle(self, *args, **options):
+
+        docs = Document.objects.all().order_by("added")
+
+        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
+
+        for i in indices:
+            doc = docs[i]
+            print("==================================")
+            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
+            print("==================================")
+            process_document(doc)
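
The file above is a standard Django management command, so once deployed it can be run as "./manage.py document_rerun_ocr", optionally with -s/--start_index to resume part-way through the document queue. A minimal usage sketch follows (assuming a configured Django environment, e.g. a ./manage.py shell session; the snippet is illustrative and not part of the commit):

    # Re-run OCR over every document, starting at the 25th-oldest one.
    # Equivalent to: ./manage.py document_rerun_ocr --start_index 25
    from django.core.management import call_command

    call_command("document_rerun_ocr", start_index=25)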

src/documents/parsers.py
@@ -20,6 +20,8 @@ from django.utils import timezone
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
+from documents.signals import document_consumer_declaration
+
 DATE_REGEX = re.compile(
     r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
     r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
@@ -32,6 +34,31 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger(__name__)
 
 
+def get_parser_class(doc):
+    """
+    Determine the appropriate parser class based on the file
+    """
+
+    parsers = []
+    for response in document_consumer_declaration.send(None):
+        parsers.append(response[1])
+
+    #TODO: add a check that checks parser availability.
+
+    options = []
+    for parser in parsers:
+        result = parser(doc)
+        if result:
+            options.append(result)
+
+    if not options:
+        return None
+
+    # Return the parser with the highest weight.
+    return sorted(
+        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
+
+
 def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
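
For readers tracing how get_parser_class() uses the signal: each receiver connected to document_consumer_declaration returns a callable, that callable is invoked with the file path, and it is expected to answer either None or a dict carrying a "parser" class and a numeric "weight", with the highest weight winning. A hypothetical declaration (every name below is illustrative, not something this commit defines) could look like:

    # Illustrative sketch of a consumer declaration matching the shape
    # get_parser_class() expects; class and function names are made up.
    class ExamplePdfParser:
        # A real parser would provide get_text(), get_date(),
        # get_optimised_thumbnail() and cleanup(), since the consumer and
        # the new document_rerun_ocr command call those methods.
        pass


    def example_handler(path):
        # Called by get_parser_class() with the file path; return None to
        # decline, or a dict naming the parser class and its priority weight.
        if path.lower().endswith(".pdf"):
            return {"parser": ExamplePdfParser, "weight": 0}
        return None


    def example_consumer_declaration(sender, **kwargs):
        # Receiver for documents.signals.document_consumer_declaration;
        # Signal.send() collects (receiver, return_value) pairs and
        # get_parser_class() keeps return_value, i.e. the handler above.
        return example_handler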