mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Saves work on a new management command to re-OCR a file
This commit is contained in:
		
							parent
							
								
									4f176682dc
								
							
						
					
					
						commit
						dfd16c5187
					
				
							
								
								
									
										69
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,69 @@
 | 
				
			|||||||
 | 
					import logging
 | 
				
			||||||
 | 
					import shutil
 | 
				
			||||||
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					from typing import Type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from django.core.exceptions import ObjectDoesNotExist
 | 
				
			||||||
 | 
					from django.core.management.base import BaseCommand
 | 
				
			||||||
 | 
					from documents.models import Document
 | 
				
			||||||
 | 
					from documents.parsers import DocumentParser
 | 
				
			||||||
 | 
					from documents.parsers import get_parser_class_for_mime_type
 | 
				
			||||||
 | 
					from documents.parsers import ParseError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Command(BaseCommand):
					    """Management command that re-parses documents and refreshes their text.

					    For each primary key given on the command line, the document's source
					    file is copied into the parser's temporary directory, parsed again, and
					    the text returned by the parser is saved as the document's ``content``.
					    """

					    help = """
					        This will re-run OCR on the given documents and update their
					        stored text content with the result.
					    """.replace(
					        "    ",
					        "",
					    )

					    def add_arguments(self, parser):
					        """Register the positional ``documents`` argument (one or more values)."""
					        parser.add_argument(
					            "documents",
					            nargs="+",
					            help="Document primary keys for re-processing OCR on",
					        )

					    def handle(self, *args, **options):
					        """Re-parse every requested document and save the new text.

					        Parsed command-line values arrive in ``options``; ``args`` is a plain
					        tuple and carries no ``documents`` attribute, so the primary keys are
					        read from ``options["documents"]``.

					        Unknown primary keys and parse failures are reported on stdout and
					        the command moves on to the next document.
					        """

					        # Quieten the first root handler, but do not assume one is configured.
					        root_handlers = logging.getLogger().handlers
					        if root_handlers:
					            root_handlers[0].setLevel(logging.ERROR)

					        all_docs = Document.objects.all()

					        for doc_pk in options["documents"]:
					            self.stdout.write(f"Parsing document {doc_pk}")
					            try:
					                doc: Document = all_docs.get(pk=doc_pk)
					            except ObjectDoesNotExist:
					                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
					                continue

					            # Get the correct parser for this mime type
					            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
					                doc.mime_type,
					            )
					            # NOTE(review): presumably the lookup yields None when no parser
					            # handles the mime type - confirm against documents.parsers.
					            if parser_class is None:
					                self.stdout.write(
					                    self.style.ERROR(
					                        f"No parser available for document {doc_pk} "
					                        f"({doc.mime_type})",
					                    ),
					                )
					                continue

					            document_parser: DocumentParser = parser_class(
					                "redo-ocr",
					            )

					            # Create a file path to copy the original file to for working on
					            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()

					            try:
					                # Copy inside the try so a partially written copy is still
					                # removed by the finally clause below.
					                shutil.copy(doc.source_path, temp_file)

					                # Try to re-parse the document into text
					                document_parser.parse(str(temp_file), doc.mime_type)

					                doc.content = document_parser.get_text()
					                doc.save()

					            except ParseError as e:
					                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
					            finally:
					                # Remove the working copy if it was created.
					                # NOTE(review): the parser's temporary directory itself is
					                # not removed here - confirm whether the parser needs an
					                # explicit cleanup call.
					                if temp_file.exists() and temp_file.is_file():
					                    temp_file.unlink()
 | 
				
			||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user