mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-25 15:52:35 -04:00 
			
		
		
		
	Merge pull request #1139 from paperless-ngx/feature-redo-ocr
Feature: Management command to redo OCR
This commit is contained in:
		
						commit
						4bea4c69a4
					
				| @ -66,23 +66,30 @@ | ||||
|   </div> | ||||
|   <div class="col-auto ms-auto mb-2 mb-xl-0 d-flex"> | ||||
|     <div class="btn-group btn-group-sm me-2"> | ||||
|       <button type="button" [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm" (click)="downloadSelected()"> | ||||
|         <svg *ngIf="!awaitingDownload" width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> | ||||
|           <use xlink:href="assets/bootstrap-icons.svg#download" /> | ||||
|         </svg> | ||||
|         <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> | ||||
|           <span class="visually-hidden">Preparing download...</span> | ||||
|         </div> | ||||
|           | ||||
|         <ng-container i18n>Download</ng-container> | ||||
|       </button> | ||||
|       <div class="btn-group" ngbDropdown role="group" aria-label="Button group with nested dropdown"> | ||||
|         <button [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm dropdown-toggle-split" ngbDropdownToggle></button> | ||||
|         <div class="dropdown-menu shadow" ngbDropdownMenu> | ||||
|           <button ngbDropdownItem i18n (click)="downloadSelected('originals')">Download originals</button> | ||||
| 
 | ||||
|       <div ngbDropdown class="me-2 d-flex"> | ||||
|         <button class="btn btn-sm btn-outline-primary" id="dropdownSelect" ngbDropdownToggle> | ||||
|           <svg class="toolbaricon" fill="currentColor"> | ||||
|             <use xlink:href="assets/bootstrap-icons.svg#three-dots" /> | ||||
|           </svg> | ||||
|           <div class="d-none d-sm-inline"> <ng-container i18n>Actions</ng-container></div> | ||||
|         </button> | ||||
|         <div ngbDropdownMenu aria-labelledby="dropdownSelect" class="shadow"> | ||||
|           <button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected()" i18n> | ||||
|             Download | ||||
|             <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> | ||||
|               <span class="visually-hidden">Preparing download...</span> | ||||
|             </div> | ||||
|           </button> | ||||
|           <button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected('originals')" i18n> | ||||
|             Download originals | ||||
|             <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> | ||||
|               <span class="visually-hidden">Preparing download...</span> | ||||
|             </div> | ||||
|           </button> | ||||
|           <button ngbDropdownItem (click)="redoOcrSelected()" i18n>Redo OCR</button> | ||||
|         </div> | ||||
|       </div> | ||||
|     </div> | ||||
| 
 | ||||
|     <button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()"> | ||||
|       <svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> | ||||
|  | ||||
| @ -379,4 +379,19 @@ export class BulkEditorComponent { | ||||
|         this.awaitingDownload = false | ||||
|       }) | ||||
|   } | ||||
| 
 | ||||
|   redoOcrSelected() { | ||||
|     let modal = this.modalService.open(ConfirmDialogComponent, { | ||||
|       backdrop: 'static', | ||||
|     }) | ||||
|     modal.componentInstance.title = $localize`Redo OCR confirm` | ||||
|     modal.componentInstance.messageBold = $localize`This operation will permanently redo OCR for ${this.list.selected.size} selected document(s).` | ||||
|     modal.componentInstance.message = $localize`This operation cannot be undone.` | ||||
|     modal.componentInstance.btnClass = 'btn-danger' | ||||
|     modal.componentInstance.btnCaption = $localize`Proceed` | ||||
|     modal.componentInstance.confirmClicked.subscribe(() => { | ||||
|       modal.componentInstance.buttonsEnabled = false | ||||
|       this.executeBulkOperation(modal, 'redo_ocr', {}) | ||||
|     }) | ||||
|   } | ||||
| } | ||||
|  | ||||
| @ -118,3 +118,10 @@ def delete(doc_ids): | ||||
|             index.remove_document_by_id(writer, id) | ||||
| 
 | ||||
|     return "OK" | ||||
| 
 | ||||
| 
 | ||||
def redo_ocr(doc_ids):
    """
    Hand the given documents to the ``documents.tasks.redo_ocr`` task.

    The document ids are passed to ``async_task`` as the ``document_ids``
    keyword argument.

    :param doc_ids: primary keys of the documents to process
    :return: the string ``"OK"``
    """
    task_path = "documents.tasks.redo_ocr"
    async_task(task_path, document_ids=doc_ids)
    return "OK"
|  | ||||
							
								
								
									
										35
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,35 @@ | ||||
| import tqdm | ||||
| from django.core.management.base import BaseCommand | ||||
| from documents.tasks import redo_ocr | ||||
| 
 | ||||
| 
 | ||||
class Command(BaseCommand):
    """Management command that re-runs OCR for the documents given by primary key."""

    # The help text previously described the filename-renaming command; it now
    # describes what this command does. replace() strips the source-level
    # indentation of the triple-quoted literal.
    help = """
        This will re-run OCR on the given documents and update their text content.
    """.replace(
        "    ",
        "",
    )

    def add_arguments(self, parser):
        """Register the ``--no-progress-bar`` flag and the document primary keys."""

        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Run ``redo_ocr`` over the requested documents, optionally with a progress bar."""
        # redo_ocr iterates over what it is given, so wrapping the primary
        # keys in tqdm advances the progress bar once per document.
        doc_pks = tqdm.tqdm(
            options["documents"],
            disable=options["no_progress_bar"],
        )
        redo_ocr(doc_pks)
| @ -323,6 +323,7 @@ class BulkEditSerializer(DocumentListSerializer): | ||||
|             "remove_tag", | ||||
|             "modify_tags", | ||||
|             "delete", | ||||
|             "redo_ocr", | ||||
|         ], | ||||
|         label="Method", | ||||
|         write_only=True, | ||||
| @ -356,6 +357,8 @@ class BulkEditSerializer(DocumentListSerializer): | ||||
|             return bulk_edit.modify_tags | ||||
|         elif method == "delete": | ||||
|             return bulk_edit.delete | ||||
|         elif method == "redo_ocr": | ||||
|             return bulk_edit.redo_ocr | ||||
|         else: | ||||
|             raise serializers.ValidationError("Unsupported method.") | ||||
| 
 | ||||
|  | ||||
| @ -1,10 +1,14 @@ | ||||
| import logging | ||||
| import os | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from typing import Type | ||||
| 
 | ||||
| import tqdm | ||||
| from asgiref.sync import async_to_sync | ||||
| from channels.layers import get_channel_layer | ||||
| from django.conf import settings | ||||
| from django.core.exceptions import ObjectDoesNotExist | ||||
| from django.db.models.signals import post_save | ||||
| from documents import barcodes | ||||
| from documents import index | ||||
| @ -18,6 +22,9 @@ from documents.models import Document | ||||
| from documents.models import DocumentType | ||||
| from documents.models import StoragePath | ||||
| from documents.models import Tag | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import get_parser_class_for_mime_type | ||||
| from documents.parsers import ParseError | ||||
| from documents.sanity_checker import SanityCheckFailedException | ||||
| from whoosh.writing import AsyncWriter | ||||
| 
 | ||||
| @ -198,3 +205,46 @@ def bulk_update_documents(document_ids): | ||||
|     with AsyncWriter(ix) as writer: | ||||
|         for doc in documents: | ||||
|             index.update_document(writer, doc) | ||||
| 
 | ||||
| 
 | ||||
def redo_ocr(document_ids):
    """
    Re-parse the given documents and store the freshly extracted text.

    For every primary key the document's source file is copied into the
    parser's working directory, parsed again, and the resulting text is
    saved to the document's ``content`` field. A primary key without a
    matching document, a mime type without a parser, or a ``ParseError``
    is logged and the remaining documents are still processed.

    :param document_ids: iterable of document primary keys
    """
    all_docs = Document.objects.all()

    for doc_pk in document_ids:
        try:
            logger.info(f"Parsing document {doc_pk}")
            doc: Document = all_docs.get(pk=doc_pk)
        except ObjectDoesNotExist:
            logger.error(f"Document {doc_pk} does not exist")
            continue

        # Get the correct parser for this mime type
        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
            doc.mime_type,
        )
        # NOTE(review): guards against the lookup yielding nothing for an
        # unsupported mime type, which would otherwise abort the whole batch
        # when called below - confirm against documents.parsers.
        if not parser_class:
            logger.error(
                f"No parser available for document {doc_pk} "
                f"with mime type {doc.mime_type}",
            )
            continue

        document_parser: DocumentParser = parser_class(
            "redo-ocr",
        )

        # Working copy of the original file inside the parser's directory
        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()

        try:
            # Copy inside the try block so that a failed copy still runs the
            # cleanup below; the error itself propagates as before.
            shutil.copy(doc.source_path, temp_file)

            logger.info(
                f"Using {type(document_parser).__name__} for document",
            )
            # Try to re-parse the document into text
            document_parser.parse(str(temp_file), doc.mime_type)

            doc.content = document_parser.get_text()
            doc.save()
            logger.info("Document OCR updated")

        except ParseError as e:
            logger.error(f"Error parsing document: {e}")
        finally:
            # Remove the working copy if it was created
            if temp_file.exists() and temp_file.is_file():
                temp_file.unlink()
            # Release the parser's own working files as well; previously the
            # parser was never torn down, leaking one directory per document.
            # NOTE(review): relies on DocumentParser.cleanup() - confirm.
            document_parser.cleanup()
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user