mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 08:12:34 -04:00 
			
		
		
		
	Merge pull request #1139 from paperless-ngx/feature-redo-ocr
Feature: Management command to redo OCR
This commit is contained in:
		
						commit
						4bea4c69a4
					
				| @ -66,23 +66,30 @@ | |||||||
|   </div> |   </div> | ||||||
|   <div class="col-auto ms-auto mb-2 mb-xl-0 d-flex"> |   <div class="col-auto ms-auto mb-2 mb-xl-0 d-flex"> | ||||||
|     <div class="btn-group btn-group-sm me-2"> |     <div class="btn-group btn-group-sm me-2"> | ||||||
|       <button type="button" [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm" (click)="downloadSelected()"> | 
 | ||||||
|         <svg *ngIf="!awaitingDownload" width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> |       <div ngbDropdown class="me-2 d-flex"> | ||||||
|           <use xlink:href="assets/bootstrap-icons.svg#download" /> |         <button class="btn btn-sm btn-outline-primary" id="dropdownSelect" ngbDropdownToggle> | ||||||
|         </svg> |           <svg class="toolbaricon" fill="currentColor"> | ||||||
|         <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> |             <use xlink:href="assets/bootstrap-icons.svg#three-dots" /> | ||||||
|           <span class="visually-hidden">Preparing download...</span> |           </svg> | ||||||
|         </div> |           <div class="d-none d-sm-inline"> <ng-container i18n>Actions</ng-container></div> | ||||||
|           |         </button> | ||||||
|         <ng-container i18n>Download</ng-container> |         <div ngbDropdownMenu aria-labelledby="dropdownSelect" class="shadow"> | ||||||
|       </button> |           <button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected()" i18n> | ||||||
|       <div class="btn-group" ngbDropdown role="group" aria-label="Button group with nested dropdown"> |             Download | ||||||
|         <button [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm dropdown-toggle-split" ngbDropdownToggle></button> |             <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> | ||||||
|         <div class="dropdown-menu shadow" ngbDropdownMenu> |               <span class="visually-hidden">Preparing download...</span> | ||||||
|           <button ngbDropdownItem i18n (click)="downloadSelected('originals')">Download originals</button> |             </div> | ||||||
|  |           </button> | ||||||
|  |           <button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected('originals')" i18n> | ||||||
|  |             Download originals | ||||||
|  |             <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> | ||||||
|  |               <span class="visually-hidden">Preparing download...</span> | ||||||
|  |             </div> | ||||||
|  |           </button> | ||||||
|  |           <button ngbDropdownItem (click)="redoOcrSelected()" i18n>Redo OCR</button> | ||||||
|         </div> |         </div> | ||||||
|       </div> |       </div> | ||||||
|     </div> |  | ||||||
| 
 | 
 | ||||||
|     <button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()"> |     <button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()"> | ||||||
|       <svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> |       <svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> | ||||||
|  | |||||||
| @ -379,4 +379,19 @@ export class BulkEditorComponent { | |||||||
|         this.awaitingDownload = false |         this.awaitingDownload = false | ||||||
|       }) |       }) | ||||||
|   } |   } | ||||||
|  | 
 | ||||||
|  |   redoOcrSelected() { | ||||||
|  |     let modal = this.modalService.open(ConfirmDialogComponent, { | ||||||
|  |       backdrop: 'static', | ||||||
|  |     }) | ||||||
|  |     modal.componentInstance.title = $localize`Redo OCR confirm` | ||||||
|  |     modal.componentInstance.messageBold = $localize`This operation will permanently redo OCR for ${this.list.selected.size} selected document(s).` | ||||||
|  |     modal.componentInstance.message = $localize`This operation cannot be undone.` | ||||||
|  |     modal.componentInstance.btnClass = 'btn-danger' | ||||||
|  |     modal.componentInstance.btnCaption = $localize`Proceed` | ||||||
|  |     modal.componentInstance.confirmClicked.subscribe(() => { | ||||||
|  |       modal.componentInstance.buttonsEnabled = false | ||||||
|  |       this.executeBulkOperation(modal, 'redo_ocr', {}) | ||||||
|  |     }) | ||||||
|  |   } | ||||||
| } | } | ||||||
|  | |||||||
| @ -118,3 +118,10 @@ def delete(doc_ids): | |||||||
|             index.remove_document_by_id(writer, id) |             index.remove_document_by_id(writer, id) | ||||||
| 
 | 
 | ||||||
|     return "OK" |     return "OK" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def redo_ocr(doc_ids): | ||||||
|  | 
 | ||||||
|  |     async_task("documents.tasks.redo_ocr", document_ids=doc_ids) | ||||||
|  | 
 | ||||||
|  |     return "OK" | ||||||
|  | |||||||
							
								
								
									
										35
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,35 @@ | |||||||
|  | import tqdm | ||||||
|  | from django.core.management.base import BaseCommand | ||||||
|  | from documents.tasks import redo_ocr | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Command(BaseCommand): | ||||||
|  | 
 | ||||||
|  |     help = """ | ||||||
|  |         This will re-run OCR on the given documents and update their stored text content. | ||||||
|  |     """.replace( | ||||||
|  |         "    ", | ||||||
|  |         "", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     def add_arguments(self, parser): | ||||||
|  | 
 | ||||||
|  |         parser.add_argument( | ||||||
|  |             "--no-progress-bar", | ||||||
|  |             default=False, | ||||||
|  |             action="store_true", | ||||||
|  |             help="If set, the progress bar will not be shown", | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         parser.add_argument( | ||||||
|  |             "documents", | ||||||
|  |             nargs="+", | ||||||
|  |             help="Document primary keys for re-processing OCR on", | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def handle(self, *args, **options): | ||||||
|  |         doc_pks = tqdm.tqdm( | ||||||
|  |             options["documents"], | ||||||
|  |             disable=options["no_progress_bar"], | ||||||
|  |         ) | ||||||
|  |         redo_ocr(doc_pks) | ||||||
| @ -323,6 +323,7 @@ class BulkEditSerializer(DocumentListSerializer): | |||||||
|             "remove_tag", |             "remove_tag", | ||||||
|             "modify_tags", |             "modify_tags", | ||||||
|             "delete", |             "delete", | ||||||
|  |             "redo_ocr", | ||||||
|         ], |         ], | ||||||
|         label="Method", |         label="Method", | ||||||
|         write_only=True, |         write_only=True, | ||||||
| @ -356,6 +357,8 @@ class BulkEditSerializer(DocumentListSerializer): | |||||||
|             return bulk_edit.modify_tags |             return bulk_edit.modify_tags | ||||||
|         elif method == "delete": |         elif method == "delete": | ||||||
|             return bulk_edit.delete |             return bulk_edit.delete | ||||||
|  |         elif method == "redo_ocr": | ||||||
|  |             return bulk_edit.redo_ocr | ||||||
|         else: |         else: | ||||||
|             raise serializers.ValidationError("Unsupported method.") |             raise serializers.ValidationError("Unsupported method.") | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,10 +1,14 @@ | |||||||
| import logging | import logging | ||||||
| import os | import os | ||||||
|  | import shutil | ||||||
|  | from pathlib import Path | ||||||
|  | from typing import Type | ||||||
| 
 | 
 | ||||||
| import tqdm | import tqdm | ||||||
| from asgiref.sync import async_to_sync | from asgiref.sync import async_to_sync | ||||||
| from channels.layers import get_channel_layer | from channels.layers import get_channel_layer | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
|  | from django.core.exceptions import ObjectDoesNotExist | ||||||
| from django.db.models.signals import post_save | from django.db.models.signals import post_save | ||||||
| from documents import barcodes | from documents import barcodes | ||||||
| from documents import index | from documents import index | ||||||
| @ -18,6 +22,9 @@ from documents.models import Document | |||||||
| from documents.models import DocumentType | from documents.models import DocumentType | ||||||
| from documents.models import StoragePath | from documents.models import StoragePath | ||||||
| from documents.models import Tag | from documents.models import Tag | ||||||
|  | from documents.parsers import DocumentParser | ||||||
|  | from documents.parsers import get_parser_class_for_mime_type | ||||||
|  | from documents.parsers import ParseError | ||||||
| from documents.sanity_checker import SanityCheckFailedException | from documents.sanity_checker import SanityCheckFailedException | ||||||
| from whoosh.writing import AsyncWriter | from whoosh.writing import AsyncWriter | ||||||
| 
 | 
 | ||||||
| @ -198,3 +205,46 @@ def bulk_update_documents(document_ids): | |||||||
|     with AsyncWriter(ix) as writer: |     with AsyncWriter(ix) as writer: | ||||||
|         for doc in documents: |         for doc in documents: | ||||||
|             index.update_document(writer, doc) |             index.update_document(writer, doc) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def redo_ocr(document_ids): | ||||||
|  |     all_docs = Document.objects.all() | ||||||
|  | 
 | ||||||
|  |     for doc_pk in document_ids: | ||||||
|  |         try: | ||||||
|  |             logger.info(f"Parsing document {doc_pk}") | ||||||
|  |             doc: Document = all_docs.get(pk=doc_pk) | ||||||
|  |         except ObjectDoesNotExist: | ||||||
|  |             logger.error(f"Document {doc_pk} does not exist") | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         # Get the correct parser for this mime type | ||||||
|  |         parser_class: Type[DocumentParser] = get_parser_class_for_mime_type( | ||||||
|  |             doc.mime_type, | ||||||
|  |         ) | ||||||
|  |         document_parser: DocumentParser = parser_class( | ||||||
|  |             "redo-ocr", | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # Build a path inside the parser's temp directory to hold a working copy of the original file | ||||||
|  |         temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve() | ||||||
|  | 
 | ||||||
|  |         shutil.copy(doc.source_path, temp_file) | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             logger.info( | ||||||
|  |                 f"Using {type(document_parser).__name__} for document", | ||||||
|  |             ) | ||||||
|  |             # Try to re-parse the document into text | ||||||
|  |             document_parser.parse(str(temp_file), doc.mime_type) | ||||||
|  | 
 | ||||||
|  |             doc.content = document_parser.get_text() | ||||||
|  |             doc.save() | ||||||
|  |             logger.info("Document OCR updated") | ||||||
|  | 
 | ||||||
|  |         except ParseError as e: | ||||||
|  |             logger.error(f"Error parsing document: {e}") | ||||||
|  |         finally: | ||||||
|  |             # Delete the temporary working copy if it was created | ||||||
|  |             if temp_file.exists() and temp_file.is_file(): | ||||||
|  |                 temp_file.unlink() | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user