Implements a better re-do of OCR by making the document archiver function common. Actually creates updated file now

commit ab761e837c
parent fccea022fa
(mirror of https://github.com/paperless-ngx/paperless-ngx.git)
@@ -122,6 +122,10 @@ def delete(doc_ids):
 
 def redo_ocr(doc_ids):
-    async_task("documents.tasks.redo_ocr", document_ids=doc_ids)
+    for document_id in doc_ids:
+        async_task(
+            "documents.tasks.update_document_archive_file",
+            document_id=document_id,
+        )
 
     return "OK"
 
@@ -1,85 +1,18 @@
-import hashlib
 import logging
 import multiprocessing
 import os
-import shutil
-import uuid
 
 import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
-from django.db import transaction
 from documents.models import Document
-from filelock import FileLock
-
-from ... import index
-from ...file_handling import create_source_path_directory
-from ...file_handling import generate_unique_filename
-from ...parsers import get_parser_class_for_mime_type
+from documents.tasks import update_document_archive_file
 
 
 logger = logging.getLogger("paperless.management.archiver")
 
 
-def handle_document(document_id):
-    document = Document.objects.get(id=document_id)
-
-    mime_type = document.mime_type
-
-    parser_class = get_parser_class_for_mime_type(mime_type)
-
-    if not parser_class:
-        logger.error(
-            f"No parser found for mime type {mime_type}, cannot "
-            f"archive document {document} (ID: {document_id})",
-        )
-        return
-
-    parser = parser_class(logging_group=uuid.uuid4())
-
-    try:
-        parser.parse(document.source_path, mime_type, document.get_public_filename())
-
-        thumbnail = parser.get_thumbnail(
-            document.source_path,
-            mime_type,
-            document.get_public_filename(),
-        )
-
-        if parser.get_archive_path():
-            with transaction.atomic():
-                with open(parser.get_archive_path(), "rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
-                # I'm going to save first so that in case the file move
-                # fails, the database is rolled back.
-                # We also don't use save() since that triggers the filehandling
-                # logic, and we don't want that yet (file not yet in place)
-                document.archive_filename = generate_unique_filename(
-                    document,
-                    archive_filename=True,
-                )
-                Document.objects.filter(pk=document.pk).update(
-                    archive_checksum=checksum,
-                    content=parser.get_text(),
-                    archive_filename=document.archive_filename,
-                )
-                with FileLock(settings.MEDIA_LOCK):
-                    create_source_path_directory(document.archive_path)
-                    shutil.move(parser.get_archive_path(), document.archive_path)
-                    shutil.move(thumbnail, document.thumbnail_path)
-
-            with index.open_index_writer() as writer:
-                index.update_document(writer, document)
-
-    except Exception:
-        logger.exception(
-            f"Error while parsing document {document} " f"(ID: {document_id})",
-        )
-    finally:
-        parser.cleanup()
-
-
 class Command(BaseCommand):
 
     help = """
@@ -146,7 +79,7 @@ class Command(BaseCommand):
             with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                 list(
                     tqdm.tqdm(
-                        pool.imap_unordered(handle_document, document_ids),
+                        pool.imap_unordered(update_document_archive_file, document_ids),
                         total=len(document_ids),
                         disable=options["no_progress_bar"],
                     ),
@@ -1,35 +0,0 @@
-import tqdm
-from django.core.management.base import BaseCommand
-from documents.tasks import redo_ocr
-
-
-class Command(BaseCommand):
-
-    help = """
-        This will rename all documents to match the latest filename format.
-    """.replace(
-        "    ",
-        "",
-    )
-
-    def add_arguments(self, parser):
-
-        parser.add_argument(
-            "--no-progress-bar",
-            default=False,
-            action="store_true",
-            help="If set, the progress bar will not be shown",
-        )
-
-        parser.add_argument(
-            "documents",
-            nargs="+",
-            help="Document primary keys for re-processing OCR on",
-        )
-
-    def handle(self, *args, **options):
-        doc_pks = tqdm.tqdm(
-            options["documents"],
-            disable=options["no_progress_bar"],
-        )
-        redo_ocr(doc_pks)
@@ -1,6 +1,8 @@
+import hashlib
 import logging
 import os
 import shutil
+import uuid
 from pathlib import Path
 from typing import Type
 
@@ -8,7 +10,7 @@ import tqdm
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
-from django.core.exceptions import ObjectDoesNotExist
+from django.db import transaction
 from django.db.models.signals import post_save
 from documents import barcodes
 from documents import index
@@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
+from documents.file_handling import create_source_path_directory
+from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
 from documents.models import Document
 from documents.models import DocumentType
@@ -24,8 +28,8 @@ from documents.models import StoragePath
 from documents.models import Tag
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
 from documents.sanity_checker import SanityCheckFailedException
+from filelock import FileLock
 from whoosh.writing import AsyncWriter
 
 
@@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
             index.update_document(writer, doc)
 
 
-def redo_ocr(document_ids):
-    all_docs = Document.objects.all()
+def update_document_archive_file(document_id):
+    """
+    Re-creates the archive file of a document, including new OCR content and thumbnail
+    """
+    document = Document.objects.get(id=document_id)
 
-    for doc_pk in document_ids:
-        try:
-            logger.info(f"Parsing document {doc_pk}")
-            doc: Document = all_docs.get(pk=doc_pk)
-        except ObjectDoesNotExist:
-            logger.error(f"Document {doc_pk} does not exist")
-            continue
+    mime_type = document.mime_type
 
-        # Get the correct parser for this mime type
-        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-            doc.mime_type,
-        )
-        document_parser: DocumentParser = parser_class(
-            "redo-ocr",
-        )
+    parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+
+    if not parser_class:
+        logger.error(
+            f"No parser found for mime type {mime_type}, cannot "
+            f"archive document {document} (ID: {document_id})",
+        )
+        return
 
-        # Create a file path to copy the original file to for working on
-        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
-
-        shutil.copy(doc.source_path, temp_file)
+    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
 
-        try:
-            logger.info(
-                f"Using {type(document_parser).__name__} for document",
-            )
-            # Try to re-parse the document into text
-            document_parser.parse(str(temp_file), doc.mime_type)
-
-            doc.content = document_parser.get_text()
-            doc.save()
-            logger.info("Document OCR updated")
+    try:
+        parser.parse(document.source_path, mime_type, document.get_public_filename())
+
+        thumbnail = parser.get_thumbnail(
+            document.source_path,
+            mime_type,
+            document.get_public_filename(),
+        )
+
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), "rb") as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # I'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # We also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                document.archive_filename = generate_unique_filename(
+                    document,
+                    archive_filename=True,
+                )
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text(),
+                    archive_filename=document.archive_filename,
+                )
+                with FileLock(settings.MEDIA_LOCK):
+                    create_source_path_directory(document.archive_path)
+                    shutil.move(parser.get_archive_path(), document.archive_path)
+                    shutil.move(thumbnail, document.thumbnail_path)
 
-        except ParseError as e:
-            logger.error(f"Error parsing document: {e}")
-        finally:
-            # Remove the file path if it was created
-            if temp_file.exists() and temp_file.is_file():
-                temp_file.unlink()
+            with index.open_index_writer() as writer:
+                index.update_document(writer, document)
+
+    except Exception:
+        logger.exception(
+            f"Error while parsing document {document} " f"(ID: {document_id})",
+        )
+    finally:
+        parser.cleanup()
@@ -10,8 +10,8 @@ from django.core.management import call_command
 from django.test import override_settings
 from django.test import TestCase
 from documents.file_handling import generate_filename
-from documents.management.commands.document_archiver import handle_document
 from documents.models import Document
+from documents.tasks import update_document_archive_file
 from documents.tests.utils import DirectoriesMixin
 
 
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
         )
 
-        handle_document(doc.pk)
+        update_document_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
         doc.save()
         shutil.copy(sample_file, doc.source_path)
 
-        handle_document(doc.pk)
+        update_document_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
         )
 
-        handle_document(doc2.pk)
-        handle_document(doc1.pk)
+        update_document_archive_file(doc2.pk)
+        update_document_archive_file(doc1.pk)
 
         doc1 = Document.objects.get(id=doc1.id)
         doc2 = Document.objects.get(id=doc2.id)
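
Net effect: the dedicated document_redo_ocr management command is gone, and re-running OCR for specific documents now goes through the reworked redo_ocr action instead. A hypothetical equivalent of the old `manage.py document_redo_ocr 10 11` after this commit (assuming the first hunk lives in paperless' bulk-edit module, as its neighbouring delete action suggests; the ids are placeholders):

    # Hypothetical sketch: queue the common task for each document id.
    from documents.bulk_edit import redo_ocr

    redo_ocr([10, 11])  # dispatches documents.tasks.update_document_archive_file per id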