mirror of https://github.com/paperless-ngx/paperless-ngx.git
proper document archiver with progress bar.

commit 72a4ff0fca
parent e22769ca63
@@ -5,38 +5,55 @@ import logging
 import os
 import shutil
 import uuid
+from time import sleep
+
+import tqdm
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from django.db import transaction
 from whoosh.writing import AsyncWriter
 
 from documents.models import Document
 from ... import index
+from ...file_handling import create_source_path_directory
 from ...mixins import Renderable
 from ...parsers import get_parser_class_for_mime_type
 
 
+logger = logging.getLogger(__name__)
+
+
 def handle_document(document):
     mime_type = document.mime_type
 
     parser_class = get_parser_class_for_mime_type(mime_type)
 
     parser = parser_class(logging_group=uuid.uuid4())
 
-    parser.parse(document.source_path, mime_type)
+    try:
+        parser.parse(document.source_path, mime_type)
 
-    if parser.get_archive_path():
-        shutil.copy(parser.get_archive_path(), document.archive_path)
-        with document.archive_file as f:
-            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
-    else:
-        logging.getLogger(__name__).warning(
-            f"Parser {parser} did not produce an archived document "
-            f"for {document.file_name}"
-        )
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), 'rb') as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # i'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # we also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text()
+                )
+                create_source_path_directory(document.archive_path)
+                shutil.move(parser.get_archive_path(), document.archive_path)
 
-    if parser.get_text():
-        document.content = parser.get_text()
-    document.save()
+        with AsyncWriter(index.open_index()) as writer:
+            index.update_document(writer, document)
 
-    parser.cleanup()
+    except Exception as e:
+        logger.error(f"Error while parsing document {document}: {str(e)}")
+    finally:
+        parser.cleanup()
 
+
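Note on the new handle_document(): the checksum and text content are written to the database with a queryset update() inside transaction.atomic() before the archive file is moved into place, so a failed shutil.move() rolls the write back; save() is deliberately avoided because it would trigger the file-handling logic while the file is not yet in place (see the inline comments above). A minimal self-contained sketch of that write-DB-then-move pattern, using sqlite3 as a stand-in for the Django ORM (the table name, schema, and paths here are illustrative, not part of this commit):

    import hashlib
    import shutil
    import sqlite3

    def archive_document(db: sqlite3.Connection, doc_id: int,
                         tmp_archive: str, final_archive: str) -> None:
        # Compute the checksum from the temporary file produced by the parser.
        with open(tmp_archive, 'rb') as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        # The connection context manager wraps one transaction: it commits on
        # success and rolls back if the move (or anything else) raises, so the
        # database never records an archive that is missing on disk.
        with db:
            db.execute(
                "UPDATE documents SET archive_checksum = ? WHERE id = ?",
                (checksum, doc_id),
            )
            shutil.move(tmp_archive, final_archive)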
@@ -61,6 +78,14 @@ class Command(Renderable, BaseCommand):
             help="Recreates the archived document for documents that already "
                  "have an archived version."
         )
+        parser.add_argument(
+            "-d", "--document",
+            default=None,
+            type=int,
+            required=False,
+            help="Specify the ID of a document, and this command will only "
+                 "run on this specific document."
+        )
 
     def handle(self, *args, **options):
 
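With this hunk applied, the command can be limited to one document by ID. Assuming the management command keeps its usual module name (document_archiver; the file path is not visible in this excerpt), invocation would look like:

    python3 manage.py document_archiver -d 42
    python3 manage.py document_archiver --overwrite

where 42 is a hypothetical document ID and --overwrite is the pre-existing flag handled in the next hunk.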
@@ -68,22 +93,22 @@ class Command(Renderable, BaseCommand):
 
         overwrite = options["overwrite"]
 
-        documents = Document.objects.all()
+        if options['document']:
+            documents = Document.objects.filter(pk=options['document'])
+        else:
+            documents = Document.objects.all()
 
-        documents_to_process = filter(
-            lambda d: overwrite or not os.path.exists(d.archive_path),
+        documents_to_process = list(filter(
+            lambda d: overwrite or not d.archive_checksum,
             documents
-        )
+        ))
 
+        logging.getLogger().handlers[0].level = logging.ERROR
         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
-            list(
-                pool.imap(
+            list(tqdm.tqdm(
+                pool.imap_unordered(
                     handle_document,
-                    list(documents_to_process)
-                )
-            )
-
-        ix = index.open_index()
-        with AsyncWriter(ix) as writer:
-            for d in documents_to_process:
-                index.update_document(writer, d)
+                    documents_to_process
+                ),
+                total=len(documents_to_process)
+            ))
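The progress bar itself comes from this last hunk: pool.imap_unordered() yields results as workers finish, tqdm.tqdm() wraps that iterator, and the candidate list is materialized up front with list(filter(...)) so that total=len(documents_to_process) is known; the outer list() simply drains the iterator. The per-document index update also moves into handle_document() above, replacing the single-threaded AsyncWriter loop. A minimal self-contained sketch of the same pool-plus-progress-bar pattern (work() is a placeholder for the per-document job):

    import multiprocessing

    import tqdm

    def work(n: int) -> int:
        # placeholder for the per-document job; must be a top-level
        # function so the pool can pickle it
        return n * n

    if __name__ == "__main__":
        items = list(range(100))  # materialized so len() is known
        with multiprocessing.Pool(processes=4) as pool:
            # imap_unordered yields results in completion order, which keeps
            # the bar moving; the outer list() drains the iterator.
            results = list(tqdm.tqdm(
                pool.imap_unordered(work, items),
                total=len(items),
            ))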