mirror of https://github.com/paperless-ngx/paperless-ngx.git
	retries for archive generation
parent ed84cf26e7
commit 6c8f010f7a
@@ -4,6 +4,7 @@ import hashlib
 import logging
 import os
 import shutil
+from time import sleep

 import pathvalidate
 from django.conf import settings
@@ -12,8 +13,12 @@ from django.template.defaultfilters import slugify

 from documents.file_handling import defaultdictNoStr, many_to_dictionary


 logger = logging.getLogger("paperless.migrations")

+###############################################################################
+# This is code copied straight from paperless before the change.
+###############################################################################
+
 def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
@@ -150,6 +155,65 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
     return filename


+###############################################################################
+# This code performs bidirectional archive file transformation.
+###############################################################################
+
+
+def create_archive_version(doc, retry_count=4):
+    from documents.parsers import get_parser_class_for_mime_type, \
+        DocumentParser, \
+        ParseError
+
+    logger.info(
+        f"Regenerating archive document for document ID:{doc.id}"
+    )
+    parser_class = get_parser_class_for_mime_type(doc.mime_type)
+    for try_num in range(retry_count):
+        parser: DocumentParser = parser_class(None, None)
+        try:
+            parser.parse(source_path(doc), doc.mime_type,
+                         os.path.basename(doc.filename))
+            doc.content = parser.get_text()
+
+            if parser.get_archive_path() and os.path.isfile(
+                    parser.get_archive_path()):
+                doc.archive_filename = generate_unique_filename(
+                    doc, archive_filename=True)
+                with open(parser.get_archive_path(), "rb") as f:
+                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
+                os.makedirs(os.path.dirname(archive_path_new(doc)),
+                            exist_ok=True)
+                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
+            else:
+                doc.archive_checksum = None
+                logger.error(
+                    f"Parser did not return an archive document for document "
+                    f"ID:{doc.id}. Removing archive document."
+                )
+            doc.save()
+            return
+        except ParseError:
+            if try_num + 1 == retry_count:
+                logger.exception(
+                    f"Unable to regenerate archive document for ID:{doc.id}. You "
+                    f"need to invoke the document_archiver management command "
+                    f"manually for that document."
+                )
+                doc.archive_checksum = None
+                doc.save()
+                return
+            else:
+                # This is mostly here for the tika parser in docker
+                # environments. The servers for parsing need to come up first,
+                # and the docker setup doesn't ensure that tika is running
+                # before attempting migrations.
+                logger.error("Parse error, will try again in 5 seconds...")
+                sleep(5)
+        finally:
+            parser.cleanup()
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")

@@ -199,42 +263,10 @@ def move_old_to_new_locations(apps, schema_editor):

     # regenerate archive documents
     for doc_id in affected_document_ids:
-        from documents.parsers import get_parser_class_for_mime_type, \
-            DocumentParser, \
-            ParseError
-
         doc = Document.objects.get(id=doc_id)
-        logger.info(
-            f"Regenerating archive document for document ID:{doc.id}"
-        )
-        parser_class = get_parser_class_for_mime_type(doc.mime_type)
-        parser: DocumentParser = parser_class(None, None)
-        try:
-            parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
-            doc.content = parser.get_text()
+        create_archive_version(doc)
+

-            if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
-                doc.archive_filename = generate_unique_filename(
-                    doc, archive_filename=True)
-                with open(parser.get_archive_path(), "rb") as f:
-                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
-                os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
-                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
-            else:
-                doc.archive_checksum = None
-                logger.error(
-                    f"Parser did not return an archive document for document "
-                    f"ID:{doc.id}. Removing archive document."
-                )
-            doc.save()
-        except ParseError:
-            logger.exception(
-                f"Unable to regenerate archive document for ID:{doc.id}. You "
-                f"need to invoke the document_archiver management command "
-                f"manually for that document."
-            )
-        finally:
-            parser.cleanup()


 def move_new_to_old_locations(apps, schema_editor):
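In effect, the commit extracts the inline archive regeneration into create_archive_version and wraps the parse step in a retry loop: with the default retry_count=4 and sleep(5) between failed attempts, a migration waits at most 3 × 5 = 15 seconds for a slow-starting tika container before logging the failure and moving on. A minimal standalone sketch of the same retry pattern (the helper name retry_on_parse_error and its parameters are hypothetical, not code from this commit):

from time import sleep


class ParseError(Exception):
    """Stand-in for documents.parsers.ParseError."""


def retry_on_parse_error(operation, retry_count=4, delay=5):
    # Run `operation`, retrying on ParseError; mirrors the loop in
    # create_archive_version above. The final failure is re-raised so
    # the caller can log it and clear the document's archive checksum.
    for try_num in range(retry_count):
        try:
            return operation()
        except ParseError:
            if try_num + 1 == retry_count:
                raise
            # Parsing servers (e.g. tika in docker) may still be starting up.
            sleep(delay)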