mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-11-03 19:17:13 -05:00)
retries for archive generation

parent ed84cf26e7
commit 6c8f010f7a
@@ -4,6 +4,7 @@ import hashlib
 import logging
 import os
 import shutil
+from time import sleep
 
 import pathvalidate
 from django.conf import settings
@@ -12,8 +13,12 @@ from django.template.defaultfilters import slugify
 
 from documents.file_handling import defaultdictNoStr, many_to_dictionary
+
+
+logger = logging.getLogger("paperless.migrations")
+
 ###############################################################################
 # This is code copied straight from paperless before the change.
 ###############################################################################
 
 def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
@@ -150,6 +155,65 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
     return filename
 
 
+###############################################################################
+# This code performs bidirectional archive file transformation.
+###############################################################################
+
+
+def create_archive_version(doc, retry_count=4):
+    from documents.parsers import get_parser_class_for_mime_type, \
+        DocumentParser, \
+        ParseError
+
+    logger.info(
+        f"Regenerating archive document for document ID:{doc.id}"
+    )
+    parser_class = get_parser_class_for_mime_type(doc.mime_type)
+    for try_num in range(retry_count):
+        parser: DocumentParser = parser_class(None, None)
+        try:
+            parser.parse(source_path(doc), doc.mime_type,
+                         os.path.basename(doc.filename))
+            doc.content = parser.get_text()
+
+            if parser.get_archive_path() and os.path.isfile(
+                    parser.get_archive_path()):
+                doc.archive_filename = generate_unique_filename(
+                    doc, archive_filename=True)
+                with open(parser.get_archive_path(), "rb") as f:
+                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
+                os.makedirs(os.path.dirname(archive_path_new(doc)),
+                            exist_ok=True)
+                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
+            else:
+                doc.archive_checksum = None
+                logger.error(
+                    f"Parser did not return an archive document for document "
+                    f"ID:{doc.id}. Removing archive document."
+                )
+            doc.save()
+            return
+        except ParseError:
+            if try_num + 1 == retry_count:
+                logger.exception(
+                    f"Unable to regenerate archive document for ID:{doc.id}. You "
+                    f"need to invoke the document_archiver management command "
+                    f"manually for that document."
+                )
+                doc.archive_checksum = None
+                doc.save()
+                return
+            else:
+                # This is mostly here for the tika parser in docker
+                # environments. The servers for parsing need to come up first,
+                # and the docker setup doesn't ensure that tika is running
+                # before attempting migrations.
+                logger.error("Parse error, will try again in 5 seconds...")
+                sleep(5)
+        finally:
+            parser.cleanup()
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")
 
@@ -199,42 +263,10 @@ def move_old_to_new_locations(apps, schema_editor):
 
     # regenerate archive documents
     for doc_id in affected_document_ids:
-        from documents.parsers import get_parser_class_for_mime_type, \
-            DocumentParser, \
-            ParseError
-
         doc = Document.objects.get(id=doc_id)
-        logger.info(
-            f"Regenerating archive document for document ID:{doc.id}"
-        )
-        parser_class = get_parser_class_for_mime_type(doc.mime_type)
-        parser: DocumentParser = parser_class(None, None)
-        try:
-            parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
-            doc.content = parser.get_text()
-
-            if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
-                doc.archive_filename = generate_unique_filename(
-                    doc, archive_filename=True)
-                with open(parser.get_archive_path(), "rb") as f:
-                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
-                os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
-                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
-            else:
-                doc.archive_checksum = None
-                logger.error(
-                    f"Parser did not return an archive document for document "
-                    f"ID:{doc.id}. Removing archive document."
-                )
-            doc.save()
-        except ParseError:
-            logger.exception(
-                f"Unable to regenerate archive document for ID:{doc.id}. You "
-                f"need to invoke the document_archiver management command "
-                f"manually for that document."
-            )
-        finally:
-            parser.cleanup()
+        create_archive_version(doc)
 
 
 def move_new_to_old_locations(apps, schema_editor):
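
The core of the change is the retry loop in create_archive_version: on a ParseError it waits five seconds and tries again, up to retry_count attempts, so that slow-starting parser servers (notably tika in docker setups) have time to come up. The same pattern can be expressed as a small standalone helper. The sketch below is illustrative only; retry_with_delay is a hypothetical name, not part of paperless-ngx, and it assumes a zero-argument callable:

    import logging
    from time import sleep

    logger = logging.getLogger("paperless.migrations")


    def retry_with_delay(func, exceptions, attempts=4, delay=5):
        # Hypothetical helper, not part of paperless-ngx: call func()
        # until it succeeds, sleeping between failed attempts. The final
        # failure is re-raised so the caller can log and clean up.
        for try_num in range(attempts):
            try:
                return func()
            except exceptions:
                if try_num + 1 == attempts:
                    raise
                logger.error(
                    "Parse error, will try again in %d seconds...", delay
                )
                sleep(delay)

With such a helper, only the parse step would sit inside the retry, and the bookkeeping (checksum, save, cleanup) could stay linear.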
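
For reference, the archive_checksum the migration stores is a plain MD5 hex digest of the archive file's bytes; the diff computes it with hashlib.md5(f.read()).hexdigest(), reading the whole file at once. A chunked equivalent (my variation, not what the migration does) produces the same digest without holding the entire file in memory:

    import hashlib


    def file_md5(path, chunk_size=1024 * 1024):
        # Stream the file through MD5 in 1 MiB chunks; hexdigest()
        # matches the value the migration stores in doc.archive_checksum.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()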