mirror of https://github.com/paperless-ngx/paperless-ngx.git
	retries for archive generation
parent ed84cf26e7
commit 6c8f010f7a
@@ -4,6 +4,7 @@ import hashlib
 import logging
 import os
 import shutil
+from time import sleep

 import pathvalidate
 from django.conf import settings
@@ -12,8 +13,12 @@ from django.template.defaultfilters import slugify

 from documents.file_handling import defaultdictNoStr, many_to_dictionary


 logger = logging.getLogger("paperless.migrations")

+###############################################################################
+# This is code copied straight from paperless before the change.
+###############################################################################
+
 def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
@@ -150,6 +155,65 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
     return filename


+###############################################################################
+# This code performs bidirectional archive file transformation.
+###############################################################################
+
+
+def create_archive_version(doc, retry_count=4):
+    from documents.parsers import get_parser_class_for_mime_type, \
+        DocumentParser, \
+        ParseError
+
+    logger.info(
+        f"Regenerating archive document for document ID:{doc.id}"
+    )
+    parser_class = get_parser_class_for_mime_type(doc.mime_type)
+    for try_num in range(retry_count):
+        parser: DocumentParser = parser_class(None, None)
+        try:
+            parser.parse(source_path(doc), doc.mime_type,
+                         os.path.basename(doc.filename))
+            doc.content = parser.get_text()
+
+            if parser.get_archive_path() and os.path.isfile(
+                    parser.get_archive_path()):
+                doc.archive_filename = generate_unique_filename(
+                    doc, archive_filename=True)
+                with open(parser.get_archive_path(), "rb") as f:
+                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
+                os.makedirs(os.path.dirname(archive_path_new(doc)),
+                            exist_ok=True)
+                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
+            else:
+                doc.archive_checksum = None
+                logger.error(
+                    f"Parser did not return an archive document for document "
+                    f"ID:{doc.id}. Removing archive document."
+                )
+            doc.save()
+            return
+        except ParseError:
+            if try_num + 1 == retry_count:
+                logger.exception(
+                    f"Unable to regenerate archive document for ID:{doc.id}. You "
+                    f"need to invoke the document_archiver management command "
+                    f"manually for that document."
+                )
+                doc.archive_checksum = None
+                doc.save()
+                return
+            else:
+                # This is mostly here for the tika parser in docker
+                # environments. The servers for parsing need to come up first,
+                # and the docker setup doesn't ensure that tika is running
+                # before attempting migrations.
+                logger.error("Parse error, will try again in 5 seconds...")
+                sleep(5)
+        finally:
+            parser.cleanup()
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")

@@ -199,42 +263,10 @@ def move_old_to_new_locations(apps, schema_editor):

     # regenerate archive documents
     for doc_id in affected_document_ids:
-        from documents.parsers import get_parser_class_for_mime_type, \
-            DocumentParser, \
-            ParseError
-
         doc = Document.objects.get(id=doc_id)
-        logger.info(
-            f"Regenerating archive document for document ID:{doc.id}"
-        )
-        parser_class = get_parser_class_for_mime_type(doc.mime_type)
-        parser: DocumentParser = parser_class(None, None)
-        try:
-            parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
-            doc.content = parser.get_text()
+        create_archive_version(doc)
+

-            if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
-                doc.archive_filename = generate_unique_filename(
-                    doc, archive_filename=True)
-                with open(parser.get_archive_path(), "rb") as f:
-                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
-                os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
-                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
-            else:
-                doc.archive_checksum = None
-                logger.error(
-                    f"Parser did not return an archive document for document "
-                    f"ID:{doc.id}. Removing archive document."
-                )
-            doc.save()
-        except ParseError:
-            logger.exception(
-                f"Unable to regenerate archive document for ID:{doc.id}. You "
-                f"need to invoke the document_archiver management command "
-                f"manually for that document."
-            )
-        finally:
-            parser.cleanup()


 def move_new_to_old_locations(apps, schema_editor):
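In effect, the commit extracts the inline archive regeneration into create_archive_version and wraps the parse step in a retry loop: with the default retry_count=4 and sleep(5) between failed attempts, a migration waits at most 3 × 5 = 15 seconds for a slow-starting tika container before logging the failure and moving on. A minimal standalone sketch of the same retry pattern (the helper name retry_on_parse_error and its parameters are hypothetical, not code from this commit):

from time import sleep


class ParseError(Exception):
    """Stand-in for documents.parsers.ParseError."""


def retry_on_parse_error(operation, retry_count=4, delay=5):
    # Run `operation`, retrying on ParseError; mirrors the loop in
    # create_archive_version above. The final failure is re-raised so
    # the caller can log it and clear the document's archive checksum.
    for try_num in range(retry_count):
        try:
            return operation()
        except ParseError:
            if try_num + 1 == retry_count:
                raise
            # Parsing servers (e.g. tika in docker) may still be starting up.
            sleep(delay)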