mirror of https://github.com/paperless-ngx/paperless-ngx.git
proper document archiver with progress bar.

commit 72a4ff0fca
parent e22769ca63
@@ -5,38 +5,55 @@ import logging
 import os
 import shutil
 import uuid
+from time import sleep
+
+import tqdm
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from django.db import transaction
 from whoosh.writing import AsyncWriter
 
 from documents.models import Document
 from ... import index
+from ...file_handling import create_source_path_directory
 from ...mixins import Renderable
 from ...parsers import get_parser_class_for_mime_type
 
 
+logger = logging.getLogger(__name__)
+
+
 def handle_document(document):
     mime_type = document.mime_type
 
     parser_class = get_parser_class_for_mime_type(mime_type)
 
     parser = parser_class(logging_group=uuid.uuid4())
 
-    parser.parse(document.source_path, mime_type)
+    try:
+        parser.parse(document.source_path, mime_type)
 
-    if parser.get_archive_path():
-        shutil.copy(parser.get_archive_path(), document.archive_path)
-        with document.archive_file as f:
-            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
-    else:
-        logging.getLogger(__name__).warning(
-            f"Parser {parser} did not produce an archived document "
-            f"for {document.file_name}"
-        )
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), 'rb') as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # i'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # we also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text()
+                )
+                create_source_path_directory(document.archive_path)
+                shutil.move(parser.get_archive_path(), document.archive_path)
 
-    if parser.get_text():
-        document.content = parser.get_text()
-    document.save()
+        with AsyncWriter(index.open_index()) as writer:
+            index.update_document(writer, document)
 
-    parser.cleanup()
+    except Exception as e:
+        logger.error(f"Error while parsing document {document}: {str(e)}")
+    finally:
+        parser.cleanup()
 
+
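Note on the new handle_document(): the checksum and text content are written to the database with a queryset update() inside transaction.atomic() before the archive file is moved into place, so a failed shutil.move() rolls the write back; save() is deliberately avoided because it would trigger the file-handling logic while the file is not yet in place (see the inline comments above). A minimal self-contained sketch of that write-DB-then-move pattern, using sqlite3 as a stand-in for the Django ORM (the table name, schema, and paths here are illustrative, not part of this commit):

    import hashlib
    import shutil
    import sqlite3

    def archive_document(db: sqlite3.Connection, doc_id: int,
                         tmp_archive: str, final_archive: str) -> None:
        # Compute the checksum from the temporary file produced by the parser.
        with open(tmp_archive, 'rb') as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        # The connection context manager wraps one transaction: it commits on
        # success and rolls back if the move (or anything else) raises, so the
        # database never records an archive that is missing on disk.
        with db:
            db.execute(
                "UPDATE documents SET archive_checksum = ? WHERE id = ?",
                (checksum, doc_id),
            )
            shutil.move(tmp_archive, final_archive)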
@@ -61,6 +78,14 @@ class Command(Renderable, BaseCommand):
             help="Recreates the archived document for documents that already "
                  "have an archived version."
         )
+        parser.add_argument(
+            "-d", "--document",
+            default=None,
+            type=int,
+            required=False,
+            help="Specify the ID of a document, and this command will only "
+                 "run on this specific document."
+        )
 
     def handle(self, *args, **options):
 
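With this hunk applied, the command can be limited to one document by ID. Assuming the management command keeps its usual module name (document_archiver; the file path is not visible in this excerpt), invocation would look like:

    python3 manage.py document_archiver -d 42
    python3 manage.py document_archiver --overwrite

where 42 is a hypothetical document ID and --overwrite is the pre-existing flag handled in the next hunk.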
@@ -68,22 +93,22 @@ class Command(Renderable, BaseCommand):
 
         overwrite = options["overwrite"]
 
-        documents = Document.objects.all()
+        if options['document']:
+            documents = Document.objects.filter(pk=options['document'])
+        else:
+            documents = Document.objects.all()
 
-        documents_to_process = filter(
-            lambda d: overwrite or not os.path.exists(d.archive_path),
+        documents_to_process = list(filter(
+            lambda d: overwrite or not d.archive_checksum,
             documents
-        )
+        ))
 
+        logging.getLogger().handlers[0].level = logging.ERROR
         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
-            list(
-                pool.imap(
+            list(tqdm.tqdm(
+                pool.imap_unordered(
                     handle_document,
-                    list(documents_to_process)
-                )
-            )
-
-        ix = index.open_index()
-        with AsyncWriter(ix) as writer:
-            for d in documents_to_process:
-                index.update_document(writer, d)
+                    documents_to_process
+                ),
+                total=len(documents_to_process)
+            ))
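The progress bar itself comes from this last hunk: pool.imap_unordered() yields results as workers finish, tqdm.tqdm() wraps that iterator, and the candidate list is materialized up front with list(filter(...)) so that total=len(documents_to_process) is known; the outer list() simply drains the iterator. The per-document index update also moves into handle_document() above, replacing the single-threaded AsyncWriter loop. A minimal self-contained sketch of the same pool-plus-progress-bar pattern (work() is a placeholder for the per-document job):

    import multiprocessing

    import tqdm

    def work(n: int) -> int:
        # placeholder for the per-document job; must be a top-level
        # function so the pool can pickle it
        return n * n

    if __name__ == "__main__":
        items = list(range(100))  # materialized so len() is known
        with multiprocessing.Pool(processes=4) as pool:
            # imap_unordered yields results in completion order, which keeps
            # the bar moving; the outer list() drains the iterator.
            results = list(tqdm.tqdm(
                pool.imap_unordered(work, items),
                total=len(items),
            ))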