added checksums for archived documents.

parent fdaf419a7e
commit 24767f62c7
@@ -6,6 +6,7 @@ import os
 import magic
 from django.conf import settings
 from django.db import transaction
+from django.db.models import Q
 from django.utils import timezone

 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
@@ -42,7 +43,7 @@ class Consumer(LoggingMixin):
     def pre_check_duplicate(self):
         with open(self.path, "rb") as f:
             checksum = hashlib.md5(f.read()).hexdigest()
-        if Document.objects.filter(checksum=checksum).exists():
+        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
             if settings.CONSUMER_DELETE_DUPLICATES:
                 os.unlink(self.path)
             raise ConsumerError(
@@ -184,6 +185,11 @@ class Consumer(LoggingMixin):
                     self._write(document.storage_type,
                                 archive_path, document.archive_path)

+                    with open(archive_path, 'rb') as f:
+                        document.archive_checksum = hashlib.md5(
+                            f.read()).hexdigest()
+                        document.save()
+
                 # Delete the file only if it was successfully consumed
                 self.log("debug", "Deleting file {}".format(self.path))
                 os.unlink(self.path)
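Note on the duplicate check above: an incoming file is now rejected as a duplicate when its MD5 matches either a stored original checksum or a stored archive checksum. The two Q objects collapse into a single OR'ed WHERE clause, so one query covers both cases. A minimal sketch of the equivalent logic (is_duplicate is a hypothetical helper for illustration, not part of this commit):

    import hashlib

    from django.db.models import Q

    from documents.models import Document


    def is_duplicate(path):
        # Hash the incoming file the same way pre_check_duplicate() does.
        with open(path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        # Q(...) | Q(...) renders as one WHERE ... OR ... condition, so a
        # single query covers both the original and the archived copy.
        return Document.objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum)
        ).exists()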
@@ -1,3 +1,4 @@
+import hashlib
 import multiprocessing

 import ocrmypdf
@@ -27,6 +28,8 @@ def handle_document(document):
     parser.parse(document.source_path, mime_type)
     if parser.get_archive_path():
         shutil.copy(parser.get_archive_path(), document.archive_path)
+        with document.archive_file as f:
+            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
     else:
         logging.getLogger(__name__).warning(
             f"Parser {parser} did not produce an archived document "
@@ -35,7 +38,7 @@ def handle_document(document):

     if parser.get_text():
         document.content = parser.get_text()
-        document.save()
+    document.save()

     parser.cleanup()

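Two details in this hunk: the dedented document.save() now runs unconditionally, so the archive_checksum computed above is persisted even when the parser extracts no text; and hashing reads the whole archive into memory, which is fine for typical PDFs. If that ever became a concern, an incremental variant is straightforward (file_md5 is a hypothetical helper, not part of this commit):

    import hashlib


    def file_md5(path, chunk_size=1 << 20):
        # Stream the file in 1 MiB chunks so large archives need not
        # fit into memory all at once; the resulting digest is identical.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()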
src/documents/migrations/1005_checksums.py (new file, +23)
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2020-11-29 00:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1004_sanity_check_schedule'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_checksum',
+            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='checksum',
+            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
+        ),
+    ]
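Because the new column is blank and nullable, documents consumed before this migration keep archive_checksum = NULL until their archive is regenerated. A follow-up data migration could backfill the values eagerly; a sketch under stated assumptions (the migration name and the ARCHIVE_DIR/"<pk>.pdf" layout are assumptions mirroring the Document.archive_path convention, not part of this commit):

    # Hypothetical 1006_backfill_archive_checksums.py, illustration only.
    import hashlib
    import os

    from django.conf import settings
    from django.db import migrations


    def backfill(apps, schema_editor):
        Document = apps.get_model('documents', 'Document')
        for doc in Document.objects.filter(archive_checksum__isnull=True):
            # Historical models carry no custom properties, so the archive
            # path is rebuilt here; assumes ARCHIVE_DIR/"<pk>.pdf" naming.
            path = os.path.join(settings.ARCHIVE_DIR,
                                "{:07}.pdf".format(doc.pk))
            if os.path.isfile(path):
                with open(path, 'rb') as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                doc.save(update_fields=['archive_checksum'])


    class Migration(migrations.Migration):

        dependencies = [
            ('documents', '1005_checksums'),
        ]

        operations = [
            migrations.RunPython(backfill, migrations.RunPython.noop),
        ]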
@@ -157,9 +157,15 @@ class Document(models.Model):
         max_length=32,
         editable=False,
         unique=True,
-        help_text="The checksum of the original document (before it was "
-                  "encrypted).  We use this to prevent duplicate document "
-                  "imports."
+        help_text="The checksum of the original document."
+    )
+
+    archive_checksum = models.CharField(
+        max_length=32,
+        editable=False,
+        blank=True,
+        null=True,
+        help_text="The checksum of the archived document."
     )

     created = models.DateTimeField(
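Unlike checksum, archive_checksum is neither unique nor required: it stays NULL for documents with no archived version, and duplicate detection across both fields happens in pre_check_duplicate() rather than as a database constraint. That makes unprocessed documents easy to find, for example from the Django shell (illustrative snippet, not part of this commit):

    from documents.models import Document

    # Everything consumed before the archiver ran, or before this
    # migration existed, has no archive checksum recorded yet.
    missing = Document.objects.filter(archive_checksum__isnull=True)
    print(missing.count())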
@@ -67,30 +67,34 @@ def check_sanity():
                 f"Original of document {doc.pk} does not exist."))
         else:
             present_files.remove(os.path.normpath(doc.source_path))
-            checksum = None
             try:
                 with doc.source_file as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read original file of document {doc.pk}: {e}"))
-            if checksum and not checksum == doc.checksum:
-                messages.append(SanityError(
-                    f"Checksum mismatch of document {doc.pk}. "
-                    f"Stored: {doc.checksum}, actual: {checksum}."
-                ))
+            else:
+                if not checksum == doc.checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of document {doc.pk}. "
+                        f"Stored: {doc.checksum}, actual: {checksum}."
+                    ))

         if os.path.isfile(doc.archive_path):
+            present_files.remove(os.path.normpath(doc.archive_path))
             try:
                 with doc.archive_file as f:
-                    f.read()
+                    checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read archive file of document {doc.pk}: {e}"
                 ))
-            present_files.remove(os.path.normpath(doc.archive_path))
+            else:
+                if not checksum == doc.archive_checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of document {doc.pk}. "
+                        f"Stored: {doc.checksum}, actual: {checksum}."
+                    ))

         if not doc.content:
             messages.append(SanityWarning(
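The restructuring above swaps the checksum = None sentinel for try/except/else: the mismatch comparison moves into the else branch, where checksum is guaranteed to be bound because the read succeeded, and present_files.remove() for the archive moves before the read so the bookkeeping happens even when hashing fails. (The archive mismatch message still interpolates doc.checksum; doc.archive_checksum was presumably intended.) A standalone illustration of the pattern, with hypothetical names:

    import hashlib


    def verify(path, stored_checksum, report):
        # The else block runs only when no exception was raised, so
        # "checksum" is always bound there and no sentinel is needed.
        try:
            with open(path, "rb") as f:
                checksum = hashlib.md5(f.read()).hexdigest()
        except OSError as e:
            report(f"Cannot read {path}: {e}")
        else:
            if checksum != stored_checksum:
                report(f"Checksum mismatch: stored {stored_checksum}, "
                       f"actual {checksum}.")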