mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	mime type handling
This commit is contained in:
		
							parent
							
								
									bd45a804a7
								
							
						
					
					
						commit
						41650f20f4
					
				@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin):
 | 
			
		||||
class DocumentAdmin(admin.ModelAdmin):
 | 
			
		||||
 | 
			
		||||
    search_fields = ("correspondent__name", "title", "content", "tags__name")
 | 
			
		||||
    readonly_fields = ("added", "file_type", "storage_type", "filename")
 | 
			
		||||
    readonly_fields = ("added", "mime_type", "storage_type", "filename")
 | 
			
		||||
    list_display = (
 | 
			
		||||
        "title",
 | 
			
		||||
        "created",
 | 
			
		||||
@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin):
 | 
			
		||||
        "correspondent",
 | 
			
		||||
        "tags_",
 | 
			
		||||
        "archive_serial_number",
 | 
			
		||||
        "document_type",
 | 
			
		||||
        "filename"
 | 
			
		||||
        "document_type"
 | 
			
		||||
    )
 | 
			
		||||
    list_filter = (
 | 
			
		||||
        "document_type",
 | 
			
		||||
 | 
			
		||||
@ -2,8 +2,8 @@ import datetime
 | 
			
		||||
import hashlib
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
import magic
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.db import transaction
 | 
			
		||||
from django.utils import timezone
 | 
			
		||||
@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 | 
			
		||||
from .file_handling import generate_filename, create_source_path_directory
 | 
			
		||||
from .loggers import LoggingMixin
 | 
			
		||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
 | 
			
		||||
from .parsers import ParseError, get_parser_class
 | 
			
		||||
from .parsers import ParseError, get_parser_class_for_mime_type
 | 
			
		||||
from .signals import (
 | 
			
		||||
    document_consumption_finished,
 | 
			
		||||
    document_consumption_started
 | 
			
		||||
@ -51,12 +51,6 @@ class Consumer(LoggingMixin):
 | 
			
		||||
                "Consumption directory {} does not exist".format(
 | 
			
		||||
                    settings.CONSUMPTION_DIR))
 | 
			
		||||
 | 
			
		||||
    def pre_check_regex(self):
 | 
			
		||||
        if not re.match(FileInfo.REGEXES["title"], self.filename):
 | 
			
		||||
            raise ConsumerError(
 | 
			
		||||
                "Filename {} does not seem to be safe to "
 | 
			
		||||
                "consume".format(self.filename))
 | 
			
		||||
 | 
			
		||||
    def pre_check_duplicate(self):
 | 
			
		||||
        with open(self.path, "rb") as f:
 | 
			
		||||
            checksum = hashlib.md5(f.read()).hexdigest()
 | 
			
		||||
@ -100,18 +94,19 @@ class Consumer(LoggingMixin):
 | 
			
		||||
        self.pre_check_file_exists()
 | 
			
		||||
        self.pre_check_consumption_dir()
 | 
			
		||||
        self.pre_check_directories()
 | 
			
		||||
        self.pre_check_regex()
 | 
			
		||||
        self.pre_check_duplicate()
 | 
			
		||||
 | 
			
		||||
        self.log("info", "Consuming {}".format(self.filename))
 | 
			
		||||
 | 
			
		||||
        # Determine the parser class.
 | 
			
		||||
 | 
			
		||||
        parser_class = get_parser_class(self.filename)
 | 
			
		||||
        mime_type = magic.from_file(self.path, mime=True)
 | 
			
		||||
 | 
			
		||||
        parser_class = get_parser_class_for_mime_type(mime_type)
 | 
			
		||||
        if not parser_class:
 | 
			
		||||
            raise ConsumerError("No parsers abvailable for {}".format(self.filename))
 | 
			
		||||
        else:
 | 
			
		||||
            self.log("debug", "Parser: {}".format(parser_class.__name__))
 | 
			
		||||
            self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
 | 
			
		||||
 | 
			
		||||
        # Notify all listeners that we're going to do some work.
 | 
			
		||||
 | 
			
		||||
@ -162,7 +157,8 @@ class Consumer(LoggingMixin):
 | 
			
		||||
                # store the document.
 | 
			
		||||
                document = self._store(
 | 
			
		||||
                    text=text,
 | 
			
		||||
                    date=date
 | 
			
		||||
                    date=date,
 | 
			
		||||
                    mime_type=mime_type
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
                # If we get here, it was successful. Proceed with post-consume
 | 
			
		||||
@ -197,7 +193,7 @@ class Consumer(LoggingMixin):
 | 
			
		||||
 | 
			
		||||
        return document
 | 
			
		||||
 | 
			
		||||
    def _store(self, text, date):
 | 
			
		||||
    def _store(self, text, date, mime_type):
 | 
			
		||||
 | 
			
		||||
        # If someone gave us the original filename, use it instead of doc.
 | 
			
		||||
 | 
			
		||||
@ -220,7 +216,7 @@ class Consumer(LoggingMixin):
 | 
			
		||||
                correspondent=file_info.correspondent,
 | 
			
		||||
                title=file_info.title,
 | 
			
		||||
                content=text,
 | 
			
		||||
                file_type=file_info.extension,
 | 
			
		||||
                mime_type=mime_type,
 | 
			
		||||
                checksum=hashlib.md5(f.read()).hexdigest(),
 | 
			
		||||
                created=created,
 | 
			
		||||
                modified=created,
 | 
			
		||||
 | 
			
		||||
@ -91,9 +91,9 @@ def generate_filename(document):
 | 
			
		||||
 | 
			
		||||
    # Always append the primary key to guarantee uniqueness of filename
 | 
			
		||||
    if len(path) > 0:
 | 
			
		||||
        filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
 | 
			
		||||
        filename = "%s-%07i%s" % (path, document.pk, document.file_type)
 | 
			
		||||
    else:
 | 
			
		||||
        filename = "%07i.%s" % (document.pk, document.file_type)
 | 
			
		||||
        filename = "%07i%s" % (document.pk, document.file_type)
 | 
			
		||||
 | 
			
		||||
    # Append .gpg for encrypted files
 | 
			
		||||
    if document.storage_type == document.STORAGE_TYPE_GPG:
 | 
			
		||||
 | 
			
		||||
@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
 | 
			
		||||
        tags = ",".join([t.slug for t in doc.tags.all()])
 | 
			
		||||
 | 
			
		||||
        if tags:
 | 
			
		||||
            return "{} - {} - {} - {}.{}".format(
 | 
			
		||||
            return "{} - {} - {} - {}{}".format(
 | 
			
		||||
                created, doc.correspondent, doc.title, tags, doc.file_type)
 | 
			
		||||
 | 
			
		||||
        return "{} - {} - {}.{}".format(
 | 
			
		||||
        return "{} - {} - {}{}".format(
 | 
			
		||||
            created, doc.correspondent, doc.title, doc.file_type)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										50
									
								
								src/documents/migrations/1003_mime_types.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								src/documents/migrations/1003_mime_types.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,50 @@
 | 
			
		||||
# Generated by Django 3.1.3 on 2020-11-20 11:21
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
import magic
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.db import migrations, models
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def source_path(self):
 | 
			
		||||
    if self.filename:
 | 
			
		||||
        fname = str(self.filename)
 | 
			
		||||
    else:
 | 
			
		||||
        fname = "{:07}.{}".format(self.pk, self.file_type)
 | 
			
		||||
        if self.storage_type == self.STORAGE_TYPE_GPG:
 | 
			
		||||
            fname += ".gpg"
 | 
			
		||||
 | 
			
		||||
    return os.path.join(
 | 
			
		||||
        settings.ORIGINALS_DIR,
 | 
			
		||||
        fname
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_mime_types(apps, schema_editor):
 | 
			
		||||
    Document = apps.get_model("documents", "Document")
 | 
			
		||||
    documents = Document.objects.all()
 | 
			
		||||
 | 
			
		||||
    for d in documents:
 | 
			
		||||
        d.mime_type = magic.from_file(source_path(d), mime=True)
 | 
			
		||||
        d.save()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Migration(migrations.Migration):
 | 
			
		||||
 | 
			
		||||
    dependencies = [
 | 
			
		||||
        ('documents', '1002_auto_20201111_1105'),
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    operations = [
 | 
			
		||||
        migrations.AddField(
 | 
			
		||||
            model_name='document',
 | 
			
		||||
            name='mime_type',
 | 
			
		||||
            field=models.CharField(default="-", editable=False, max_length=256),
 | 
			
		||||
            preserve_default=False,
 | 
			
		||||
        ),
 | 
			
		||||
        migrations.RunPython(add_mime_types),
 | 
			
		||||
        migrations.RemoveField(
 | 
			
		||||
            model_name='document',
 | 
			
		||||
            name='file_type',
 | 
			
		||||
        ),
 | 
			
		||||
    ]
 | 
			
		||||
@ -1,6 +1,7 @@
 | 
			
		||||
# coding=utf-8
 | 
			
		||||
 | 
			
		||||
import logging
 | 
			
		||||
import mimetypes
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
from collections import OrderedDict
 | 
			
		||||
@ -113,18 +114,6 @@ class DocumentType(MatchingModel):
 | 
			
		||||
 | 
			
		||||
class Document(models.Model):
 | 
			
		||||
 | 
			
		||||
    # TODO: why do we need an explicit list
 | 
			
		||||
    TYPE_PDF = "pdf"
 | 
			
		||||
    TYPE_PNG = "png"
 | 
			
		||||
    TYPE_JPG = "jpg"
 | 
			
		||||
    TYPE_GIF = "gif"
 | 
			
		||||
    TYPE_TIF = "tiff"
 | 
			
		||||
    TYPE_TXT = "txt"
 | 
			
		||||
    TYPE_CSV = "csv"
 | 
			
		||||
    TYPE_MD = "md"
 | 
			
		||||
    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
 | 
			
		||||
             TYPE_TXT, TYPE_CSV, TYPE_MD)
 | 
			
		||||
 | 
			
		||||
    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
 | 
			
		||||
    STORAGE_TYPE_GPG = "gpg"
 | 
			
		||||
    STORAGE_TYPES = (
 | 
			
		||||
@ -156,10 +145,9 @@ class Document(models.Model):
 | 
			
		||||
                  "primarily used for searching."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    file_type = models.CharField(
 | 
			
		||||
        max_length=4,
 | 
			
		||||
        editable=False,
 | 
			
		||||
        choices=tuple([(t, t.upper()) for t in TYPES])
 | 
			
		||||
    mime_type = models.CharField(
 | 
			
		||||
        max_length=256,
 | 
			
		||||
        editable=False
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    tags = models.ManyToManyField(
 | 
			
		||||
@ -223,7 +211,7 @@ class Document(models.Model):
 | 
			
		||||
        if self.filename:
 | 
			
		||||
            fname = str(self.filename)
 | 
			
		||||
        else:
 | 
			
		||||
            fname = "{:07}.{}".format(self.pk, self.file_type)
 | 
			
		||||
            fname = "{:07}{}".format(self.pk, self.file_type)
 | 
			
		||||
            if self.storage_type == self.STORAGE_TYPE_GPG:
 | 
			
		||||
                fname += ".gpg"
 | 
			
		||||
 | 
			
		||||
@ -238,7 +226,11 @@ class Document(models.Model):
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def file_name(self):
 | 
			
		||||
        return slugify(str(self)) + "." + self.file_type
 | 
			
		||||
        return slugify(str(self)) + self.file_type
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def file_type(self):
 | 
			
		||||
        return mimetypes.guess_extension(str(self.mime_type))
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def thumbnail_path(self):
 | 
			
		||||
 | 
			
		||||
@ -6,6 +6,7 @@ import subprocess
 | 
			
		||||
import tempfile
 | 
			
		||||
 | 
			
		||||
import dateparser
 | 
			
		||||
import magic
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.utils import timezone
 | 
			
		||||
 | 
			
		||||
@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_parser_class(doc):
 | 
			
		||||
    """
 | 
			
		||||
    Determine the appropriate parser class based on the file
 | 
			
		||||
    """
 | 
			
		||||
def is_mime_type_supported(mime_type):
 | 
			
		||||
    return get_parser_class_for_mime_type(mime_type) is not None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_parser_class_for_mime_type(mime_type):
 | 
			
		||||
 | 
			
		||||
    options = []
 | 
			
		||||
 | 
			
		||||
@ -48,9 +50,9 @@ def get_parser_class(doc):
 | 
			
		||||
 | 
			
		||||
    for response in document_consumer_declaration.send(None):
 | 
			
		||||
        parser_declaration = response[1]
 | 
			
		||||
        parser_test = parser_declaration["test"]
 | 
			
		||||
        supported_mime_types = parser_declaration["mime_types"]
 | 
			
		||||
 | 
			
		||||
        if parser_test(doc):
 | 
			
		||||
        if mime_type in supported_mime_types:
 | 
			
		||||
            options.append(parser_declaration)
 | 
			
		||||
 | 
			
		||||
    if not options:
 | 
			
		||||
@ -61,6 +63,16 @@ def get_parser_class(doc):
 | 
			
		||||
        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_parser_class(path):
 | 
			
		||||
    """
 | 
			
		||||
    Determine the appropriate parser class based on the file
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    mime_type = magic.from_file(path, mime=True)
 | 
			
		||||
 | 
			
		||||
    return get_parser_class_for_mime_type(mime_type)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
 | 
			
		||||
    environment = os.environ.copy()
 | 
			
		||||
    if settings.CONVERT_MEMORY_LIMIT:
 | 
			
		||||
 | 
			
		||||
@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer):
 | 
			
		||||
            "document_type_id",
 | 
			
		||||
            "title",
 | 
			
		||||
            "content",
 | 
			
		||||
            "file_type",
 | 
			
		||||
            "mime_type",
 | 
			
		||||
            "tags",
 | 
			
		||||
            "tags_id",
 | 
			
		||||
            "checksum",
 | 
			
		||||
 | 
			
		||||
@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
 | 
			
		||||
        dt = DocumentType.objects.create(name="dt", pk=63)
 | 
			
		||||
        tag = Tag.objects.create(name="t", pk=85)
 | 
			
		||||
 | 
			
		||||
        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
 | 
			
		||||
        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")
 | 
			
		||||
 | 
			
		||||
        doc.tags.add(tag)
 | 
			
		||||
 | 
			
		||||
@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
 | 
			
		||||
        with open(filename, "wb") as f:
 | 
			
		||||
            f.write(content)
 | 
			
		||||
 | 
			
		||||
        doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
 | 
			
		||||
        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
 | 
			
		||||
 | 
			
		||||
        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
 | 
			
		||||
            f.write(content_thumbnail)
 | 
			
		||||
@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):
 | 
			
		||||
 | 
			
		||||
    def test_document_actions_not_existing_file(self):
 | 
			
		||||
 | 
			
		||||
        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
 | 
			
		||||
        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
 | 
			
		||||
 | 
			
		||||
        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
 | 
			
		||||
        self.assertEqual(response.status_code, 404)
 | 
			
		||||
@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):
 | 
			
		||||
 | 
			
		||||
    def test_document_filters(self):
 | 
			
		||||
 | 
			
		||||
        doc1 = Document.objects.create(title="none1", checksum="A")
 | 
			
		||||
        doc2 = Document.objects.create(title="none2", checksum="B")
 | 
			
		||||
        doc3 = Document.objects.create(title="none3", checksum="C")
 | 
			
		||||
        doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
 | 
			
		||||
        doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
 | 
			
		||||
        doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")
 | 
			
		||||
 | 
			
		||||
        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
 | 
			
		||||
        tag_2 = Tag.objects.create(name="t2")
 | 
			
		||||
 | 
			
		||||
@ -437,6 +437,18 @@ class FaultyParser(DocumentParser):
 | 
			
		||||
        raise ParseError("Does not compute.")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def fake_magic_from_file(file, mime=False):
 | 
			
		||||
 | 
			
		||||
    if mime:
 | 
			
		||||
        if os.path.splitext(file)[1] == ".pdf":
 | 
			
		||||
            return "application/pdf"
 | 
			
		||||
        else:
 | 
			
		||||
            return "unknown"
 | 
			
		||||
    else:
 | 
			
		||||
        return "A verbose string that describes the contents of the file"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 | 
			
		||||
class TestConsumer(TestCase):
 | 
			
		||||
 | 
			
		||||
    def make_dummy_parser(self, path, logging_group):
 | 
			
		||||
@ -462,7 +474,7 @@ class TestConsumer(TestCase):
 | 
			
		||||
        m = patcher.start()
 | 
			
		||||
        m.return_value = [(None, {
 | 
			
		||||
            "parser": self.make_dummy_parser,
 | 
			
		||||
            "test": lambda _: True,
 | 
			
		||||
            "mime_types": ["application/pdf"],
 | 
			
		||||
            "weight": 0
 | 
			
		||||
        })]
 | 
			
		||||
 | 
			
		||||
@ -592,7 +604,7 @@ class TestConsumer(TestCase):
 | 
			
		||||
    def testFaultyParser(self, m):
 | 
			
		||||
        m.return_value = [(None, {
 | 
			
		||||
            "parser": self.make_faulty_parser,
 | 
			
		||||
            "test": lambda _: True,
 | 
			
		||||
            "mime_types": ["application/pdf"],
 | 
			
		||||
            "weight": 0
 | 
			
		||||
        })]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -13,9 +13,12 @@ class TestDocument(TestCase):
 | 
			
		||||
            title="Title",
 | 
			
		||||
            content="content",
 | 
			
		||||
            checksum="checksum",
 | 
			
		||||
            mime_type="application/pdf"
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        file_path = document.source_path
 | 
			
		||||
        thumb_path = document.thumbnail_path
 | 
			
		||||
 | 
			
		||||
        with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
 | 
			
		||||
            document.delete()
 | 
			
		||||
            mock_unlink.assert_any_call(file_path)
 | 
			
		||||
 | 
			
		||||
@ -31,7 +31,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="")
 | 
			
		||||
    def test_generate_source_filename(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -44,7 +44,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
 | 
			
		||||
    def test_file_renaming(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -81,7 +81,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
 | 
			
		||||
    def test_file_renaming_missing_permissions(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -111,10 +111,10 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
 | 
			
		||||
    def test_file_renaming_database_error(self):
 | 
			
		||||
 | 
			
		||||
        document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
 | 
			
		||||
        document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
 | 
			
		||||
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.checksum = "BBBBB"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
@ -149,7 +149,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
 | 
			
		||||
    def test_document_delete(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -170,7 +170,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
 | 
			
		||||
    def test_document_delete_nofile(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -179,7 +179,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
 | 
			
		||||
    def test_directory_not_empty(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -206,7 +206,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
 | 
			
		||||
    def test_tags_with_underscore(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -222,7 +222,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
 | 
			
		||||
    def test_tags_with_dash(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -238,7 +238,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
 | 
			
		||||
    def test_tags_malformed(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -254,7 +254,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
 | 
			
		||||
    def test_tags_all(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -269,7 +269,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
 | 
			
		||||
    def test_tags_out_of_bounds(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -284,7 +284,7 @@ class TestDate(TestCase):
 | 
			
		||||
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
 | 
			
		||||
    def test_nested_directory_cleanup(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
        document.save()
 | 
			
		||||
 | 
			
		||||
@ -309,7 +309,7 @@ class TestDate(TestCase):
 | 
			
		||||
    def test_format_none(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.pk = 1
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(generate_filename(document), "0000001.pdf")
 | 
			
		||||
@ -335,7 +335,7 @@ class TestDate(TestCase):
 | 
			
		||||
    def test_invalid_format(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.pk = 1
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(generate_filename(document), "0000001.pdf")
 | 
			
		||||
@ -344,7 +344,7 @@ class TestDate(TestCase):
 | 
			
		||||
    def test_invalid_format_key(self):
 | 
			
		||||
        document = Document()
 | 
			
		||||
        document.pk = 1
 | 
			
		||||
        document.file_type = "pdf"
 | 
			
		||||
        document.mime_type = "application/pdf"
 | 
			
		||||
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(generate_filename(document), "0000001.pdf")
 | 
			
		||||
 | 
			
		||||
@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
 | 
			
		||||
        TestCase.setUp(self)
 | 
			
		||||
        User.objects.create_user(username='test_consumer', password='12345')
 | 
			
		||||
        self.doc_contains = Document.objects.create(
 | 
			
		||||
            content="I contain the keyword.", file_type="pdf")
 | 
			
		||||
            content="I contain the keyword.", mime_type="application/pdf")
 | 
			
		||||
 | 
			
		||||
    def test_tag_applied_any(self):
 | 
			
		||||
        t1 = Tag.objects.create(
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,4 @@
 | 
			
		||||
import os
 | 
			
		||||
from tempfile import TemporaryDirectory
 | 
			
		||||
from unittest import mock
 | 
			
		||||
 | 
			
		||||
@ -5,7 +6,18 @@ from django.test import TestCase
 | 
			
		||||
 | 
			
		||||
from documents.parsers import get_parser_class
 | 
			
		||||
 | 
			
		||||
def fake_magic_from_file(file, mime=False):
 | 
			
		||||
 | 
			
		||||
    if mime:
 | 
			
		||||
        if os.path.splitext(file)[1] == ".pdf":
 | 
			
		||||
            return "application/pdf"
 | 
			
		||||
        else:
 | 
			
		||||
            return "unknown"
 | 
			
		||||
    else:
 | 
			
		||||
        return "A verbose string that describes the contents of the file"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
 | 
			
		||||
class TestParserDiscovery(TestCase):
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.parsers.document_consumer_declaration.send")
 | 
			
		||||
@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase):
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
        m.return_value = (
 | 
			
		||||
            (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
 | 
			
		||||
            (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(
 | 
			
		||||
@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase):
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
        m.return_value = (
 | 
			
		||||
            (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
 | 
			
		||||
            (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
 | 
			
		||||
            (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
 | 
			
		||||
            (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(
 | 
			
		||||
 | 
			
		||||
@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin,
 | 
			
		||||
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
 | 
			
		||||
 | 
			
		||||
    def file_response(self, pk, disposition):
 | 
			
		||||
        # TODO: this should not be necessary here.
 | 
			
		||||
        content_types = {
 | 
			
		||||
            Document.TYPE_PDF: "application/pdf",
 | 
			
		||||
            Document.TYPE_PNG: "image/png",
 | 
			
		||||
            Document.TYPE_JPG: "image/jpeg",
 | 
			
		||||
            Document.TYPE_GIF: "image/gif",
 | 
			
		||||
            Document.TYPE_TIF: "image/tiff",
 | 
			
		||||
            Document.TYPE_CSV: "text/csv",
 | 
			
		||||
            Document.TYPE_MD: "text/markdown",
 | 
			
		||||
            Document.TYPE_TXT: "text/plain"
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        doc = Document.objects.get(id=pk)
 | 
			
		||||
 | 
			
		||||
        if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
 | 
			
		||||
@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin,
 | 
			
		||||
        else:
 | 
			
		||||
            file_handle = GnuPG.decrypted(doc.source_file)
 | 
			
		||||
 | 
			
		||||
        response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
 | 
			
		||||
        response = HttpResponse(file_handle, content_type=doc.mime_type)
 | 
			
		||||
        response["Content-Disposition"] = '{}; filename="{}"'.format(
 | 
			
		||||
            disposition, doc.file_name)
 | 
			
		||||
        return response
 | 
			
		||||
 | 
			
		||||
@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
 | 
			
		||||
 | 
			
		||||
from documents.loggers import LoggingMixin
 | 
			
		||||
from documents.models import Correspondent
 | 
			
		||||
from documents.parsers import is_mime_type_supported
 | 
			
		||||
from paperless_mail.models import MailAccount, MailRule
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin):
 | 
			
		||||
 | 
			
		||||
            title = get_title(message, att, rule)
 | 
			
		||||
 | 
			
		||||
            # TODO: check with parsers what files types are supported
 | 
			
		||||
            if att.content_type == 'application/pdf':
 | 
			
		||||
            if is_mime_type_supported(att.content_type):
 | 
			
		||||
 | 
			
		||||
                os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
 | 
			
		||||
                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,3 @@
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from .parsers import RasterisedDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
 | 
			
		||||
    return {
 | 
			
		||||
        "parser": RasterisedDocumentParser,
 | 
			
		||||
        "weight": 0,
 | 
			
		||||
        "test": tesseract_consumer_test
 | 
			
		||||
        "mime_types": [
 | 
			
		||||
            "application/pdf",
 | 
			
		||||
            "image/jpeg",
 | 
			
		||||
            "image/png"
 | 
			
		||||
        ]
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def tesseract_consumer_test(doc):
 | 
			
		||||
    return MATCHING_FILES.match(doc.lower())
 | 
			
		||||
 | 
			
		||||
@ -1,36 +0,0 @@
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
 | 
			
		||||
from paperless_tesseract.signals import tesseract_consumer_test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SignalsTestCase(TestCase):
 | 
			
		||||
 | 
			
		||||
    def test_test_handles_various_file_names_true(self):
 | 
			
		||||
 | 
			
		||||
        prefixes = (
 | 
			
		||||
            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
 | 
			
		||||
            "A document with a . in it", "Doc with -- in it"
 | 
			
		||||
        )
 | 
			
		||||
        suffixes = (
 | 
			
		||||
            "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
 | 
			
		||||
            "PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
 | 
			
		||||
            "pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        for prefix in prefixes:
 | 
			
		||||
            for suffix in suffixes:
 | 
			
		||||
                name = "{}.{}".format(prefix, suffix)
 | 
			
		||||
                self.assertTrue(tesseract_consumer_test(name))
 | 
			
		||||
 | 
			
		||||
    def test_test_handles_various_file_names_false(self):
 | 
			
		||||
 | 
			
		||||
        prefixes = ("doc",)
 | 
			
		||||
        suffixes = ("txt", "markdown", "",)
 | 
			
		||||
 | 
			
		||||
        for prefix in prefixes:
 | 
			
		||||
            for suffix in suffixes:
 | 
			
		||||
                name = "{}.{}".format(prefix, suffix)
 | 
			
		||||
                self.assertFalse(tesseract_consumer_test(name))
 | 
			
		||||
 | 
			
		||||
        self.assertFalse(tesseract_consumer_test(""))
 | 
			
		||||
        self.assertFalse(tesseract_consumer_test("doc"))
 | 
			
		||||
@ -1,5 +1,3 @@
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from .parsers import TextDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
 | 
			
		||||
    return {
 | 
			
		||||
        "parser": TextDocumentParser,
 | 
			
		||||
        "weight": 10,
 | 
			
		||||
        "test": text_consumer_test
 | 
			
		||||
        "mime_types": [
 | 
			
		||||
            "text/plain",
 | 
			
		||||
            "text/comma-separated-values"
 | 
			
		||||
        ]
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def text_consumer_test(doc):
 | 
			
		||||
    return MATCHING_FILES.match(doc.lower())
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user