mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Updates ignore path filtering so files in a folder in an ignored folder will be ignored correctly
This commit is contained in:
		
							parent
							
								
									0d1a8d6d2f
								
							
						
					
					
						commit
						c08f0054da
					
				@ -999,13 +999,20 @@ within your documents.
 | 
				
			|||||||
`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
 | 
					`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: By default, paperless ignores certain files and folders in the
 | 
					: By default, paperless ignores certain files and folders in the
 | 
				
			||||||
consumption directory, such as system files created by the Mac OS.
 | 
					consumption directory, such as system files created by the Mac OS
 | 
				
			||||||
 | 
					or hidden folders some tools use to store data.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    This can be adjusted by configuring a custom json array with
 | 
					    This can be adjusted by configuring a custom json array with
 | 
				
			||||||
    patterns to exclude.
 | 
					    patterns to exclude.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    For example, `.DS_STORE/*` will ignore any files found in a folder
 | 
				
			||||||
 | 
					    named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    A pattern like `._*` will ignore anything starting with `._`, including:
 | 
				
			||||||
 | 
					    `._foo.pdf` and `._bar/foo.pdf`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Defaults to
 | 
					    Defaults to
 | 
				
			||||||
    `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]`.
 | 
					    `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Binaries
 | 
					## Binaries
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -1,5 +1,6 @@
 | 
				
			|||||||
import logging
 | 
					import logging
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					from fnmatch import filter
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from pathlib import PurePath
 | 
					from pathlib import PurePath
 | 
				
			||||||
from threading import Event
 | 
					from threading import Event
 | 
				
			||||||
@ -7,6 +8,7 @@ from threading import Thread
 | 
				
			|||||||
from time import monotonic
 | 
					from time import monotonic
 | 
				
			||||||
from time import sleep
 | 
					from time import sleep
 | 
				
			||||||
from typing import Final
 | 
					from typing import Final
 | 
				
			||||||
 | 
					from typing import Set
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
from django.core.management.base import BaseCommand
 | 
					from django.core.management.base import BaseCommand
 | 
				
			||||||
@ -25,15 +27,15 @@ except ImportError:  # pragma: nocover
 | 
				
			|||||||
logger = logging.getLogger("paperless.management.consumer")
 | 
					logger = logging.getLogger("paperless.management.consumer")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _tags_from_path(filepath):
 | 
					def _tags_from_path(filepath) -> Set[Tag]:
 | 
				
			||||||
    """Walk up the directory tree from filepath to CONSUMPTION_DIR
 | 
					    """
 | 
				
			||||||
    and get or create Tag IDs for every directory.
 | 
					    Walk up the directory tree from filepath to CONSUMPTION_DIR
 | 
				
			||||||
 | 
					    and get or create Tag IDs for every directory.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Returns set of Tag models
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    normalized_consumption_dir = os.path.abspath(
 | 
					 | 
				
			||||||
        os.path.normpath(settings.CONSUMPTION_DIR),
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    tag_ids = set()
 | 
					    tag_ids = set()
 | 
				
			||||||
    path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
 | 
					    path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
 | 
				
			||||||
    for part in path_parts:
 | 
					    for part in path_parts:
 | 
				
			||||||
        tag_ids.add(
 | 
					        tag_ids.add(
 | 
				
			||||||
            Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
 | 
					            Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
 | 
				
			||||||
@ -43,14 +45,41 @@ def _tags_from_path(filepath):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _is_ignored(filepath: str) -> bool:
 | 
					def _is_ignored(filepath: str) -> bool:
 | 
				
			||||||
    normalized_consumption_dir = os.path.abspath(
 | 
					    """
 | 
				
			||||||
        os.path.normpath(settings.CONSUMPTION_DIR),
 | 
					    Checks if the given file should be ignored, based on configured
 | 
				
			||||||
 | 
					    patterns.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Returns True if the file is ignored, False otherwise
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    filepath = os.path.abspath(
 | 
				
			||||||
 | 
					        os.path.normpath(filepath),
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
 | 
					
 | 
				
			||||||
    return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
 | 
					    # Trim out the consume directory, leaving only filename and its
 | 
				
			||||||
 | 
					    # path relative to the consume directory
 | 
				
			||||||
 | 
					    filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # March through the components of the path, including directories and the filename
 | 
				
			||||||
 | 
					    # looking for anything matching
 | 
				
			||||||
 | 
					    # foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
 | 
				
			||||||
 | 
					    parts = []
 | 
				
			||||||
 | 
					    for part in filepath_relative.parts:
 | 
				
			||||||
 | 
					        # If the part is not the name (i.e., it's a directory)
 | 
				
			||||||
 | 
					        # Need to append the trailing slash or fnmatch doesn't match
 | 
				
			||||||
 | 
					        # fnmatch("dir", "dir/*") == False
 | 
				
			||||||
 | 
					        # fnmatch("dir/", "dir/*") == True
 | 
				
			||||||
 | 
					        if part != filepath_relative.name:
 | 
				
			||||||
 | 
					            part = part + "/"
 | 
				
			||||||
 | 
					        parts.append(part)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for pattern in settings.CONSUMER_IGNORE_PATTERNS:
 | 
				
			||||||
 | 
					        if len(filter(parts, pattern)):
 | 
				
			||||||
 | 
					            return True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _consume(filepath):
 | 
					def _consume(filepath: str) -> None:
 | 
				
			||||||
    if os.path.isdir(filepath) or _is_ignored(filepath):
 | 
					    if os.path.isdir(filepath) or _is_ignored(filepath):
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -103,7 +132,13 @@ def _consume(filepath):
 | 
				
			|||||||
        logger.exception("Error while consuming document")
 | 
					        logger.exception("Error while consuming document")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _consume_wait_unmodified(file):
 | 
					def _consume_wait_unmodified(file: str) -> None:
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Waits for the given file to appear unmodified based on file size
 | 
				
			||||||
 | 
					    and modification time.  Will wait a configured number of seconds
 | 
				
			||||||
 | 
					    and retry a configured number of times before either consuming or
 | 
				
			||||||
 | 
					    giving up
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
    if _is_ignored(file):
 | 
					    if _is_ignored(file):
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -247,22 +247,85 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    def test_is_ignored(self):
 | 
					    def test_is_ignored(self):
 | 
				
			||||||
        test_paths = [
 | 
					        test_paths = [
 | 
				
			||||||
            (os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
 | 
					            {
 | 
				
			||||||
            (os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
 | 
					                "path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
 | 
				
			||||||
            (os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
 | 
					                "ignore": False,
 | 
				
			||||||
            (
 | 
					            },
 | 
				
			||||||
                os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
 | 
					            {
 | 
				
			||||||
                True,
 | 
					                "path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
 | 
				
			||||||
            ),
 | 
					                "ignore": False,
 | 
				
			||||||
            (os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
 | 
					            },
 | 
				
			||||||
            (os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
 | 
					            {
 | 
				
			||||||
            (os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
 | 
					                "path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(
 | 
				
			||||||
 | 
					                    self.dirs.consumption_dir,
 | 
				
			||||||
 | 
					                    "foo",
 | 
				
			||||||
 | 
					                    ".DS_STORE",
 | 
				
			||||||
 | 
					                    "bar.pdf",
 | 
				
			||||||
 | 
					                ),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(
 | 
				
			||||||
 | 
					                    self.dirs.consumption_dir,
 | 
				
			||||||
 | 
					                    ".DS_STORE",
 | 
				
			||||||
 | 
					                    "foo",
 | 
				
			||||||
 | 
					                    "bar.pdf",
 | 
				
			||||||
 | 
					                ),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
 | 
				
			||||||
 | 
					                "ignore": False,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(
 | 
				
			||||||
 | 
					                    self.dirs.consumption_dir,
 | 
				
			||||||
 | 
					                    ".stversions",
 | 
				
			||||||
 | 
					                    "foo.pdf",
 | 
				
			||||||
 | 
					                ),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
 | 
				
			||||||
 | 
					                "ignore": False,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
 | 
				
			||||||
 | 
					                "ignore": False,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "path": os.path.join(
 | 
				
			||||||
 | 
					                    self.dirs.consumption_dir,
 | 
				
			||||||
 | 
					                    "@eaDir",
 | 
				
			||||||
 | 
					                    "SYNO@.fileindexdb",
 | 
				
			||||||
 | 
					                    "_1jk.fnm",
 | 
				
			||||||
 | 
					                ),
 | 
				
			||||||
 | 
					                "ignore": True,
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
        for file_path, expected_ignored in test_paths:
 | 
					        for test_setup in test_paths:
 | 
				
			||||||
 | 
					            filepath = test_setup["path"]
 | 
				
			||||||
 | 
					            expected_ignored_result = test_setup["ignore"]
 | 
				
			||||||
            self.assertEqual(
 | 
					            self.assertEqual(
 | 
				
			||||||
                expected_ignored,
 | 
					                expected_ignored_result,
 | 
				
			||||||
                document_consumer._is_ignored(file_path),
 | 
					                document_consumer._is_ignored(filepath),
 | 
				
			||||||
                f'_is_ignored("{file_path}") != {expected_ignored}',
 | 
					                f'_is_ignored("{filepath}") != {expected_ignored_result}',
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("documents.management.commands.document_consumer.open")
 | 
					    @mock.patch("documents.management.commands.document_consumer.open")
 | 
				
			||||||
 | 
				
			|||||||
@ -673,7 +673,7 @@ CONSUMER_IGNORE_PATTERNS = list(
 | 
				
			|||||||
    json.loads(
 | 
					    json.loads(
 | 
				
			||||||
        os.getenv(
 | 
					        os.getenv(
 | 
				
			||||||
            "PAPERLESS_CONSUMER_IGNORE_PATTERNS",
 | 
					            "PAPERLESS_CONSUMER_IGNORE_PATTERNS",
 | 
				
			||||||
            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]',  # noqa: E501
 | 
					            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]',  # noqa: E501
 | 
				
			||||||
        ),
 | 
					        ),
 | 
				
			||||||
    ),
 | 
					    ),
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user