mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Merge branch 'refactor-file-info-extraction' of https://github.com/tikitu/paperless into tikitu-refactor-file-info-extraction
This commit is contained in:
		
						commit
						cf5076bcad
					
				| @ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError | ||||
| 
 | ||||
| from paperless.db import GnuPG | ||||
| 
 | ||||
| from .models import Correspondent, Tag, Document, Log | ||||
| from .models import Correspondent, Tag, Document, Log, FileInfo | ||||
| from .languages import ISO639 | ||||
| 
 | ||||
| 
 | ||||
| @ -54,19 +54,6 @@ class Consumer(object): | ||||
| 
 | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
| 
 | ||||
|     REGEX_TITLE = re.compile( | ||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|     REGEX_CORRESPONDENT_TITLE = re.compile( | ||||
|         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|     REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( | ||||
|         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
| 
 | ||||
|     def __init__(self): | ||||
| 
 | ||||
|         self.logger = logging.getLogger(__name__) | ||||
| @ -105,7 +92,7 @@ class Consumer(object): | ||||
|             if not os.path.isfile(doc): | ||||
|                 continue | ||||
| 
 | ||||
|             if not re.match(self.REGEX_TITLE, doc): | ||||
|             if not re.match(FileInfo.REGEX_TITLE, doc): | ||||
|                 continue | ||||
| 
 | ||||
|             if doc in self._ignore: | ||||
| @ -269,72 +256,20 @@ class Consumer(object): | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return re.sub(r"\s+", " ", r) | ||||
| 
 | ||||
|     def _guess_attributes_from_name(self, parseable): | ||||
|         """ | ||||
|         We use a crude naming convention to make handling the correspondent, | ||||
|         title, and tags easier: | ||||
|           "<correspondent> - <title> - <tags>.<suffix>" | ||||
|           "<correspondent> - <title>.<suffix>" | ||||
|           "<title>.<suffix>" | ||||
|         """ | ||||
| 
 | ||||
|         def get_correspondent(correspondent_name): | ||||
|             return Correspondent.objects.get_or_create( | ||||
|                 name=correspondent_name, | ||||
|                 defaults={"slug": slugify(correspondent_name)} | ||||
|             )[0] | ||||
| 
 | ||||
|         def get_tags(tags): | ||||
|             r = [] | ||||
|             for t in tags.split(","): | ||||
|                 r.append( | ||||
|                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) | ||||
|             return tuple(r) | ||||
| 
 | ||||
|         def get_suffix(suffix): | ||||
|             suffix = suffix.lower() | ||||
|             if suffix == "jpeg": | ||||
|                 return "jpg" | ||||
|             return suffix | ||||
| 
 | ||||
|         # First attempt: "<correspondent> - <title> - <tags>.<suffix>" | ||||
|         m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) | ||||
|         if m: | ||||
|             return ( | ||||
|                 get_correspondent(m.group(1)), | ||||
|                 m.group(2), | ||||
|                 get_tags(m.group(3)), | ||||
|                 get_suffix(m.group(4)) | ||||
|             ) | ||||
| 
 | ||||
|         # Second attempt: "<correspondent> - <title>.<suffix>" | ||||
|         m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) | ||||
|         if m: | ||||
|             return ( | ||||
|                 get_correspondent(m.group(1)), | ||||
|                 m.group(2), | ||||
|                 (), | ||||
|                 get_suffix(m.group(3)) | ||||
|             ) | ||||
| 
 | ||||
|         # That didn't work, so we assume correspondent and tags are None | ||||
|         m = re.match(self.REGEX_TITLE, parseable) | ||||
|         return None, m.group(1), (), get_suffix(m.group(2)) | ||||
| 
 | ||||
|     def _store(self, text, doc, thumbnail): | ||||
| 
 | ||||
|         sender, title, tags, file_type = self._guess_attributes_from_name(doc) | ||||
|         relevant_tags = set(list(Tag.match_all(text)) + list(tags)) | ||||
|         file_info = FileInfo.from_path(doc) | ||||
|         relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags)) | ||||
| 
 | ||||
|         stats = os.stat(doc) | ||||
| 
 | ||||
|         self.log("debug", "Saving record to database") | ||||
| 
 | ||||
|         document = Document.objects.create( | ||||
|             correspondent=sender, | ||||
|             title=title, | ||||
|             correspondent=file_info.correspondent, | ||||
|             title=file_info.title, | ||||
|             content=text, | ||||
|             file_type=file_type, | ||||
|             file_type=file_info.suffix, | ||||
|             created=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)), | ||||
|             modified=timezone.make_aware( | ||||
|  | ||||
| @ -12,6 +12,97 @@ from django.utils import timezone | ||||
| from .managers import LogManager | ||||
| 
 | ||||
| 
 | ||||
class FileInfo(object):
    """
    Attributes of a document as guessed from its file name.

    Instances carry a title and a file suffix, plus an optional
    correspondent and a collection of tags.  Use ``from_path`` to build
    one from a path that follows the naming convention described there.
    """

    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

    def __init__(self, title, suffix,
                 correspondent=None, tags=None):
        self._title = title
        self._suffix = suffix
        self._correspondent = correspondent
        self._tags = tags

    @classmethod
    def from_path(cls, path):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        The patterns are tried from most to least specific and the first
        match wins.  Matching a correspondent or tags looks them up with
        ``get_or_create``, so missing records are created as a side effect.

        The path must at least match ``REGEX_TITLE``; when it does not, the
        final fallback fails with ``AttributeError`` because there is no
        match object to read groups from.

        Returns an instance of ``cls`` (so subclasses get instances of
        themselves) whose ``suffix`` is lower-cased, with "jpeg" normalised
        to "jpg".
        """

        def get_correspondent(correspondent_name):
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]

        def get_tags(tags):
            # Tags arrive as a single comma-separated string.
            return tuple(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
                for t in tags.split(",")
            )

        def get_suffix(suffix):
            suffix = suffix.lower()
            if suffix == "jpeg":
                return "jpg"
            return suffix

        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
        m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=get_tags(m.group(3)),
                suffix=get_suffix(m.group(4))
            )

        # Second attempt: "<correspondent> - <title>.<suffix>"
        m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=(),
                suffix=get_suffix(m.group(3))
            )

        # That didn't work, so we assume correspondent and tags are None.
        # Build through cls, like the branches above, so that subclasses
        # calling from_path() get an instance of their own type.
        m = re.match(cls.REGEX_TITLE, path)
        return cls(
            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))

    @property
    def title(self):
        return self._title

    @property
    def correspondent(self):
        return self._correspondent

    @property
    def tags(self):
        return self._tags

    @property
    def suffix(self):
        return self._suffix
| 
 | ||||
| class SluggedModel(models.Model): | ||||
| 
 | ||||
|     name = models.CharField(max_length=128, unique=True) | ||||
|  | ||||
| @ -1,12 +1,11 @@ | ||||
| from django.test import TestCase | ||||
| 
 | ||||
| from ..consumer import Consumer | ||||
| from ..models import FileInfo | ||||
| 
 | ||||
| 
 | ||||
| class TestAttachment(TestCase): | ||||
| 
 | ||||
|     TAGS = ("tag1", "tag2", "tag3") | ||||
|     CONSUMER = Consumer() | ||||
|     SUFFIXES = ( | ||||
|         "pdf", "png", "jpg", "jpeg", "gif", | ||||
|         "PDF", "PNG", "JPG", "JPEG", "GIF", | ||||
| @ -16,14 +15,14 @@ class TestAttachment(TestCase): | ||||
|     def _test_guess_attributes_from_name(self, path, sender, title, tags): | ||||
|         for suffix in self.SUFFIXES: | ||||
|             f = path.format(suffix) | ||||
|             results = self.CONSUMER._guess_attributes_from_name(f) | ||||
|             self.assertEqual(results[0].name, sender, f) | ||||
|             self.assertEqual(results[1], title, f) | ||||
|             self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) | ||||
|             file_info = FileInfo.from_path(f) | ||||
|             self.assertEqual(file_info.correspondent.name, sender, f) | ||||
|             self.assertEqual(file_info.title, title, f) | ||||
|             self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) | ||||
|             if suffix.lower() == "jpeg": | ||||
|                 self.assertEqual(results[3], "jpg", f) | ||||
|                 self.assertEqual(file_info.suffix, "jpg", f) | ||||
|             else: | ||||
|                 self.assertEqual(results[3], suffix.lower(), f) | ||||
|                 self.assertEqual(file_info.suffix, suffix.lower(), f) | ||||
| 
 | ||||
|     def test_guess_attributes_from_name0(self): | ||||
|         self._test_guess_attributes_from_name( | ||||
| @ -92,3 +91,95 @@ class TestAttachment(TestCase): | ||||
|             "Τιτλε", | ||||
|             self.TAGS | ||||
|         ) | ||||
| 
 | ||||
|     def test_guess_attributes_from_name_when_correspondent_empty(self): | ||||
|         self._test_guess_attributes_from_name( | ||||
|             '/path/to/ - weird empty correspondent but should not break.{}', | ||||
|             None, | ||||
|             ' - weird empty correspondent but should not break', | ||||
|             () | ||||
|         ) | ||||
| 
 | ||||
|     def test_guess_attributes_from_name_when_title_starts_with_dash(self): | ||||
|         self._test_guess_attributes_from_name( | ||||
|             '/path/to/- weird but should not break.{}', | ||||
|             None, | ||||
|             '- weird but should not break', | ||||
|             () | ||||
|         ) | ||||
| 
 | ||||
|     def test_guess_attributes_from_name_when_title_ends_with_dash(self): | ||||
|         self._test_guess_attributes_from_name( | ||||
|             '/path/to/weird but should not break -.{}', | ||||
|             None, | ||||
|             'weird but should not break -', | ||||
|             () | ||||
|         ) | ||||
| 
 | ||||
|     def test_guess_attributes_from_name_when_title_is_empty(self): | ||||
|         self._test_guess_attributes_from_name( | ||||
|             '/path/to/weird correspondent but should not break - .{}', | ||||
|             'weird correspondent but should not break', | ||||
|             '', | ||||
|             () | ||||
|         ) | ||||
| 
 | ||||
| 
 | ||||
class Permutations(TestCase):
    """
    Run FileInfo.from_path over every combination of the sample
    correspondents, titles, tags and suffixes listed below, once for each
    of the three supported file name layouts.
    """

    valid_correspondents = ['timmy', 'Dr. McWheelie',
                            'Dash Gor-don', 'ο Θερμαστής']
    valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
    valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
    valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']

    def _test_guessed_attributes(
            self, filename, title, suffix, correspondent=None, tags=None):
        """
        Parse ``filename`` and compare every guessed attribute with the
        expected value; ``filename`` doubles as the failure message.
        """
        parsed = FileInfo.from_path(filename)

        # Title and suffix are always checked; a "jpeg" suffix is expected
        # to come back as "jpg".
        self.assertEqual(parsed.title, title, filename)
        expected_suffix = 'jpg' if suffix == 'jpeg' else suffix
        self.assertEqual(parsed.suffix, expected_suffix, filename)

        # Correspondent: compared by name when one is expected, otherwise
        # the attribute itself must equal None.
        if correspondent is not None:
            self.assertEqual(parsed.correspondent.name,
                             correspondent, filename)
        else:
            self.assertEqual(parsed.correspondent,
                             correspondent, filename)

        # Tags: compared by slug when expected, otherwise an empty tuple.
        if tags is not None:
            self.assertEqual([t.slug for t in parsed.tags],
                             tags.split(','),
                             filename)
        else:
            self.assertEqual(parsed.tags, (), filename)

    def test_just_title(self):
        """Layout: "<title>.<suffix>"."""
        layout = '/path/to/{title}.{suffix}'
        cases = [
            dict(title=title, suffix=suffix)
            for title in self.valid_titles
            for suffix in self.valid_suffixes
        ]
        for case in cases:
            self._test_guessed_attributes(layout.format(**case), **case)

    def test_title_and_correspondent(self):
        """Layout: "<correspondent> - <title>.<suffix>"."""
        layout = '/path/to/{correspondent} - {title}.{suffix}'
        cases = [
            dict(correspondent=correspondent, title=title, suffix=suffix)
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for suffix in self.valid_suffixes
        ]
        for case in cases:
            self._test_guessed_attributes(layout.format(**case), **case)

    def test_title_and_correspondent_and_tags(self):
        """Layout: "<correspondent> - <title> - <tags>.<suffix>"."""
        layout = '/path/to/{correspondent} - {title} - {tags}.{suffix}'
        cases = [
            dict(correspondent=correspondent, title=title,
                 tags=tags, suffix=suffix)
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for tags in self.valid_tags
            for suffix in self.valid_suffixes
        ]
        for case in cases:
            self._test_guessed_attributes(layout.format(**case), **case)
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user