mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Merge branch 'jat255-ENH_text_consumer'
This commit is contained in:
		
						commit
						75648cc74b
					
				| @ -1,24 +1,24 @@ | |||||||
| # coding=utf-8 | # coding=utf-8 | ||||||
| 
 | 
 | ||||||
| import dateutil.parser |  | ||||||
| import logging | import logging | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import uuid | import uuid | ||||||
| 
 |  | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
|  | 
 | ||||||
|  | import dateutil.parser | ||||||
|  | from django.conf import settings | ||||||
|  | from django.db import models | ||||||
|  | from django.template.defaultfilters import slugify | ||||||
|  | from django.utils import timezone | ||||||
| from fuzzywuzzy import fuzz | from fuzzywuzzy import fuzz | ||||||
| 
 | 
 | ||||||
| from django.conf import settings | from .managers import LogManager | ||||||
|  | 
 | ||||||
| try: | try: | ||||||
|     from django.core.urlresolvers import reverse |     from django.core.urlresolvers import reverse | ||||||
| except ImportError: | except ImportError: | ||||||
|     from django.urls import reverse |     from django.urls import reverse | ||||||
| from django.db import models |  | ||||||
| from django.template.defaultfilters import slugify |  | ||||||
| from django.utils import timezone |  | ||||||
| 
 |  | ||||||
| from .managers import LogManager |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MatchingModel(models.Model): | class MatchingModel(models.Model): | ||||||
| @ -192,7 +192,11 @@ class Document(models.Model): | |||||||
|     TYPE_JPG = "jpg" |     TYPE_JPG = "jpg" | ||||||
|     TYPE_GIF = "gif" |     TYPE_GIF = "gif" | ||||||
|     TYPE_TIF = "tiff" |     TYPE_TIF = "tiff" | ||||||
|     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) |     TYPE_TXT = "txt" | ||||||
|  |     TYPE_CSV = "csv" | ||||||
|  |     TYPE_MD = "md" | ||||||
|  |     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, | ||||||
|  |              TYPE_TXT, TYPE_CSV, TYPE_MD) | ||||||
| 
 | 
 | ||||||
|     STORAGE_TYPE_UNENCRYPTED = "unencrypted" |     STORAGE_TYPE_UNENCRYPTED = "unencrypted" | ||||||
|     STORAGE_TYPE_GPG = "gpg" |     STORAGE_TYPE_GPG = "gpg" | ||||||
| @ -365,51 +369,52 @@ class FileInfo: | |||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  |     formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" | ||||||
|     REGEXES = OrderedDict([ |     REGEXES = OrderedDict([ | ||||||
|         ("created-correspondent-title-tags", re.compile( |         ("created-correspondent-title-tags", re.compile( | ||||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " |             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||||
|             r"(?P<correspondent>.*) - " |             r"(?P<correspondent>.*) - " | ||||||
|             r"(?P<title>.*) - " |             r"(?P<title>.*) - " | ||||||
|             r"(?P<tags>[a-z0-9\-,]*)" |             r"(?P<tags>[a-z0-9\-,]*)" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )), |         )), | ||||||
|         ("created-title-tags", re.compile( |         ("created-title-tags", re.compile( | ||||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " |             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||||
|             r"(?P<title>.*) - " |             r"(?P<title>.*) - " | ||||||
|             r"(?P<tags>[a-z0-9\-,]*)" |             r"(?P<tags>[a-z0-9\-,]*)" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )), |         )), | ||||||
|         ("created-correspondent-title", re.compile( |         ("created-correspondent-title", re.compile( | ||||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " |             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||||
|             r"(?P<correspondent>.*) - " |             r"(?P<correspondent>.*) - " | ||||||
|             r"(?P<title>.*)" |             r"(?P<title>.*)" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )), |         )), | ||||||
|         ("created-title", re.compile( |         ("created-title", re.compile( | ||||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " |             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||||
|             r"(?P<title>.*)" |             r"(?P<title>.*)" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )), |         )), | ||||||
|         ("correspondent-title-tags", re.compile( |         ("correspondent-title-tags", re.compile( | ||||||
|             r"(?P<correspondent>.*) - " |             r"(?P<correspondent>.*) - " | ||||||
|             r"(?P<title>.*) - " |             r"(?P<title>.*) - " | ||||||
|             r"(?P<tags>[a-z0-9\-,]*)" |             r"(?P<tags>[a-z0-9\-,]*)" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )), |         )), | ||||||
|         ("correspondent-title", re.compile( |         ("correspondent-title", re.compile( | ||||||
|             r"(?P<correspondent>.*) - " |             r"(?P<correspondent>.*) - " | ||||||
|             r"(?P<title>.*)?" |             r"(?P<title>.*)?" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )), |         )), | ||||||
|         ("title", re.compile( |         ("title", re.compile( | ||||||
|             r"(?P<title>.*)" |             r"(?P<title>.*)" | ||||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", |             r"\.(?P<extension>{})$".format(formats), | ||||||
|             flags=re.IGNORECASE |             flags=re.IGNORECASE | ||||||
|         )) |         )) | ||||||
|     ]) |     ]) | ||||||
|  | |||||||
| @ -1,9 +1,25 @@ | |||||||
| import logging | import logging | ||||||
| import shutil | import shutil | ||||||
| import tempfile | import tempfile | ||||||
|  | import re | ||||||
| 
 | 
 | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| 
 | 
 | ||||||
|  | # This regular expression will try to find dates in the document at | ||||||
|  | # hand and will match the following formats: | ||||||
|  | # - XX.YY.ZZZZ with XX and YY each 1 or 2 digits and ZZZZ 2 or 4 digits | ||||||
|  | # - XX/YY/ZZZZ with XX and YY each 1 or 2 digits and ZZZZ 2 or 4 digits | ||||||
|  | # - XX-YY-ZZZZ with XX and YY each 1 or 2 digits and ZZZZ 2 or 4 digits | ||||||
|  | # - XX. MONTH ZZZZ with XX being 1 or 2 digits and ZZZZ 2 or 4 digits | ||||||
|  | # - MONTH ZZZZ with ZZZZ being 4 digits | ||||||
|  | # - MONTH XX, ZZZZ with XX being 1 or 2 digits and ZZZZ 4 digits | ||||||
|  | DATE_REGEX = re.compile( | ||||||
|  |     r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + | ||||||
|  |     r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + | ||||||
|  |     r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + | ||||||
|  |     r'\b([^\W\d_]{3,9} [0-9]{4})\b' | ||||||
|  | ) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class ParseError(Exception): | class ParseError(Exception): | ||||||
|     pass |     pass | ||||||
|  | |||||||
| @ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): | |||||||
|             Document.TYPE_JPG: "image/jpeg", |             Document.TYPE_JPG: "image/jpeg", | ||||||
|             Document.TYPE_GIF: "image/gif", |             Document.TYPE_GIF: "image/gif", | ||||||
|             Document.TYPE_TIF: "image/tiff", |             Document.TYPE_TIF: "image/tiff", | ||||||
|  |             Document.TYPE_CSV: "text/csv", | ||||||
|  |             Document.TYPE_MD:  "text/markdown", | ||||||
|  |             Document.TYPE_TXT: "text/plain" | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         if self.kwargs["kind"] == "thumb": |         if self.kwargs["kind"] == "thumb": | ||||||
|  | |||||||
| @ -67,6 +67,7 @@ INSTALLED_APPS = [ | |||||||
|     "documents.apps.DocumentsConfig", |     "documents.apps.DocumentsConfig", | ||||||
|     "reminders.apps.RemindersConfig", |     "reminders.apps.RemindersConfig", | ||||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", |     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||||
|  |     "paperless_text.apps.PaperlessTextConfig", | ||||||
| 
 | 
 | ||||||
|     "django.contrib.admin", |     "django.contrib.admin", | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ | |||||||
| from pyocr.tesseract import TesseractError | from pyocr.tesseract import TesseractError | ||||||
| 
 | 
 | ||||||
| import pdftotext | import pdftotext | ||||||
| from documents.parsers import DocumentParser, ParseError | from documents.parsers import DocumentParser, ParseError, DATE_REGEX | ||||||
| 
 | 
 | ||||||
| from .languages import ISO639 | from .languages import ISO639 | ||||||
| 
 | 
 | ||||||
| @ -210,22 +210,8 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         except ParseError as e: |         except ParseError as e: | ||||||
|             return None |             return None | ||||||
| 
 | 
 | ||||||
|         # This regular expression will try to find dates in the document at |  | ||||||
|         # hand and will match the following formats: |  | ||||||
|         # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits |  | ||||||
|         # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits |  | ||||||
|         # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits |  | ||||||
|         # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits |  | ||||||
|         # - MONTH ZZZZ, with ZZZZ being 4 digits |  | ||||||
|         # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits |  | ||||||
|         pattern = re.compile( |  | ||||||
|             r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + |  | ||||||
|             r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + |  | ||||||
|             r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + |  | ||||||
|             r'\b([^\W\d_]{3,9} [0-9]{4})\b') |  | ||||||
| 
 |  | ||||||
|         # Iterate through all regex matches and try to parse the date |         # Iterate through all regex matches and try to parse the date | ||||||
|         for m in re.finditer(pattern, text): |         for m in re.finditer(DATE_REGEX, text): | ||||||
|             datestring = m.group(0) |             datestring = m.group(0) | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|  | |||||||
							
								
								
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,16 @@ | |||||||
|  | from django.apps import AppConfig | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class PaperlessTextConfig(AppConfig):
    """
    App configuration for the ``paperless_text`` package.

    When the app is ready, the package's ConsumerDeclaration handler is
    connected to the ``document_consumer_declaration`` signal of the
    documents app.
    """

    name = "paperless_text"

    def ready(self):
        """Connect the text consumer declaration, then defer to AppConfig."""
        # Both imports are kept function-local, as in the original.
        # NOTE(review): presumably because they are only safe to import
        # once the app registry is loaded -- confirm.
        from documents.signals import document_consumer_declaration
        from .signals import ConsumerDeclaration

        receiver = ConsumerDeclaration.handle
        document_consumer_declaration.connect(receiver)

        super().ready()
							
								
								
									
										131
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,131 @@ | |||||||
|  | import os | ||||||
|  | import re | ||||||
|  | import subprocess | ||||||
|  | 
 | ||||||
|  | import dateparser | ||||||
|  | from django.conf import settings | ||||||
|  | 
 | ||||||
|  | from documents.parsers import DocumentParser, ParseError, DATE_REGEX | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)

    The text is read straight from the file at ``self.document_path``; the
    thumbnail is produced by running the binary configured in
    ``settings.CONVERT_BINARY``.
    """

    CONVERT = settings.CONVERT_BINARY
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    def __init__(self, path):
        super().__init__(path)
        # Cache for get_text(); None means "not read yet".
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a txt is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Returns the path of the PNG written below ``self.tempdir``.
        Raises ParseError when one of the convert invocations fails.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951

        import shlex

        bg_color = "white"  # bg color
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show

        output_file = os.path.join(self.tempdir, "convert-txt.png")
        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")

        picsize = "x".join(str(n) for n in psize)
        txsize = "x".join(str(n - 8) for n in psize)
        work_size = ",".join(str(n - 1) for n in psize)
        corner = str(round(psize[0] / 10))

        # The setting used to be pasted into a shell command line, so it
        # may consist of several words; split it the way a shell would.
        convert = shlex.split(self.CONVERT)

        # Only the first n_lines lines are rendered, so stop reading there
        # instead of loading the whole file.
        with open(self.document_path, 'r') as src:
            shown = [line.strip() for _, line in zip(range(n_lines), src)]
        text = "\n".join(shown)

        # Every command is handed over as an argument list and no shell is
        # involved: the document text is untrusted and must never be
        # interpreted as shell syntax ($(...), backticks, quotes, ...).
        # The spaces around the text reproduce what the former shell
        # command line passed to convert.
        _execute(convert + [
            "-background", "none",
            "-fill", text_color,
            "-pointsize", "12",
            "-border", "4",
            "-bordercolor", "none",
            "-size", txsize,
            "caption: " + text + " ",
            temp_txlayer,
        ])

        # Background with rounded corners.
        _execute(convert + [
            "-size", picsize,
            "xc:none",
            "-draw",
            "fill {} roundrectangle 0,0 {} {},{}".format(
                bg_color, work_size, corner, corner),
            temp_bg,
        ])

        # Text layer on top of the background.
        _execute(convert + [
            temp_bg, temp_txlayer,
            "-background", "None",
            "-layers", "merge",
            output_file,
        ])

        return output_file

    def get_text(self):
        """
        Return the full content of the document, reading the file on the
        first call only and serving the cached string afterwards.
        """

        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text

    def get_date(self):
        """
        Return the first DATE_REGEX match in the text that dateparser can
        turn into a date, or None when there is none.

        The outcome is reported through ``self.log`` either way.
        """
        try:
            text = self.get_text()
        except ParseError:
            return None

        date = None
        datestring = None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                             " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date


def _execute(argv):
    """
    Run ``argv`` (a list of program and arguments) without a shell.

    MAGICK_MEMORY_LIMIT and MAGICK_TMPDIR are exported to the child when
    the matching CONVERT_* settings are set. Raises ParseError when the
    program cannot be started or exits with a non-zero status.
    """
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    try:
        exit_code = subprocess.Popen(argv, env=environment).wait()
    except OSError as e:
        # With a shell a missing binary showed up as a non-zero exit code;
        # keep reporting it as the exception type callers already handle.
        raise ParseError("Convert failed at {}".format(argv)) from e

    if exit_code != 0:
        raise ParseError("Convert failed at {}".format(argv))


def run_command(*args):
    """
    Run the command line formed by joining ``args`` with spaces.

    The joined string is split with shell quoting rules, but no shell is
    started, so substitutions, globbing, pipes and redirections inside the
    fragments are passed on literally instead of being interpreted.

    Raises ParseError when the fragments cannot be split (for example an
    unbalanced quote) or when the command fails.
    """
    import shlex

    try:
        argv = shlex.split(" ".join(args))
    except ValueError as e:
        raise ParseError("Convert failed at {}".format(args)) from e

    if not argv:
        # An empty command line was a successful no-op under the shell.
        return

    _execute(argv)
							
								
								
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,23 @@ | |||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | from .parsers import TextDocumentParser | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class ConsumerDeclaration:
    """
    Tells the document consumer which files TextDocumentParser takes.

    ``handle`` is the signal receiver; it answers with ``test``, which
    judges a single file name.
    """

    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Signal receiver: return the callable that judges a file."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return the parser declaration (parser class and weight) when the
        lower-cased ``doc`` matches MATCHING_FILES, otherwise None.
        """
        if cls.MATCHING_FILES.match(doc.lower()) is None:
            return None

        declaration = {
            "parser": TextDocumentParser,
            "weight": 10
        }
        return declaration
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user