mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 02:27:10 -04:00 
			
		
		
		
	Merge branch 'jat255-ENH_text_consumer'
This commit is contained in:
		
						commit
						75648cc74b
					
				| @ -1,24 +1,24 @@ | ||||
| # coding=utf-8 | ||||
| 
 | ||||
| import dateutil.parser | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import uuid | ||||
| 
 | ||||
| from collections import OrderedDict | ||||
| 
 | ||||
| import dateutil.parser | ||||
| from django.conf import settings | ||||
| from django.db import models | ||||
| from django.template.defaultfilters import slugify | ||||
| from django.utils import timezone | ||||
| from fuzzywuzzy import fuzz | ||||
| 
 | ||||
| from django.conf import settings | ||||
| from .managers import LogManager | ||||
| 
 | ||||
| try: | ||||
|     from django.core.urlresolvers import reverse | ||||
| except ImportError: | ||||
|     from django.urls import reverse | ||||
| from django.db import models | ||||
| from django.template.defaultfilters import slugify | ||||
| from django.utils import timezone | ||||
| 
 | ||||
| from .managers import LogManager | ||||
| 
 | ||||
| 
 | ||||
| class MatchingModel(models.Model): | ||||
| @ -192,7 +192,11 @@ class Document(models.Model): | ||||
|     TYPE_JPG = "jpg" | ||||
|     TYPE_GIF = "gif" | ||||
|     TYPE_TIF = "tiff" | ||||
|     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) | ||||
|     TYPE_TXT = "txt" | ||||
|     TYPE_CSV = "csv" | ||||
|     TYPE_MD = "md" | ||||
|     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, | ||||
|              TYPE_TXT, TYPE_CSV, TYPE_MD) | ||||
| 
 | ||||
|     STORAGE_TYPE_UNENCRYPTED = "unencrypted" | ||||
|     STORAGE_TYPE_GPG = "gpg" | ||||
| @ -365,51 +369,52 @@ class FileInfo: | ||||
|         ) | ||||
|     ) | ||||
| 
 | ||||
|     formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" | ||||
|     REGEXES = OrderedDict([ | ||||
|         ("created-correspondent-title-tags", re.compile( | ||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||
|             r"(?P<correspondent>.*) - " | ||||
|             r"(?P<title>.*) - " | ||||
|             r"(?P<tags>[a-z0-9\-,]*)" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )), | ||||
|         ("created-title-tags", re.compile( | ||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||
|             r"(?P<title>.*) - " | ||||
|             r"(?P<tags>[a-z0-9\-,]*)" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )), | ||||
|         ("created-correspondent-title", re.compile( | ||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||
|             r"(?P<correspondent>.*) - " | ||||
|             r"(?P<title>.*)" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )), | ||||
|         ("created-title", re.compile( | ||||
|             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||
|             r"(?P<title>.*)" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )), | ||||
|         ("correspondent-title-tags", re.compile( | ||||
|             r"(?P<correspondent>.*) - " | ||||
|             r"(?P<title>.*) - " | ||||
|             r"(?P<tags>[a-z0-9\-,]*)" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )), | ||||
|         ("correspondent-title", re.compile( | ||||
|             r"(?P<correspondent>.*) - " | ||||
|             r"(?P<title>.*)?" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )), | ||||
|         ("title", re.compile( | ||||
|             r"(?P<title>.*)" | ||||
|             r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", | ||||
|             r"\.(?P<extension>{})$".format(formats), | ||||
|             flags=re.IGNORECASE | ||||
|         )) | ||||
|     ]) | ||||
|  | ||||
| @ -1,9 +1,25 @@ | ||||
| import logging | ||||
| import shutil | ||||
| import tempfile | ||||
| import re | ||||
| 
 | ||||
| from django.conf import settings | ||||
| 
 | ||||
# Pattern used to locate candidate date strings inside a document's text.
# It is an alternation of four shapes, tried in this order at each position:
#   1. XX.YY.ZZZZ, XX/YY/ZZZZ or XX-YY-ZZZZ, where XX and YY are 1 or 2
#      digits and ZZZZ is 2 or 4 digits
#   2. XX. MONTH ZZZZ, where XX is 1 or 2 digits and ZZZZ is 2 or 4 digits
#   3. MONTH XX, ZZZZ, where XX is 1 or 2 digits and ZZZZ is 4 digits
#   4. MONTH ZZZZ, where ZZZZ is 4 digits
_NUMERIC_DATE = (
    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b'
)
_DAY_MONTH_YEAR = r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b'
_MONTH_DAY_YEAR = r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b'
_MONTH_YEAR = r'\b([^\W\d_]{3,9} [0-9]{4})\b'

DATE_REGEX = re.compile("|".join((
    _NUMERIC_DATE,
    _DAY_MONTH_YEAR,
    _MONTH_DAY_YEAR,
    _MONTH_YEAR,
)))
| 
 | ||||
| 
 | ||||
class ParseError(Exception):
    """Error type document parsers raise when processing a file fails."""
|  | ||||
| @ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): | ||||
|             Document.TYPE_JPG: "image/jpeg", | ||||
|             Document.TYPE_GIF: "image/gif", | ||||
|             Document.TYPE_TIF: "image/tiff", | ||||
|             Document.TYPE_CSV: "text/csv", | ||||
|             Document.TYPE_MD:  "text/markdown", | ||||
|             Document.TYPE_TXT: "text/plain" | ||||
|         } | ||||
| 
 | ||||
|         if self.kwargs["kind"] == "thumb": | ||||
|  | ||||
| @ -67,6 +67,7 @@ INSTALLED_APPS = [ | ||||
|     "documents.apps.DocumentsConfig", | ||||
|     "reminders.apps.RemindersConfig", | ||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||
|     "paperless_text.apps.PaperlessTextConfig", | ||||
| 
 | ||||
|     "django.contrib.admin", | ||||
| 
 | ||||
|  | ||||
| @ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ | ||||
| from pyocr.tesseract import TesseractError | ||||
| 
 | ||||
| import pdftotext | ||||
| from documents.parsers import DocumentParser, ParseError | ||||
| from documents.parsers import DocumentParser, ParseError, DATE_REGEX | ||||
| 
 | ||||
| from .languages import ISO639 | ||||
| 
 | ||||
| @ -210,22 +210,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         except ParseError as e: | ||||
|             return None | ||||
| 
 | ||||
|         # This regular expression will try to find dates in the document at | ||||
|         # hand and will match the following formats: | ||||
|         # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - MONTH ZZZZ, with ZZZZ being 4 digits | ||||
|         # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits | ||||
|         pattern = re.compile( | ||||
|             r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + | ||||
|             r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + | ||||
|             r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + | ||||
|             r'\b([^\W\d_]{3,9} [0-9]{4})\b') | ||||
| 
 | ||||
|         # Iterate through all regex matches and try to parse the date | ||||
|         for m in re.finditer(pattern, text): | ||||
|         for m in re.finditer(DATE_REGEX, text): | ||||
|             datestring = m.group(0) | ||||
| 
 | ||||
|             try: | ||||
|  | ||||
							
								
								
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,16 @@ | ||||
| from django.apps import AppConfig | ||||
| 
 | ||||
| 
 | ||||
class PaperlessTextConfig(AppConfig):
    """App configuration for the ``paperless_text`` application."""

    name = "paperless_text"

    def ready(self):
        """
        Connect this app's consumer declaration handler to the
        ``document_consumer_declaration`` signal, then run the base
        ``AppConfig.ready`` hook.
        """
        # Both imports are kept inside the method, as in the original layout.
        from documents.signals import document_consumer_declaration
        from .signals import ConsumerDeclaration

        handler = ConsumerDeclaration.handle
        document_consumer_declaration.connect(handler)

        super().ready()
							
								
								
									
										131
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,131 @@ | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| 
 | ||||
| import dateparser | ||||
| from django.conf import settings | ||||
| 
 | ||||
| from documents.parsers import DocumentParser, ParseError, DATE_REGEX | ||||
| 
 | ||||
| 
 | ||||
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)

    It relies on ``self.document_path``, ``self.tempdir`` and ``self.log``,
    which are presumably provided by ``DocumentParser`` (not visible here).
    """

    CONVERT = settings.CONVERT_BINARY
    DATE_ORDER = settings.DATE_ORDER

    # The attributes below are not referenced anywhere in this class; they
    # are kept so that any external code reading them keeps working.
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    def __init__(self, path):
        super().__init__(path)
        # Cache for the document's full text, filled lazily by get_text().
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a txt is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Returns the path of the generated PNG inside ``self.tempdir``.
        Raises ``ParseError`` (via ``run_command``) if a convert call fails.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951

        # Local imports keep this method self-contained.
        import shlex
        from itertools import islice

        bg_color = "white"  # bg color
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show

        output_file = os.path.join(self.tempdir, "convert-txt.png")
        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")
        picsize = "x".join(str(n) for n in psize)
        txsize = "x".join(str(n - 8) for n in psize)

        # Only the first n_lines lines are rendered, so only those are read.
        with open(self.document_path, 'r') as src:
            preview = "\n".join(
                line.strip() for line in islice(src, n_lines)
            )

        # run_command() joins its arguments into one string that is handed to
        # a shell.  The document text is untrusted, so it (and every path)
        # must be shell-quoted; otherwise `$(...)`, backticks or quotes in a
        # consumed document would be interpreted by the shell.
        # The surrounding spaces reproduce the padding the previous command
        # line put around the text inside the caption.
        # NOTE(review): ImageMagick itself may still expand '%' escapes in
        # caption text -- confirm whether that matters here.
        caption = "caption: {} ".format(preview)

        # Text layer: the preview rendered on a transparent canvas.
        run_command(
            self.CONVERT,
            "-background", "none",
            "-fill", text_color,
            "-pointsize", "12",
            "-border", "4",
            "-bordercolor", "none",
            "-size", txsize,
            shlex.quote(caption),
            shlex.quote(temp_txlayer),
        )

        # Background layer: a filled rounded rectangle covering the page.
        work_size = ",".join(str(n - 1) for n in psize)
        radius = str(round(psize[0] / 10))
        draw = "fill {} roundrectangle 0,0,{},{},{}".format(
            bg_color, work_size, radius, radius
        )
        run_command(
            self.CONVERT,
            "-size", picsize,
            "xc:none",
            "-draw", shlex.quote(draw),
            shlex.quote(temp_bg),
        )

        # Merge the text layer on top of the background.
        run_command(
            self.CONVERT,
            shlex.quote(temp_bg),
            shlex.quote(temp_txlayer),
            "-background", "None",
            "-layers", "merge",
            shlex.quote(output_file),
        )

        return output_file

    def get_text(self):
        """
        Return the full content of the document, reading the file on the
        first call and serving the cached string afterwards.
        """
        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text

    def get_date(self):
        """
        Return the first date found in the document text, or ``None``.

        Every ``DATE_REGEX`` match is handed to ``dateparser`` in document
        order; the first one that parses wins.
        """
        date = None
        datestring = None

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                             " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date
| 
 | ||||
| 
 | ||||
def run_command(*args):
    """
    Join ``args`` with single spaces and run the result through the shell.

    The child inherits a copy of the current environment, with
    ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` added when the matching
    ``CONVERT_MEMORY_LIMIT`` / ``CONVERT_TMPDIR`` settings are truthy.

    Raises ``ParseError`` if the command exits with a non-zero status.
    """
    child_env = os.environ.copy()

    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            child_env[variable] = value

    # The arguments form a shell command line, so callers are responsible
    # for quoting anything that is not a trusted literal.
    command_line = ' '.join(args)
    exit_status = subprocess.Popen(
        command_line, env=child_env, shell=True
    ).wait()

    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))
							
								
								
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,23 @@ | ||||
| import re | ||||
| 
 | ||||
| from .parsers import TextDocumentParser | ||||
| 
 | ||||
| 
 | ||||
class ConsumerDeclaration:
    """
    Declares which files the text parser is willing to handle.

    ``handle`` is meant to be connected to a signal and answers with the
    ``test`` callable, which is then asked about individual file names.
    """

    # File names ending in .txt, .text, .md or .csv (checked in lower case).
    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Signal receiver: return the callable that tests a document."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a parser declaration for ``doc`` if its lower-cased name
        matches ``MATCHING_FILES``, otherwise ``None``.
        """
        if not cls.MATCHING_FILES.match(doc.lower()):
            return None

        # NOTE(review): the meaning of "weight" is defined by the consumer
        # of this declaration, which is not visible here.
        return {
            "parser": TextDocumentParser,
            "weight": 10
        }
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user