mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	Merge branch 'ENH_text_consumer' of git://github.com/jat255/paperless into jat255-ENH_text_consumer
This commit is contained in:
		
						commit
						2dc35cc856
					
				@ -192,7 +192,11 @@ class Document(models.Model):
 | 
				
			|||||||
    TYPE_JPG = "jpg"
 | 
					    TYPE_JPG = "jpg"
 | 
				
			||||||
    TYPE_GIF = "gif"
 | 
					    TYPE_GIF = "gif"
 | 
				
			||||||
    TYPE_TIF = "tiff"
 | 
					    TYPE_TIF = "tiff"
 | 
				
			||||||
    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
 | 
					    TYPE_TXT = "txt"
 | 
				
			||||||
 | 
					    TYPE_CSV = "csv"
 | 
				
			||||||
 | 
					    TYPE_MD  = "md"
 | 
				
			||||||
 | 
					    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
 | 
				
			||||||
 | 
					             TYPE_TXT, TYPE_CSV, TYPE_MD)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
 | 
					    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
 | 
				
			||||||
    STORAGE_TYPE_GPG = "gpg"
 | 
					    STORAGE_TYPE_GPG = "gpg"
 | 
				
			||||||
@ -365,51 +369,52 @@ class FileInfo:
 | 
				
			|||||||
        )
 | 
					        )
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
 | 
				
			||||||
    REGEXES = OrderedDict([
 | 
					    REGEXES = OrderedDict([
 | 
				
			||||||
        ("created-correspondent-title-tags", re.compile(
 | 
					        ("created-correspondent-title-tags", re.compile(
 | 
				
			||||||
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
					            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
				
			||||||
            r"(?P<correspondent>.*) - "
 | 
					            r"(?P<correspondent>.*) - "
 | 
				
			||||||
            r"(?P<title>.*) - "
 | 
					            r"(?P<title>.*) - "
 | 
				
			||||||
            r"(?P<tags>[a-z0-9\-,]*)"
 | 
					            r"(?P<tags>[a-z0-9\-,]*)"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        )),
 | 
					        )),
 | 
				
			||||||
        ("created-title-tags", re.compile(
 | 
					        ("created-title-tags", re.compile(
 | 
				
			||||||
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
					            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
				
			||||||
            r"(?P<title>.*) - "
 | 
					            r"(?P<title>.*) - "
 | 
				
			||||||
            r"(?P<tags>[a-z0-9\-,]*)"
 | 
					            r"(?P<tags>[a-z0-9\-,]*)"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        )),
 | 
					        )),
 | 
				
			||||||
        ("created-correspondent-title", re.compile(
 | 
					        ("created-correspondent-title", re.compile(
 | 
				
			||||||
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
					            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
				
			||||||
            r"(?P<correspondent>.*) - "
 | 
					            r"(?P<correspondent>.*) - "
 | 
				
			||||||
            r"(?P<title>.*)"
 | 
					            r"(?P<title>.*)"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        )),
 | 
					        )),
 | 
				
			||||||
        ("created-title", re.compile(
 | 
					        ("created-title", re.compile(
 | 
				
			||||||
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
					            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
 | 
				
			||||||
            r"(?P<title>.*)"
 | 
					            r"(?P<title>.*)"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        )),
 | 
					        )),
 | 
				
			||||||
        ("correspondent-title-tags", re.compile(
 | 
					        ("correspondent-title-tags", re.compile(
 | 
				
			||||||
            r"(?P<correspondent>.*) - "
 | 
					            r"(?P<correspondent>.*) - "
 | 
				
			||||||
            r"(?P<title>.*) - "
 | 
					            r"(?P<title>.*) - "
 | 
				
			||||||
            r"(?P<tags>[a-z0-9\-,]*)"
 | 
					            r"(?P<tags>[a-z0-9\-,]*)"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        )),
 | 
					        )),
 | 
				
			||||||
        ("correspondent-title", re.compile(
 | 
					        ("correspondent-title", re.compile(
 | 
				
			||||||
            r"(?P<correspondent>.*) - "
 | 
					            r"(?P<correspondent>.*) - "
 | 
				
			||||||
            r"(?P<title>.*)?"
 | 
					            r"(?P<title>.*)?"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        )),
 | 
					        )),
 | 
				
			||||||
        ("title", re.compile(
 | 
					        ("title", re.compile(
 | 
				
			||||||
            r"(?P<title>.*)"
 | 
					            r"(?P<title>.*)"
 | 
				
			||||||
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
 | 
					            r"\.(?P<extension>{})$".format(formats),
 | 
				
			||||||
            flags=re.IGNORECASE
 | 
					            flags=re.IGNORECASE
 | 
				
			||||||
        ))
 | 
					        ))
 | 
				
			||||||
    ])
 | 
					    ])
 | 
				
			||||||
 | 
				
			|||||||
@ -1,9 +1,24 @@
 | 
				
			|||||||
import logging
 | 
					import logging
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# This regular expression will try to find dates in the document at
 | 
				
			||||||
 | 
					# hand and will match the following formats:
 | 
				
			||||||
 | 
					# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					# - MONTH ZZZZ, with ZZZZ being 4 digits
 | 
				
			||||||
 | 
					# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
 | 
				
			||||||
 | 
					pattern = re.compile(
 | 
				
			||||||
 | 
					    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
 | 
				
			||||||
 | 
					    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
 | 
				
			||||||
 | 
					    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
 | 
				
			||||||
 | 
					    r'\b([^\W\d_]{3,9} [0-9]{4})\b')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ParseError(Exception):
 | 
					class ParseError(Exception):
 | 
				
			||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
				
			|||||||
@ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
 | 
				
			|||||||
            Document.TYPE_JPG: "image/jpeg",
 | 
					            Document.TYPE_JPG: "image/jpeg",
 | 
				
			||||||
            Document.TYPE_GIF: "image/gif",
 | 
					            Document.TYPE_GIF: "image/gif",
 | 
				
			||||||
            Document.TYPE_TIF: "image/tiff",
 | 
					            Document.TYPE_TIF: "image/tiff",
 | 
				
			||||||
 | 
					            Document.TYPE_CSV: "text/csv",
 | 
				
			||||||
 | 
					            Document.TYPE_MD:  "text/markdown",
 | 
				
			||||||
 | 
					            Document.TYPE_TXT: "text/plain"
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if self.kwargs["kind"] == "thumb":
 | 
					        if self.kwargs["kind"] == "thumb":
 | 
				
			||||||
 | 
				
			|||||||
@ -67,6 +67,7 @@ INSTALLED_APPS = [
 | 
				
			|||||||
    "documents.apps.DocumentsConfig",
 | 
					    "documents.apps.DocumentsConfig",
 | 
				
			||||||
    "reminders.apps.RemindersConfig",
 | 
					    "reminders.apps.RemindersConfig",
 | 
				
			||||||
    "paperless_tesseract.apps.PaperlessTesseractConfig",
 | 
					    "paperless_tesseract.apps.PaperlessTesseractConfig",
 | 
				
			||||||
 | 
					    "paperless_text.apps.PaperlessTextConfig",
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    "django.contrib.admin",
 | 
					    "django.contrib.admin",
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
 | 
				
			|||||||
from pyocr.tesseract import TesseractError
 | 
					from pyocr.tesseract import TesseractError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pdftotext
 | 
					import pdftotext
 | 
				
			||||||
from documents.parsers import DocumentParser, ParseError
 | 
					from documents.parsers import DocumentParser, ParseError, pattern
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .languages import ISO639
 | 
					from .languages import ISO639
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -210,20 +210,6 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        except ParseError as e:
 | 
					        except ParseError as e:
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # This regular expression will try to find dates in the document at
 | 
					 | 
				
			||||||
        # hand and will match the following formats:
 | 
					 | 
				
			||||||
        # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
					 | 
				
			||||||
        # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
					 | 
				
			||||||
        # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
					 | 
				
			||||||
        # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 | 
					 | 
				
			||||||
        # - MONTH ZZZZ, with ZZZZ being 4 digits
 | 
					 | 
				
			||||||
        # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
 | 
					 | 
				
			||||||
        pattern = re.compile(
 | 
					 | 
				
			||||||
            r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
 | 
					 | 
				
			||||||
            r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
 | 
					 | 
				
			||||||
            r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
 | 
					 | 
				
			||||||
            r'\b([^\W\d_]{3,9} [0-9]{4})\b')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Iterate through all regex matches and try to parse the date
 | 
					        # Iterate through all regex matches and try to parse the date
 | 
				
			||||||
        for m in re.finditer(pattern, text):
 | 
					        for m in re.finditer(pattern, text):
 | 
				
			||||||
            datestring = m.group(0)
 | 
					            datestring = m.group(0)
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,16 @@
 | 
				
			|||||||
 | 
					from django.apps import AppConfig
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class PaperlessTextConfig(AppConfig):
					    """Django app configuration for the plain-text document consumer."""

					    name = "paperless_text"

					    def ready(self):
					        """Hook the text consumer declaration into the documents signal.

					        The imports are done here rather than at module level so that
					        they only run when ``ready`` is called.
					        """
					        from documents.signals import (
					            document_consumer_declaration as declaration_signal,
					        )
					        from .signals import ConsumerDeclaration as Declaration

					        declaration_signal.connect(Declaration.handle)

					        # Let the base class finish its own ready() handling.
					        super().ready()
 | 
				
			||||||
							
								
								
									
										131
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,131 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					import subprocess
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import dateparser
 | 
				
			||||||
 | 
					from django.conf import settings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from documents.parsers import DocumentParser, ParseError, pattern
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class TextDocumentParser(DocumentParser):
					    """
					    This parser directly parses a text document (.txt, .md, or .csv)

					    The document is read as-is (no OCR): ``get_text`` returns the file
					    contents, ``get_date`` searches those contents for a date and
					    ``get_thumbnail`` renders the first lines of the file to a PNG.
					    """

					    # Values copied from the Django settings when the class is created.
					    CONVERT = settings.CONVERT_BINARY
					    # NOTE(review): THREADS, UNPAPER, DEFAULT_OCR_LANGUAGE and OCR_ALWAYS
					    # are not read anywhere in this class; they are kept so that any
					    # outside code accessing them keeps working.
					    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
					    UNPAPER = settings.UNPAPER_BINARY
					    DATE_ORDER = settings.DATE_ORDER
					    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
					    OCR_ALWAYS = settings.OCR_ALWAYS

					    def __init__(self, path):
					        super().__init__(path)
					        # Cache for get_text(); None means "not read yet".
					        self._text = None

					    def get_thumbnail(self):
					        """
					        The thumbnail of a txt is just a 500px wide image of the text
					        rendered onto a letter-sized page.

					        Three ``convert`` invocations are made: one renders the first
					        lines of the document as a caption layer, one draws a rounded
					        white background, and the last merges both into the result.

					        :return: path of the generated PNG inside ``self.tempdir``
					        :raises ParseError: propagated from ``run_command`` when a
					            ``convert`` call fails
					        """
					        # The below is heavily cribbed from https://askubuntu.com/a/590951

					        # Local import so this block does not depend on the file header.
					        import shlex

					        bg_color = "white"  # bg color
					        text_color = "black"  # text color
					        psize = [500, 647]  # icon size
					        n_lines = 50  # number of lines to show
					        output_file = os.path.join(self.tempdir, "convert-txt.png")

					        temp_bg = os.path.join(self.tempdir, "bg.png")
					        temp_txlayer = os.path.join(self.tempdir, "tx.png")
					        picsize = "x".join(str(n) for n in psize)
					        # The text layer is 8px smaller to leave room for its border.
					        txsize = "x".join(str(n - 8) for n in psize)

					        def create_bg():
					            # Rounded rectangle covering the whole canvas; the corner
					            # radius is a tenth of the thumbnail width.
					            work_size = ",".join(str(n - 1) for n in psize)
					            r = str(round(psize[0] / 10))
					            rounded = ",".join([r, r])
					            run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
					                        '"fill ', bg_color, ' roundrectangle 0,0,',
					                        work_size, ",", rounded, '" ', temp_bg)

					        def read_text():
					            # Only the first n_lines lines are needed, so stop reading
					            # there instead of loading the whole document.
					            with open(self.document_path, 'r') as src:
					                head = [line.strip()
					                        for _, line in zip(range(n_lines), src)]
					            return "\n".join(head)

					        def create_txlayer():
					            # The caption comes from the consumed document and is
					            # therefore untrusted. run_command() builds a command line
					            # out of its arguments, so the caption is quoted as one
					            # single literal word; nothing in it ($(...), backticks,
					            # quotes) can be interpreted as command syntax.
					            caption = shlex.quote("caption: " + read_text())
					            run_command(self.CONVERT,
					                        "-background none",
					                        "-fill",
					                        text_color,
					                        "-pointsize", "12",
					                        "-border 4 -bordercolor none",
					                        "-size ", txsize,
					                        caption,
					                        temp_txlayer)

					        create_txlayer()
					        create_bg()
					        run_command(self.CONVERT, temp_bg, temp_txlayer,
					                    "-background None -layers merge ", output_file)

					        return output_file

					    def get_text(self):
					        """
					        Return the full contents of the document.

					        The file is read on the first call only; later calls return
					        the cached string.
					        """

					        if self._text is not None:
					            return self._text

					        with open(self.document_path, 'r') as f:
					            self._text = f.read()

					        return self._text

					    def get_date(self):
					        """
					        Return the first date found in the document text.

					        Every match of the shared date ``pattern`` is handed to
					        ``dateparser`` until one parses successfully.

					        :return: the parsed date, or ``None`` when the text cannot be
					            obtained or no match parses to a date
					        """
					        date = None
					        datestring = None

					        try:
					            text = self.get_text()
					        except ParseError:
					            return None

					        # Iterate through all regex matches and try to parse the date
					        for m in pattern.finditer(text):
					            datestring = m.group(0)

					            try:
					                date = dateparser.parse(
					                    datestring,
					                    settings={'DATE_ORDER': self.DATE_ORDER,
					                              'PREFER_DAY_OF_MONTH': 'first',
					                              'RETURN_AS_TIMEZONE_AWARE': True})
					            except TypeError:
					                # Skip all matches that do not parse to a proper date
					                continue

					            if date is not None:
					                break

					        if date is not None:
					            self.log("info", "Detected document date " + date.isoformat() +
					                             " based on string " + datestring)
					        else:
					            self.log("info", "Unable to detect date for document")

					        return date
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def run_command(*args):
					    """
					    Run an external command built from ``args`` and fail loudly.

					    The arguments are joined with single spaces and split into words
					    using shell quoting rules, so callers may keep passing fragments
					    such as ``"-background none"`` or a double-quoted string spread
					    over several arguments. The resulting words are executed directly,
					    without a shell: command substitution, variable expansion, pipes
					    and redirections inside the arguments are NOT interpreted, which
					    matters because some arguments contain text taken from consumed
					    documents.

					    ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` are added to the
					    child's environment when the corresponding settings are set.

					    :param args: string fragments of the command line
					    :raises ParseError: if the command line cannot be split, the
					        program cannot be started, or it exits with a non-zero status
					    """
					    # Local import so this block does not depend on the file header.
					    import shlex

					    environment = os.environ.copy()
					    if settings.CONVERT_MEMORY_LIMIT:
					        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
					    if settings.CONVERT_TMPDIR:
					        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

					    try:
					        # ValueError: unbalanced quotes in the joined command line.
					        argv = shlex.split(' '.join(args))
					        if not argv:
					            # An empty command line used to be a successful no-op.
					            return
					        # OSError: binary missing or not executable.
					        exit_code = subprocess.call(argv, env=environment)
					    except (ValueError, OSError) as error:
					        raise ParseError("Convert failed at {}".format(args)) from error

					    if exit_code != 0:
					        raise ParseError("Convert failed at {}".format(args))
 | 
				
			||||||
							
								
								
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,23 @@
 | 
				
			|||||||
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .parsers import TextDocumentParser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ConsumerDeclaration:
					    """Offers ``TextDocumentParser`` for .txt/.text/.md/.csv files."""

					    # Raw string: "\." in a normal string literal is an invalid escape
					    # sequence that newer Python versions warn about.
					    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

					    @classmethod
					    def handle(cls, sender, **kwargs):
					        """Signal receiver: return the callable that tests a document.

					        ``sender`` and ``kwargs`` are accepted but not used.
					        """
					        return cls.test

					    @classmethod
					    def test(cls, doc):
					        """Decide whether this consumer can handle ``doc``.

					        :param doc: file name or path; compared case-insensitively
					        :return: a dict with the parser class and its weight when the
					            name ends in a supported extension, otherwise ``None``
					        """

					        if cls.MATCHING_FILES.match(doc.lower()):
					            return {
					                "parser": TextDocumentParser,
					                "weight": 10
					            }

					        return None
 | 
				
			||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user