mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Add support for a heuristic that extracts the document date from its text
This commit is contained in:
		
							parent
							
								
									9faf0a102e
								
							
						
					
					
						commit
						b140935843
					
				@ -13,6 +13,7 @@ python-dateutil>=2.6.0
 | 
				
			|||||||
python-dotenv>=0.6.2
 | 
					python-dotenv>=0.6.2
 | 
				
			||||||
python-gnupg>=0.3.9
 | 
					python-gnupg>=0.3.9
 | 
				
			||||||
pytz>=2016.10
 | 
					pytz>=2016.10
 | 
				
			||||||
 | 
					dateparser>=0.6.0
 | 
				
			||||||
gunicorn==19.7.1
 | 
					gunicorn==19.7.1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# For the tests
 | 
					# For the tests
 | 
				
			||||||
 | 
				
			|||||||
@ -118,12 +118,14 @@ class Consumer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            parsed_document = parser_class(doc)
 | 
					            parsed_document = parser_class(doc)
 | 
				
			||||||
            thumbnail = parsed_document.get_thumbnail()
 | 
					            thumbnail = parsed_document.get_thumbnail()
 | 
				
			||||||
 | 
					            date = parsed_document.get_date()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                document = self._store(
 | 
					                document = self._store(
 | 
				
			||||||
                    parsed_document.get_text(),
 | 
					                    parsed_document.get_text(),
 | 
				
			||||||
                    doc,
 | 
					                    doc,
 | 
				
			||||||
                    thumbnail
 | 
					                    thumbnail,
 | 
				
			||||||
 | 
					                    date
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            except ParseError as e:
 | 
					            except ParseError as e:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -174,7 +176,7 @@ class Consumer(object):
 | 
				
			|||||||
        return sorted(
 | 
					        return sorted(
 | 
				
			||||||
            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 | 
					            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _store(self, text, doc, thumbnail):
 | 
					    def _store(self, text, doc, thumbnail, date):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        file_info = FileInfo.from_path(doc)
 | 
					        file_info = FileInfo.from_path(doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -182,7 +184,7 @@ class Consumer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self.log("debug", "Saving record to database")
 | 
					        self.log("debug", "Saving record to database")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        created = file_info.created or timezone.make_aware(
 | 
					        created = file_info.created or date or timezone.make_aware(
 | 
				
			||||||
                    datetime.datetime.fromtimestamp(stats.st_mtime))
 | 
					                    datetime.datetime.fromtimestamp(stats.st_mtime))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with open(doc, "rb") as f:
 | 
					        with open(doc, "rb") as f:
 | 
				
			||||||
 | 
				
			|||||||
@ -35,6 +35,12 @@ class DocumentParser(object):
 | 
				
			|||||||
        """
 | 
					        """
 | 
				
			||||||
        raise NotImplementedError()
 | 
					        raise NotImplementedError()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_date(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Returns the date of the document.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        raise NotImplementedError()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def log(self, level, message):
 | 
					    def log(self, level, message):
 | 
				
			||||||
        getattr(self.logger, level)(message, extra={
 | 
					        getattr(self.logger, level)(message, extra={
 | 
				
			||||||
            "group": self.logging_group
 | 
					            "group": self.logging_group
 | 
				
			||||||
 | 
				
			|||||||
@ -258,3 +258,6 @@ PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
 | 
					FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
 | 
				
			||||||
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
 | 
					FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Specify the default date order (for autodetected dates)
 | 
				
			||||||
 | 
					DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 | 
				
			||||||
 | 
				
			|||||||
@ -3,6 +3,7 @@ import os
 | 
				
			|||||||
import re
 | 
					import re
 | 
				
			||||||
import subprocess
 | 
					import subprocess
 | 
				
			||||||
from multiprocessing.pool import Pool
 | 
					from multiprocessing.pool import Pool
 | 
				
			||||||
 | 
					import dateparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import langdetect
 | 
					import langdetect
 | 
				
			||||||
import pyocr
 | 
					import pyocr
 | 
				
			||||||
@ -30,6 +31,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
 | 
					    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
 | 
				
			||||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
					    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
				
			||||||
    UNPAPER = settings.UNPAPER_BINARY
 | 
					    UNPAPER = settings.UNPAPER_BINARY
 | 
				
			||||||
 | 
					    DATE_ORDER = settings.DATE_ORDER
 | 
				
			||||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
					    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_thumbnail(self):
 | 
					    def get_thumbnail(self):
 | 
				
			||||||
@ -175,6 +177,29 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
 | 
					        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
 | 
				
			||||||
        return text
 | 
					        return text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_date(self):
 | 
				
			||||||
 | 
					        text = self.get_text()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # This regular expression will try to find dates in the document at
 | 
				
			||||||
 | 
					        # hand and will match the following formats:
 | 
				
			||||||
 | 
					        # - XX.YY.ZZZZ with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					        # - XX/YY/ZZZZ with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					        # - XX-YY-ZZZZ with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					        # - XX. MONTH ZZZZ with XX being 1 or 2 digits and ZZZZ being 2 or 4 digits
 | 
				
			||||||
 | 
					        # - MONTH ZZZZ with ZZZZ being 4 digits
 | 
				
			||||||
 | 
					        m = re.search(
 | 
				
			||||||
 | 
					            r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
 | 
				
			||||||
 | 
					            r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
 | 
				
			||||||
 | 
					            r'\b([^ ]{3,9} [0-9]{4})\b', text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if m is None:
 | 
				
			||||||
 | 
					            return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return dateparser.parse(m.group(0),
 | 
				
			||||||
 | 
					                                settings={'DATE_ORDER': self.DATE_ORDER,
 | 
				
			||||||
 | 
					                                          'PREFER_DAY_OF_MONTH': 'first',
 | 
				
			||||||
 | 
					                                          'RETURN_AS_TIMEZONE_AWARE': True})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def run_convert(*args):
 | 
					def run_convert(*args):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user