mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	feat(parsers): add generator for date parsing
This commit is contained in:
		
							parent
							
								
									ca75fb5664
								
							
						
					
					
						commit
						a5d2ae2588
					
				@ -6,6 +6,7 @@ import re
 | 
			
		||||
import shutil
 | 
			
		||||
import subprocess
 | 
			
		||||
import tempfile
 | 
			
		||||
from typing import Iterator
 | 
			
		||||
from typing import Optional
 | 
			
		||||
from typing import Set
 | 
			
		||||
 | 
			
		||||
@ -216,6 +217,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_date(filename, text) -> Optional[datetime.datetime]:
    """
    Return the first date produced by ``parse_date_generator`` for the given
    filename and text, or ``None`` when the generator yields nothing.
    """
    candidates = parse_date_generator(filename, text)
    return next(candidates, None)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
 | 
			
		||||
    """
 | 
			
		||||
    Returns the date of the document.
 | 
			
		||||
    """
 | 
			
		||||
@ -246,38 +251,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
 | 
			
		||||
            return date
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
    def __process_match(
        match: "re.Match[str]",
        date_order: str,
    ) -> Optional[datetime.datetime]:
        """
        Turn a single regex match into a date, if possible.

        The full matched text is handed to ``__parser`` together with
        ``date_order``. A ``TypeError`` or ``ValueError`` raised by the parser
        is treated as "no date". The outcome (a parsed value or ``None``) is
        then passed through ``__filter`` and that result is returned.
        """
        # NOTE: the ``match`` annotation is quoted on purpose so that it is
        # never evaluated; subscripting ``re.Match`` raises TypeError on
        # Python versions older than 3.9.
        date_string = match.group(0)

        try:
            date = __parser(date_string, date_order)
        except (TypeError, ValueError):
            # Skip all matches that do not parse to a proper date
            date = None

        return __filter(date)
 | 
			
		||||
 | 
			
		||||
    def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
        """
        Lazily yield every date found in ``content``.

        Each ``DATE_REGEX`` match is run through ``__process_match`` using
        ``date_order``; matches for which that returns ``None`` are skipped.
        """
        for found in re.finditer(DATE_REGEX, content):
            parsed = __process_match(found, date_order)
            if parsed is None:
                continue
            yield parsed
 | 
			
		||||
 | 
			
		||||
    # if filename date parsing is enabled, search there first:
 | 
			
		||||
    if settings.FILENAME_DATE_ORDER:
 | 
			
		||||
        for m in re.finditer(DATE_REGEX, filename):
 | 
			
		||||
            date_string = m.group(0)
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
 | 
			
		||||
            except (TypeError, ValueError):
 | 
			
		||||
                # Skip all matches that do not parse to a proper date
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            date = __filter(date)
 | 
			
		||||
            if date is not None:
 | 
			
		||||
                return date
 | 
			
		||||
        yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
 | 
			
		||||
 | 
			
		||||
    # Iterate through all regex matches in text and try to parse the date
 | 
			
		||||
    for m in re.finditer(DATE_REGEX, text):
 | 
			
		||||
        date_string = m.group(0)
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            date = __parser(date_string, settings.DATE_ORDER)
 | 
			
		||||
        except (TypeError, ValueError):
 | 
			
		||||
            # Skip all matches that do not parse to a proper date
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        date = __filter(date)
 | 
			
		||||
        if date is not None:
 | 
			
		||||
            return date
 | 
			
		||||
 | 
			
		||||
    return date
 | 
			
		||||
    yield from __process_content(text, settings.DATE_ORDER)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ParseError(Exception):
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,4 @@
 | 
			
		||||
import itertools
 | 
			
		||||
import json
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
@ -21,6 +22,7 @@ from django.db.models.functions import Lower
 | 
			
		||||
from django.http import Http404
 | 
			
		||||
from django.http import HttpResponse
 | 
			
		||||
from django.http import HttpResponseBadRequest
 | 
			
		||||
from django.shortcuts import get_object_or_404
 | 
			
		||||
from django.utils.decorators import method_decorator
 | 
			
		||||
from django.utils.translation import get_language
 | 
			
		||||
from django.views.decorators.cache import cache_control
 | 
			
		||||
@ -70,6 +72,7 @@ from .models import SavedView
 | 
			
		||||
from .models import StoragePath
 | 
			
		||||
from .models import Tag
 | 
			
		||||
from .parsers import get_parser_class_for_mime_type
 | 
			
		||||
from .parsers import parse_date_generator
 | 
			
		||||
from .serialisers import AcknowledgeTasksViewSerializer
 | 
			
		||||
from .serialisers import BulkDownloadSerializer
 | 
			
		||||
from .serialisers import BulkEditSerializer
 | 
			
		||||
@ -329,13 +332,13 @@ class DocumentViewSet(
 | 
			
		||||
 | 
			
		||||
    @action(methods=["get"], detail=True)
 | 
			
		||||
    def suggestions(self, request, pk=None):
 | 
			
		||||
        try:
 | 
			
		||||
            doc = Document.objects.get(pk=pk)
 | 
			
		||||
        except Document.DoesNotExist:
 | 
			
		||||
            raise Http404()
 | 
			
		||||
        doc = get_object_or_404(Document, pk=pk)
 | 
			
		||||
 | 
			
		||||
        classifier = load_classifier()
 | 
			
		||||
 | 
			
		||||
        gen = parse_date_generator(doc.filename, doc.content)
 | 
			
		||||
        dates = {i for i in itertools.islice(gen, 5)}
 | 
			
		||||
 | 
			
		||||
        return Response(
 | 
			
		||||
            {
 | 
			
		||||
                "correspondents": [c.id for c in match_correspondents(doc, classifier)],
 | 
			
		||||
@ -344,6 +347,9 @@ class DocumentViewSet(
 | 
			
		||||
                    dt.id for dt in match_document_types(doc, classifier)
 | 
			
		||||
                ],
 | 
			
		||||
                "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
 | 
			
		||||
                "dates": [
 | 
			
		||||
                    date.strftime("%Y-%m-%d") for date in dates if date is not None
 | 
			
		||||
                ],
 | 
			
		||||
            },
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user