mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	work in progress Mail parsing
This commit is contained in:
		
							parent
							
								
									cca576f518
								
							
						
					
					
						commit
						027897ff03
					
				
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							@ -53,6 +53,7 @@ concurrent-log-handler = "*"
 | 
				
			|||||||
zipp = {version = "*", markers = "python_version < '3.9'"}
 | 
					zipp = {version = "*", markers = "python_version < '3.9'"}
 | 
				
			||||||
pyzbar = "*"
 | 
					pyzbar = "*"
 | 
				
			||||||
pdf2image = "*"
 | 
					pdf2image = "*"
 | 
				
			||||||
 | 
					click = "==8.0.4"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[dev-packages]
 | 
					[dev-packages]
 | 
				
			||||||
coveralls = "*"
 | 
					coveralls = "*"
 | 
				
			||||||
 | 
				
			|||||||
@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        return total_processed_files
 | 
					        return total_processed_files
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_mail_rule(self, M, rule):
 | 
					    def handle_mail_rule(self, M, rule: MailRule):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")
 | 
					        self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -1,6 +1,7 @@
 | 
				
			|||||||
from django.apps import AppConfig
 | 
					from django.apps import AppConfig
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
from paperless_tika.signals import tika_consumer_declaration
 | 
					from paperless_tika.signals import tika_consumer_declaration
 | 
				
			||||||
 | 
					from paperless_tika.signals import tika_consumer_declaration_eml
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class PaperlessTikaConfig(AppConfig):
 | 
					class PaperlessTikaConfig(AppConfig):
 | 
				
			||||||
@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        if settings.PAPERLESS_TIKA_ENABLED:
 | 
					        if settings.PAPERLESS_TIKA_ENABLED:
 | 
				
			||||||
            document_consumer_declaration.connect(tika_consumer_declaration)
 | 
					            document_consumer_declaration.connect(tika_consumer_declaration)
 | 
				
			||||||
 | 
					            document_consumer_declaration.connect(tika_consumer_declaration_eml)
 | 
				
			||||||
        AppConfig.ready(self)
 | 
					        AppConfig.ready(self)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,4 +1,6 @@
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					from io import StringIO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import dateutil.parser
 | 
					import dateutil.parser
 | 
				
			||||||
import requests
 | 
					import requests
 | 
				
			||||||
@ -6,6 +8,9 @@ from django.conf import settings
 | 
				
			|||||||
from documents.parsers import DocumentParser
 | 
					from documents.parsers import DocumentParser
 | 
				
			||||||
from documents.parsers import make_thumbnail_from_pdf
 | 
					from documents.parsers import make_thumbnail_from_pdf
 | 
				
			||||||
from documents.parsers import ParseError
 | 
					from documents.parsers import ParseError
 | 
				
			||||||
 | 
					from PIL import Image
 | 
				
			||||||
 | 
					from PIL import ImageDraw
 | 
				
			||||||
 | 
					from PIL import ImageFont
 | 
				
			||||||
from tika import parser
 | 
					from tika import parser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser):
 | 
				
			|||||||
            file.close()
 | 
					            file.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return pdf_path
 | 
					        return pdf_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class TikaDocumentParserEml(DocumentParser):
					    """
					    Parser for e-mail (``message/rfc822``) documents.

					    The document is sent to the Tika server configured in
					    ``settings.PAPERLESS_TIKA_ENDPOINT`` to obtain text and metadata. The
					    archive PDF is produced by posting an intermediate markdown file to
					    the Gotenberg server configured in
					    ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT``.
					    """

					    logging_name = "paperless.parsing.tikaeml"

					    def get_thumbnail(self, document_path, mime_type, file_name=None):
					        """
					        Render the parsed text onto a white 500x700 image.

					        Uses ``self.text`` as set by :meth:`parse`; ``document_path``,
					        ``mime_type`` and ``file_name`` are not read here.

					        Returns the path of the PNG written into ``self.tempdir``.
					        """
					        img = Image.new("RGB", (500, 700), color="white")
					        draw = ImageDraw.Draw(img)

					        # Newer Pillow releases expose layout engines through the
					        # ImageFont.Layout enum and no longer provide the LAYOUT_BASIC
					        # constant; fall back to the constant on older releases.
					        layout = getattr(ImageFont, "Layout", None)
					        if layout is not None:
					            layout_engine = layout.BASIC
					        else:
					            layout_engine = ImageFont.LAYOUT_BASIC

					        font = ImageFont.truetype(
					            font=settings.THUMBNAIL_FONT_NAME,
					            size=20,
					            layout_engine=layout_engine,
					        )
					        # Draw an empty string instead of failing when no text is set.
					        draw.text((5, 5), self.text or "", font=font, fill="black")

					        out_path = os.path.join(self.tempdir, "thumb.png")
					        img.save(out_path)

					        return out_path

					    def extract_metadata(self, document_path, mime_type):
					        """
					        Return the Tika metadata of ``document_path`` as a list of dicts.

					        Each entry has the keys ``namespace``, ``prefix``, ``key`` and
					        ``value``; ``namespace`` and ``prefix`` are always empty strings.
					        If the Tika request fails, a warning is logged and an empty list
					        is returned.
					        """
					        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
					        try:
					            parsed = parser.from_file(document_path, tika_server)
					        except Exception as e:
					            self.log(
					                "warning",
					                f"Error while fetching document metadata for {document_path}: {e}",
					            )
					            return []

					        # Guard against a result that carries no "metadata" entry.
					        metadata = parsed.get("metadata") or {}
					        return [
					            {
					                "namespace": "",
					                "prefix": "",
					                "key": key,
					                "value": value,
					            }
					            for key, value in metadata.items()
					        ]

					    def parse(self, document_path, mime_type, file_name=None):
					        """
					        Parse ``document_path`` via Tika and build the archive PDF.

					        Sets ``self.text``, ``self.date`` (when a creation date can be
					        read) and ``self.archive_path``.

					        Raises:
					            ParseError: if the Tika request or the PDF conversion fails.
					        """
					        self.log("info", f"Sending {document_path} to Tika server")
					        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

					        try:
					            parsed = parser.from_file(document_path, tika_server)
					        except Exception as err:
					            raise ParseError(
					                f"Could not parse {document_path} with tika server at "
					                f"{tika_server}: {err}",
					            ) from err

					        # NOTE(review): the text is the string form of the whole Tika
					        # result, not only its content - confirm this is intended.
					        text = re.sub(" +", " ", str(parsed))
					        text = re.sub("\n+", "\n", text)
					        self.text = text

					        try:
					            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
					        except Exception as e:
					            # A missing or malformed date is not fatal for the document.
					            self.log(
					                "warning",
					                f"Unable to extract date for document {document_path}: {e}",
					            )

					        md_path = self.convert_to_md(document_path, file_name)
					        self.archive_path = self.convert_md_to_pdf(md_path)

					    def convert_md_to_pdf(self, md_path):
					        """
					        Convert the markdown file at ``md_path`` to PDF via Gotenberg.

					        The markdown file is uploaded together with an ``index.html``
					        wrapper that references it by file name.

					        Returns the path of the PDF written into ``self.tempdir``.

					        Raises:
					            ParseError: if the markdown file cannot be read or the
					                Gotenberg request fails.
					        """
					        pdf_path = os.path.join(self.tempdir, "convert.pdf")
					        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
					        url = gotenberg_server + "/forms/chromium/convert/markdown"

					        self.log("info", f"Converting {md_path} to PDF as {pdf_path}")

					        md_name = os.path.basename(md_path)
					        # The wrapper must name the uploaded markdown file, so the
					        # template is built from the actual file name.
					        html = StringIO(
					            "<!doctype html>\n"
					            '<html lang="en">\n'
					            "  <head>\n"
					            '    <meta charset="utf-8">\n'
					            "    <title>My PDF</title>\n"
					            "  </head>\n"
					            "  <body>\n"
					            f'    {{{{ toHTML "{md_name}" }}}}\n'
					            "  </body>\n"
					            "</html>\n",
					        )

					        try:
					            # Upload the generated markdown file itself rather than
					            # placeholder content.
					            with open(md_path, "rb") as md_file:
					                files = {
					                    "md": (md_name, md_file),
					                    "html": ("index.html", html),
					                }
					                response = requests.post(url, files=files)
					            response.raise_for_status()  # ensure we notice bad responses
					        except Exception as err:
					            raise ParseError(
					                f"Error while converting document to PDF: {err}",
					            ) from err

					        with open(pdf_path, "wb") as file:
					            file.write(response.content)

					        return pdf_path

					    def convert_to_md(self, document_path, file_name):
					        """
					        Write the markdown representation of the mail into ``self.tempdir``.

					        TODO: the content written is still a fixed placeholder;
					        ``document_path`` is only used for logging and ``file_name`` is
					        unused.

					        Returns the path of the markdown file.
					        """
					        md_path = os.path.join(self.tempdir, "convert.md")

					        self.log("info", f"Converting {document_path} to markdown as {md_path}")

					        md = [
					            "# Subject",
					            "\n\n",
					            "blah",
					        ]
					        with open(md_path, "w", encoding="utf-8") as file:
					            file.writelines(md)

					        return md_path
 | 
				
			||||||
 | 
				
			|||||||
@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs):
 | 
				
			|||||||
            "text/rtf": ".rtf",
 | 
					            "text/rtf": ".rtf",
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_parser_eml(*args, **kwargs):
					    """Create a TikaDocumentParserEml, forwarding all arguments unchanged."""
					    # The parser class is imported at call time, inside the function.
					    from .parsers import TikaDocumentParserEml as eml_parser_cls

					    eml_parser = eml_parser_cls(*args, **kwargs)
					    return eml_parser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def tika_consumer_declaration_eml(sender, **kwargs):
					    """
					    Describe the e-mail parser for a consumer declaration.

					    Returns a dict with the parser factory, its weight and the mapping of
					    handled MIME types to default file extensions. ``sender`` and
					    ``kwargs`` are accepted but not used.
					    """
					    handled_types = {"message/rfc822": ".eml"}
					    declaration = dict(
					        parser=get_parser_eml,
					        weight=10,
					        mime_types=handled_types,
					    )
					    return declaration
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user