work in progress Mail parsing

2025-11-04 03:27:12 -05:00 · 2022-04-19 00:39:00 +02:00 · 2022-04-19 00:39:00 +02:00 · 027897ff03
commit 027897ff03
parent cca576f518
5 changed files with 168 additions and 1 deletions
--- a/1
+++ b/1
@ -53,6 +53,7 @@ concurrent-log-handler = "*"
 zipp = {version = "*", markers = "python_version < '3.9'"}
 pyzbar = "*"
 pdf2image = "*"
 click = "==8.0.4"
 [dev-packages]
 coveralls = "*"
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin):
        return total_processed_files
-    def handle_mail_rule(self, M, rule):
+    def handle_mail_rule(self, M, rule: MailRule):
        self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")
--- a/src/paperless_tika/apps.py
+++ b/src/paperless_tika/apps.py
@ -1,6 +1,7 @@
 from django.apps import AppConfig
 from django.conf import settings
 from paperless_tika.signals import tika_consumer_declaration
 from paperless_tika.signals import tika_consumer_declaration_eml
 class PaperlessTikaConfig(AppConfig):
@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig):
        if settings.PAPERLESS_TIKA_ENABLED:
            document_consumer_declaration.connect(tika_consumer_declaration)
            document_consumer_declaration.connect(tika_consumer_declaration_eml)
        AppConfig.ready(self)
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@ -1,4 +1,6 @@
 import os
 import re
 from io import StringIO
 import dateutil.parser
 import requests
@ -6,6 +8,9 @@ from django.conf import settings
 from documents.parsers import DocumentParser
 from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont
 from tika import parser
@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser):
            file.close()
        return pdf_path
 class TikaDocumentParserEml(DocumentParser):
    """
    This parser sends documents to a local tika server
    """
    logging_name = "paperless.parsing.tikaeml"
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Render the parsed text onto a white 500x700 image and save it as the
        document thumbnail.

        ``document_path``, ``mime_type`` and ``file_name`` are not used by
        this implementation; the image is drawn from ``self.text``.

        Returns the path of ``thumb.png`` inside ``self.tempdir``.
        """
        # Pillow 9.1 introduced the ImageFont.Layout enum and Pillow 10 removed
        # the module-level LAYOUT_BASIC constant, so accept both spellings.
        layout_enum = getattr(ImageFont, "Layout", None)
        if layout_enum is not None:
            basic_layout = layout_enum.BASIC
        else:
            basic_layout = ImageFont.LAYOUT_BASIC

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=basic_layout,
        )
        # Fall back to an empty string so a document without extracted text
        # yields a blank thumbnail instead of an error from draw.text().
        draw.text((5, 5), self.text or "", font=font, fill="black")

        out_path = os.path.join(self.tempdir, "thumb.png")
        img.save(out_path)

        return out_path
    def extract_metadata(self, document_path, mime_type):
        """
        Ask the Tika server for the metadata of ``document_path``.

        Returns a list with one dict per metadata entry, each having the keys
        ``namespace``, ``prefix``, ``key`` and ``value``. This is best effort:
        if the request fails, the error is logged as a warning and an empty
        list is returned; a response without metadata also yields an empty
        list.
        """
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as e:
            self.log(
                "warning",
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

        # The response may have no "metadata" entry (or carry None there);
        # treat that as "no metadata" rather than raising outside the
        # best-effort handler above.
        metadata = (parsed or {}).get("metadata") or {}

        return [
            {
                "namespace": "",
                "prefix": "",
                "key": key,
                "value": value,
            }
            for key, value in metadata.items()
        ]
    def parse(self, document_path, mime_type, file_name=None):
        """
        Send ``document_path`` to the Tika server and populate the parser
        state from the response.

        Sets ``self.text`` (extracted content with runs of spaces and of
        newlines collapsed), ``self.date`` (when a parsable "Creation-Date"
        metadata entry is present) and ``self.archive_path`` (the PDF produced
        by ``convert_md_to_pdf``).

        Raises ParseError when the Tika request fails.
        """
        self.log("info", f"Sending {document_path} to Tika server")
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{tika_server}: {err}",
            ) from err

        # Use the extracted content, not str(parsed): the latter is the repr
        # of the whole response dict (metadata included, newlines escaped),
        # which is not the document text. Content may be missing or None.
        content = parsed.get("content") or ""
        text = re.sub(" +", " ", content)
        text = re.sub("\n+", "\n", text)
        self.text = text

        # The date is optional: any failure here is logged and otherwise
        # ignored so that parsing can continue.
        try:
            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
        except Exception as e:
            self.log(
                "warning",
                f"Unable to extract date for document {document_path}: {e}",
            )

        md_path = self.convert_to_md(document_path, file_name)
        self.archive_path = self.convert_md_to_pdf(md_path)
    def convert_md_to_pdf(self, md_path):
        """
        Convert the markdown file at ``md_path`` to a PDF via Gotenberg.

        The markdown file is uploaded together with an ``index.html`` wrapper
        that embeds it through Gotenberg's ``toHTML`` template function. The
        resulting PDF is written to ``convert.pdf`` inside ``self.tempdir``.

        Returns the path of the written PDF.
        Raises ParseError when the markdown file cannot be read or the
        conversion request fails.
        """
        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/forms/chromium/convert/markdown"
        md_name = os.path.basename(md_path)

        self.log("info", f"Converting {md_path} to PDF as {pdf_path}")

        # The name passed to toHTML must match the uploaded markdown file
        # name, so derive it from md_path instead of hard-coding it.
        html = (
            "<!doctype html>\n"
            '<html lang="en">\n'
            "  <head>\n"
            '    <meta charset="utf-8">\n'
            "    <title>My PDF</title>\n"
            "  </head>\n"
            "  <body>\n"
            f'    {{{{ toHTML "{md_name}" }}}}\n'
            "  </body>\n"
            "</html>\n"
        )

        try:
            # Upload the real markdown file rather than placeholder text.
            with open(md_path, "rb") as md_file:
                files = {
                    "md": (md_name, md_file),
                    "html": ("index.html", html),
                }
                response = requests.post(url, files=files)
            response.raise_for_status()  # ensure we notice bad responses
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

        with open(pdf_path, "wb") as file:
            file.write(response.content)

        return pdf_path
    def convert_to_md(self, document_path, file_name):
        """
        Write the markdown stand-in for ``document_path`` into the temp dir.

        The file ``convert.md`` is created inside ``self.tempdir`` and filled
        with fixed placeholder content; ``file_name`` is currently unused.

        Returns the path of the written markdown file.
        """
        target = os.path.join(self.tempdir, "convert.md")
        self.log("info", f"Converting {document_path} to markdown as {target}")

        # Fixed placeholder content: a heading followed by a single paragraph.
        placeholder = "".join(("# Subject", "\n\n", "blah"))
        with open(target, "w") as handle:
            handle.write(placeholder)

        return target
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs):
            "text/rtf": ".rtf",
        },
    }
 def get_parser_eml(*args, **kwargs):
    """Create a TikaDocumentParserEml, forwarding every argument to it."""
    # The import happens at call time, inside the function, not at module
    # import time.
    from .parsers import TikaDocumentParserEml as parser_cls

    return parser_cls(*args, **kwargs)
 def tika_consumer_declaration_eml(sender, **kwargs):
    """
    Return the parser declaration for e-mail files.

    The declaration names the parser factory, its weight, and the mapping of
    handled MIME types to their default file extension.
    """
    supported_types = {"message/rfc822": ".eml"}
    declaration = dict(
        parser=get_parser_eml,
        weight=10,
        mime_types=supported_types,
    )
    return declaration