mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Merge branch 'erikarvstedt-inotify'
This commit is contained in:
		
						commit
						52b32fddc9
					
				| @ -49,17 +49,18 @@ The Consumer | ||||
| ------------ | ||||
| 
 | ||||
| The consumer script runs in an infinite loop, constantly looking at a directory | ||||
| for PDF files to parse and index.  The process is pretty straightforward: | ||||
| for documents to parse and index.  The process is pretty straightforward: | ||||
| 
 | ||||
| 1. Look in ``CONSUMPTION_DIR`` for a PDF.  If one is found, go to #2.  If not, | ||||
|    wait 10 seconds and try again. | ||||
| 2. Parse the PDF with Tesseract | ||||
| 1. Look in ``CONSUMPTION_DIR`` for a document.  If one is found, go to #2. | ||||
|    If not, wait 10 seconds and try again.  On Linux, new documents are detected | ||||
|    instantly via inotify, so there's no waiting involved. | ||||
| 2. Parse the document with Tesseract | ||||
| 3. Create a new record in the database with the OCR'd text | ||||
| 4. Attempt to automatically assign document attributes by doing some guesswork. | ||||
|    Read up on the :ref:`guesswork documentation<guesswork>` for more | ||||
|    information about this process. | ||||
| 5. Encrypt the PDF and store it in the ``media`` directory under | ||||
|    ``documents/pdf``. | ||||
| 5. Encrypt the document and store it in the ``media`` directory under | ||||
|    ``documents/originals``. | ||||
| 6. Go to #1. | ||||
| 
 | ||||
| 
 | ||||
| @ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script: | ||||
| 
 | ||||
|     $ /path/to/paperless/src/manage.py document_consumer | ||||
| 
 | ||||
| This starts the service that will run in a loop, consuming PDF files as they | ||||
| appear in ``CONSUMPTION_DIR``. | ||||
| This starts the service that will consume documents as they appear in | ||||
| ``CONSUMPTION_DIR``. | ||||
| 
 | ||||
| Note that this command runs continuously, so exiting it will mean your consumer | ||||
| disappears.  If you want to run this full-time (which is kind of the point) | ||||
| @ -97,8 +98,8 @@ The Exporter | ||||
| ------------ | ||||
| 
 | ||||
| Tired of fiddling with Paperless, or just want to do something stupid and are | ||||
| afraid of accidentally damaging your files?  You can export all of your PDFs | ||||
| into neatly named, dated, and unencrypted. | ||||
| afraid of accidentally damaging your files?  You can export all of your | ||||
| documents into neatly named, dated, and unencrypted files. | ||||
| 
 | ||||
| 
 | ||||
| .. _utilities-exporter-howto: | ||||
| @ -112,10 +113,10 @@ This too is done via the ``manage.py`` script: | ||||
| 
 | ||||
|     $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ | ||||
| 
 | ||||
| This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you | ||||
| to do with as you please.  The files are accompanied with a special file, | ||||
| ``manifest.json`` which can be used to | ||||
| :ref:`import the files <utilities-importer>` at a later date if you wish. | ||||
| This will dump all of your unencrypted documents into ``/path/to/somewhere`` | ||||
| for you to do with as you please.  The files are accompanied by a special | ||||
| file, ``manifest.json``, which can be used to :ref:`import the files | ||||
| <utilities-importer>` at a later date if you wish. | ||||
| 
 | ||||
| 
 | ||||
| .. _utilities-exporter-howto-docker: | ||||
|  | ||||
| @ -165,6 +165,8 @@ PAPERLESS_PASSPHRASE="secret" | ||||
| #PAPERLESS_CONVERT_DENSITY=300 | ||||
| 
 | ||||
| 
 | ||||
| # (This setting is ignored on Linux where inotify is used instead of a | ||||
| # polling loop.) | ||||
| # The number of seconds that Paperless will wait between checking | ||||
| # PAPERLESS_CONSUMPTION_DIR.  If you tend to write documents to this directory | ||||
| # rarely, you may want to use a higher value than the default (10). | ||||
|  | ||||
| @ -20,6 +20,7 @@ flake8==3.5.0 | ||||
| fuzzywuzzy==0.15.0 | ||||
| gunicorn==19.8.1 | ||||
| idna==2.6 | ||||
| inotify_simple==1.1.7; sys_platform == 'linux' | ||||
| langdetect==1.0.7 | ||||
| mccabe==0.6.1 | ||||
| more-itertools==4.1.0 | ||||
|  | ||||
| @ -3,8 +3,10 @@ import hashlib | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import time | ||||
| import uuid | ||||
| 
 | ||||
| from operator import itemgetter | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from paperless.db import GnuPG | ||||
| @ -32,21 +34,21 @@ class Consumer: | ||||
|       5. Delete the document and image(s) | ||||
|     """ | ||||
| 
 | ||||
|     # Files are considered ready for consumption if they have been unmodified | ||||
|     # for this duration | ||||
|     FILES_MIN_UNMODIFIED_DURATION = 0.5 | ||||
| 
 | ||||
|     def __init__(self, consume=settings.CONSUMPTION_DIR, | ||||
|                  scratch=settings.SCRATCH_DIR): | ||||
| 
 | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|         self.logging_group = None | ||||
| 
 | ||||
|         self.stats = {} | ||||
|         self._ignore = [] | ||||
|         self.consume = consume | ||||
|         self.scratch = scratch | ||||
| 
 | ||||
|         try: | ||||
|             os.makedirs(self.scratch) | ||||
|         except FileExistsError: | ||||
|             pass | ||||
|         os.makedirs(self.scratch, exist_ok=True) | ||||
| 
 | ||||
|         if not self.consume: | ||||
|             raise ConsumerError( | ||||
| @ -73,83 +75,99 @@ class Consumer: | ||||
|             "group": self.logging_group | ||||
|         }) | ||||
| 
 | ||||
|     def run(self): | ||||
|     def consume_new_files(self): | ||||
|         """ | ||||
|         Find non-ignored files in consumption dir and consume them if they have | ||||
|         been unmodified for FILES_MIN_UNMODIFIED_DURATION. | ||||
|         """ | ||||
|         ignored_files = [] | ||||
|         files = [] | ||||
|         for entry in os.scandir(self.consume): | ||||
|             if entry.is_file(): | ||||
|                 file = (entry.path, entry.stat().st_mtime) | ||||
|                 if file in self._ignore: | ||||
|                     ignored_files.append(file) | ||||
|                 else: | ||||
|                     files.append(file) | ||||
| 
 | ||||
|         for doc in os.listdir(self.consume): | ||||
|         if not files: | ||||
|             return | ||||
| 
 | ||||
|             doc = os.path.join(self.consume, doc) | ||||
|         # Set _ignore to only include files that still exist. | ||||
|         # This keeps it from growing indefinitely. | ||||
|         self._ignore[:] = ignored_files | ||||
| 
 | ||||
|             if not os.path.isfile(doc): | ||||
|                 continue | ||||
|         files_old_to_new = sorted(files, key=itemgetter(1)) | ||||
| 
 | ||||
|             if not re.match(FileInfo.REGEXES["title"], doc): | ||||
|                 continue | ||||
|         time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) | ||||
| 
 | ||||
|             if doc in self._ignore: | ||||
|                 continue | ||||
|         for file, mtime in files_old_to_new: | ||||
|             if mtime == os.path.getmtime(file): | ||||
|                 # File has not been modified and can be consumed | ||||
|                 if not self.try_consume_file(file): | ||||
|                     self._ignore.append((file, mtime)) | ||||
| 
 | ||||
|             if not self._is_ready(doc): | ||||
|                 continue | ||||
|     def try_consume_file(self, file): | ||||
|         "Return True if file was consumed" | ||||
| 
 | ||||
|             if self._is_duplicate(doc): | ||||
|                 self.log( | ||||
|                     "info", | ||||
|                     "Skipping {} as it appears to be a duplicate".format(doc) | ||||
|                 ) | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|         if not re.match(FileInfo.REGEXES["title"], file): | ||||
|             return False | ||||
| 
 | ||||
|             parser_class = self._get_parser_class(doc) | ||||
|             if not parser_class: | ||||
|                 self.log( | ||||
|                     "error", "No parsers could be found for {}".format(doc)) | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|         doc = file | ||||
| 
 | ||||
|             self.logging_group = uuid.uuid4() | ||||
|         if self._is_duplicate(doc): | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Skipping {} as it appears to be a duplicate".format(doc) | ||||
|             ) | ||||
|             return False | ||||
| 
 | ||||
|             self.log("info", "Consuming {}".format(doc)) | ||||
|         parser_class = self._get_parser_class(doc) | ||||
|         if not parser_class: | ||||
|             self.log( | ||||
|                 "error", "No parsers could be found for {}".format(doc)) | ||||
|             return False | ||||
| 
 | ||||
|             document_consumption_started.send( | ||||
|                 sender=self.__class__, | ||||
|                 filename=doc, | ||||
|                 logging_group=self.logging_group | ||||
|         self.logging_group = uuid.uuid4() | ||||
| 
 | ||||
|         self.log("info", "Consuming {}".format(doc)) | ||||
| 
 | ||||
|         document_consumption_started.send( | ||||
|             sender=self.__class__, | ||||
|             filename=doc, | ||||
|             logging_group=self.logging_group | ||||
|         ) | ||||
| 
 | ||||
|         parsed_document = parser_class(doc) | ||||
| 
 | ||||
|         try: | ||||
|             thumbnail = parsed_document.get_thumbnail() | ||||
|             date = parsed_document.get_date() | ||||
|             document = self._store( | ||||
|                 parsed_document.get_text(), | ||||
|                 doc, | ||||
|                 thumbnail, | ||||
|                 date | ||||
|             ) | ||||
|         except ParseError as e: | ||||
|             self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|             parsed_document.cleanup() | ||||
|             return False | ||||
|         else: | ||||
|             parsed_document.cleanup() | ||||
|             self._cleanup_doc(doc) | ||||
| 
 | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Document {} consumption finished".format(document) | ||||
|             ) | ||||
| 
 | ||||
|             parsed_document = parser_class(doc) | ||||
| 
 | ||||
|             try: | ||||
|                 thumbnail = parsed_document.get_thumbnail() | ||||
|                 date = parsed_document.get_date() | ||||
|                 document = self._store( | ||||
|                     parsed_document.get_text(), | ||||
|                     doc, | ||||
|                     thumbnail, | ||||
|                     date | ||||
|                 ) | ||||
|             except ParseError as e: | ||||
| 
 | ||||
|                 self._ignore.append(doc) | ||||
|                 self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|                 parsed_document.cleanup() | ||||
| 
 | ||||
|                 continue | ||||
| 
 | ||||
|             else: | ||||
| 
 | ||||
|                 parsed_document.cleanup() | ||||
|                 self._cleanup_doc(doc) | ||||
| 
 | ||||
|                 self.log( | ||||
|                     "info", | ||||
|                     "Document {} consumption finished".format(document) | ||||
|                 ) | ||||
| 
 | ||||
|                 document_consumption_finished.send( | ||||
|                     sender=self.__class__, | ||||
|                     document=document, | ||||
|                     logging_group=self.logging_group | ||||
|                 ) | ||||
|             document_consumption_finished.send( | ||||
|                 sender=self.__class__, | ||||
|                 document=document, | ||||
|                 logging_group=self.logging_group | ||||
|             ) | ||||
|             return True | ||||
| 
 | ||||
|     def _get_parser_class(self, doc): | ||||
|         """ | ||||
| @ -224,22 +242,6 @@ class Consumer: | ||||
|         self.log("debug", "Deleting document {}".format(doc)) | ||||
|         os.unlink(doc) | ||||
| 
 | ||||
|     def _is_ready(self, doc): | ||||
|         """ | ||||
|         Detect whether ``doc`` is ready to consume or if it's still being | ||||
|         written to by the uploader. | ||||
|         """ | ||||
| 
 | ||||
|         t = os.stat(doc).st_mtime | ||||
| 
 | ||||
|         if self.stats.get(doc) == t: | ||||
|             del(self.stats[doc]) | ||||
|             return True | ||||
| 
 | ||||
|         self.stats[doc] = t | ||||
| 
 | ||||
|         return False | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _is_duplicate(doc): | ||||
|         with open(doc, "rb") as f: | ||||
|  | ||||
| @ -20,7 +20,7 @@ class MailFetcherError(Exception): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class InvalidMessageError(Exception): | ||||
| class InvalidMessageError(MailFetcherError): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| @ -75,6 +75,9 @@ class Message(Loggable): | ||||
|                 continue | ||||
| 
 | ||||
|             dispositions = content_disposition.strip().split(";") | ||||
|             if len(dispositions) < 2: | ||||
|                 continue | ||||
| 
 | ||||
|             if not dispositions[0].lower() == "attachment" and \ | ||||
|                "filename" not in dispositions[1].lower(): | ||||
|                 continue | ||||
| @ -159,8 +162,10 @@ class MailFetcher(Loggable): | ||||
|         self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX") | ||||
| 
 | ||||
|         self._enabled = bool(self._host) | ||||
|         if self._enabled and Message.SECRET is None: | ||||
|             raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined") | ||||
| 
 | ||||
|         self.last_checked = datetime.datetime.now() | ||||
|         self.last_checked = time.time() | ||||
|         self.consume = consume | ||||
| 
 | ||||
|     def pull(self): | ||||
| @ -187,7 +192,7 @@ class MailFetcher(Loggable): | ||||
|                     f.write(message.attachment.data) | ||||
|                     os.utime(file_name, times=(t, t)) | ||||
| 
 | ||||
|         self.last_checked = datetime.datetime.now() | ||||
|         self.last_checked = time.time() | ||||
| 
 | ||||
|     def _get_messages(self): | ||||
| 
 | ||||
| @ -205,7 +210,7 @@ class MailFetcher(Loggable): | ||||
|             self._connection.close() | ||||
|             self._connection.logout() | ||||
| 
 | ||||
|         except Exception as e: | ||||
|         except MailFetcherError as e: | ||||
|             self.log("error", str(e)) | ||||
| 
 | ||||
|         return r | ||||
|  | ||||
| @ -1,6 +1,7 @@ | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| import sys | ||||
| import time | ||||
| 
 | ||||
| from django.conf import settings | ||||
| @ -9,6 +10,11 @@ from django.core.management.base import BaseCommand, CommandError | ||||
| from ...consumer import Consumer, ConsumerError | ||||
| from ...mail import MailFetcher, MailFetcherError | ||||
| 
 | ||||
| try: | ||||
|     from inotify_simple import INotify, flags | ||||
| except ImportError: | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class Command(BaseCommand): | ||||
|     """ | ||||
| @ -53,13 +59,20 @@ class Command(BaseCommand): | ||||
|             action="store_true", | ||||
|             help="Run only once." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--no-inotify", | ||||
|             action="store_true", | ||||
|             help="Don't use inotify, even if it's available." | ||||
|         ) | ||||
| 
 | ||||
|     def handle(self, *args, **options): | ||||
| 
 | ||||
|         self.verbosity = options["verbosity"] | ||||
|         directory = options["directory"] | ||||
|         loop_time = options["loop_time"] | ||||
|         mail_delta = datetime.timedelta(minutes=options["mail_delta"]) | ||||
|         mail_delta = options["mail_delta"] * 60 | ||||
|         use_inotify = (not options["no_inotify"] | ||||
|                        and "inotify_simple" in sys.modules) | ||||
| 
 | ||||
|         try: | ||||
|             self.file_consumer = Consumer(consume=directory) | ||||
| @ -67,39 +80,68 @@ class Command(BaseCommand): | ||||
|         except (ConsumerError, MailFetcherError) as e: | ||||
|             raise CommandError(e) | ||||
| 
 | ||||
|         for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS): | ||||
|             try: | ||||
|                 os.makedirs(path) | ||||
|             except FileExistsError: | ||||
|                 pass | ||||
|         for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS): | ||||
|             os.makedirs(d, exist_ok=True) | ||||
| 
 | ||||
|         logging.getLogger(__name__).info( | ||||
|             "Starting document consumer at {}".format(directory) | ||||
|             "Starting document consumer at {}{}".format( | ||||
|                 directory, | ||||
|                 " with inotify" if use_inotify else "" | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|         if options["oneshot"]: | ||||
|             self.loop(mail_delta=mail_delta) | ||||
|             self.loop_step(mail_delta) | ||||
|         else: | ||||
|             try: | ||||
|                 while True: | ||||
|                     self.loop(mail_delta=mail_delta) | ||||
|                     time.sleep(loop_time) | ||||
|                     if self.verbosity > 1: | ||||
|                         print(".", int(time.time())) | ||||
|                 if use_inotify: | ||||
|                     self.loop_inotify(mail_delta) | ||||
|                 else: | ||||
|                     self.loop(loop_time, mail_delta) | ||||
|             except KeyboardInterrupt: | ||||
|                 print("Exiting") | ||||
| 
 | ||||
|     def loop(self, mail_delta): | ||||
|     def loop(self, loop_time, mail_delta): | ||||
|         while True: | ||||
|             start_time = time.time() | ||||
|             if self.verbosity > 1: | ||||
|                 print(".", int(start_time)) | ||||
|             self.loop_step(mail_delta, start_time) | ||||
|             # Sleep until the start of the next loop step | ||||
|             time.sleep(max(0, start_time + loop_time - time.time())) | ||||
| 
 | ||||
|     def loop_step(self, mail_delta, time_now=None): | ||||
| 
 | ||||
|         # Occasionally fetch mail and store it to be consumed on the next loop | ||||
|         # We fetch email when we first start up so that it is not necessary to | ||||
|         # wait for 10 minutes after making changes to the config file. | ||||
|         delta = self.mail_fetcher.last_checked + mail_delta | ||||
|         if self.first_iteration or delta < datetime.datetime.now(): | ||||
|         next_mail_time = self.mail_fetcher.last_checked + mail_delta | ||||
|         if self.first_iteration or time_now > next_mail_time: | ||||
|             self.first_iteration = False | ||||
|             self.mail_fetcher.pull() | ||||
| 
 | ||||
|         # Consume whatever files we can. | ||||
|         # We have to run twice as the first run checks for file readiness | ||||
|         for i in range(2): | ||||
|             self.file_consumer.run() | ||||
|         self.file_consumer.consume_new_files() | ||||
| 
 | ||||
|     def loop_inotify(self, mail_delta): | ||||
|         directory = self.file_consumer.consume | ||||
|         inotify = INotify() | ||||
|         inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) | ||||
| 
 | ||||
|         # Run initial mail fetch and consume all currently existing documents | ||||
|         self.loop_step(mail_delta) | ||||
|         next_mail_time = self.mail_fetcher.last_checked + mail_delta | ||||
| 
 | ||||
|         while True: | ||||
|             # Consume documents until next_mail_time | ||||
|             while True: | ||||
|                 delta = next_mail_time - time.time() | ||||
|                 if delta > 0: | ||||
|                     for event in inotify.read(timeout=delta): | ||||
|                         file = os.path.join(directory, event.name) | ||||
|                         if os.path.isfile(file): | ||||
|                             self.file_consumer.try_consume_file(file) | ||||
|                 else: | ||||
|                     break | ||||
| 
 | ||||
|             self.mail_fetcher.pull() | ||||
|             next_mail_time = self.mail_fetcher.last_checked + mail_delta | ||||
|  | ||||
| @ -246,6 +246,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") | ||||
| # This is where Paperless will look for PDFs to index | ||||
| CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") | ||||
| 
 | ||||
| # (This setting is ignored on Linux where inotify is used instead of a | ||||
| # polling loop.) | ||||
| # The number of seconds that Paperless will wait between checking | ||||
| # CONSUMPTION_DIR.  If you tend to write documents to this directory very | ||||
| # slowly, you may want to use a higher value than the default. | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user