Mirror of https://github.com/paperless-ngx/paperless-ngx.git
Synced 2025-11-03 19:17:13 -05:00

Merge branch 'dev'

This commit is contained in:
commit 5573a84335

Pipfile (4 lines changed)
@@ -8,6 +8,9 @@ url = "https://www.piwheels.org/simple"
 verify_ssl = true
 name = "piwheels"
 
+[requires]
+python_version = "3.6"
+
 [packages]
 dateparser = "~=0.7.6"
 django = "~=3.1.3"
@@ -35,6 +38,7 @@ scikit-learn="~=0.23.2"
 whitenoise = "~=5.2.0"
 watchdog = "*"
 whoosh="~=2.7.4"
+inotify-simple = "*"
 
 [dev-packages]
 coveralls = "*"
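The new ``[requires]`` section pins the interpreter for the whole project. A quick way to see its effect (a sketch, assuming pipenv is installed on a machine without Python 3.6):

    $ cd /path/to/paperless
    $ pipenv install
    # pipenv now reads [requires] and warns that Python 3.6 was not found,
    # instead of silently building the virtualenv with another version.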
Pipfile.lock (50 lines changed, generated)
@@ -1,10 +1,12 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244"
+            "sha256": "d6432a18280c092c108e998f00bcd377c0c55ef18f26cb0b8eb64f9618b9f383"
         },
         "pipfile-spec": 6,
-        "requires": {},
+        "requires": {
+            "python_version": "3.6"
+        },
         "sources": [
             {
                 "name": "pypi",
@@ -129,6 +131,14 @@
             "index": "pypi",
             "version": "==0.32.0"
         },
+        "inotify-simple": {
+            "hashes": [
+                "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
+                "sha256:854f9ac752cc1fcff6ca34e9d3d875c9a94c9b7d6eb377f63be2d481a566c6ee"
+            ],
+            "index": "pypi",
+            "version": "==1.3.5"
+        },
         "joblib": {
             "hashes": [
                 "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
@@ -663,11 +673,11 @@
         },
         "faker": {
             "hashes": [
-                "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81",
-                "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473"
+                "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997",
+                "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e"
             ],
             "markers": "python_version >= '3.5'",
-            "version": "==4.17.0"
+            "version": "==4.17.1"
         },
         "filelock": {
             "hashes": [
@@ -693,6 +703,22 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.2.0"
         },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
+                "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==2.1.0"
+        },
+        "importlib-resources": {
+            "hashes": [
+                "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592",
+                "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5"
+            ],
+            "markers": "python_version < '3.7'",
+            "version": "==3.3.0"
+        },
         "iniconfig": {
             "hashes": [
                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -999,11 +1025,19 @@
         },
         "virtualenv": {
             "hashes": [
-                "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2",
-                "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380"
+                "sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7",
+                "sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5"
            ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==20.1.0"
+            "version": "==20.2.1"
         },
+        "zipp": {
+            "hashes": [
+                "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
+                "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==3.4.0"
+        }
     }
 }
@@ -15,7 +15,7 @@ services:
       POSTGRES_PASSWORD: paperless
 
   webserver:
-    image: jonaswinkler/paperless-ng:0.9.2
+    image: jonaswinkler/paperless-ng:0.9.3
     restart: always
     depends_on:
       - db
@@ -5,7 +5,7 @@ services:
     restart: always
 
   webserver:
-    image: jonaswinkler/paperless-ng:0.9.2
+    image: jonaswinkler/paperless-ng:0.9.3
    restart: always
    depends_on:
      - broker
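With the tag in both compose files bumped to 0.9.3, an existing install picks up the new image on the next pull. The usual upgrade sequence for the docker route:

    $ cd /path/to/paperless
    $ docker-compose pull
    $ docker-compose up -d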
@@ -30,7 +30,7 @@ Options available to docker installations:
     Paperless uses 3 volumes:
 
     *   ``paperless_media``: This is where your documents are stored.
-    *   ``paperless_data``: This is where auxilliary data is stored. This
+    *   ``paperless_data``: This is where auxillary data is stored. This
         folder also contains the SQLite database, if you use it.
     *   ``paperless_pgdata``: Exists only if you use PostgreSQL and contains
         the database.
@@ -109,7 +109,7 @@ B.  If you built the image yourself, grab the new archive and replace your current installation.
 .. hint::
 
     You can usually keep your ``docker-compose.env`` file, since this file will
-    never include mandantory configuration options. However, it is worth checking
+    never include mandatory configuration options. However, it is worth checking
     out the new version of this file, since it might have new recommendations
     on what to configure.
 
@@ -126,8 +126,8 @@ After grabbing the new release and unpacking the contents, do the following:
 
         $ pip install --upgrade pipenv
         $ cd /path/to/paperless
-        $ pipenv install
+        $ pipenv clean
+        $ pipenv install
 
     This creates a new virtual environment (or uses your existing environment)
     and installs all dependencies into it.
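``pipenv clean`` uninstalls everything that is no longer listed in ``Pipfile.lock``, so stale dependencies don't linger across upgrades. To preview what would be removed first (a sketch, assuming a reasonably recent pipenv):

    $ cd /path/to/paperless
    $ pipenv clean --dry-run   # only prints the packages that would be uninstalled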
@@ -247,12 +247,12 @@ your already processed documents.
 
 When multiple document types or correspondents match a single document,
 the retagger won't assign these to the document. Specify ``--use-first``
-to override this behaviour and just use the first correspondent or type
+to override this behavior and just use the first correspondent or type
 it finds. This option does not apply to tags, since any amount of tags
 can be applied to a document.
 
 Finally, ``-f`` specifies that you wish to overwrite already assigned
-correspondents, types and/or tags. The default behaviour is to not
+correspondents, types and/or tags. The default behavior is to not
 assign correspondents and types to documents that have this data already
 assigned. ``-f`` works differently for tags: By default, only additional tags get
 added to documents, no tags will be removed. With ``-f``, tags that don't
@@ -341,7 +341,7 @@ Documents can be stored in Paperless using GnuPG encryption.
 
 .. danger::
 
-    Encryption is depreceated since paperless-ng 0.9 and doesn't really provide any
+    Encryption is deprecated since paperless-ng 0.9 and doesn't really provide any
     additional security, since you have to store the passphrase in a configuration
     file on the same system as the encrypted documents for paperless to work.
     Furthermore, the entire text content of the documents is stored plain in the
@@ -353,39 +353,23 @@ Documents can be stored in Paperless using GnuPG encryption.
     Consider running paperless on an encrypted filesystem instead, which will then
     at least provide security against physical hardware theft.
 
-.. code::
-
-    change_storage_type [--passphrase PASSPHRASE] {gpg,unencrypted} {gpg,unencrypted}
-
-    positional arguments:
-      {gpg,unencrypted}     The state you want to change your documents from
-      {gpg,unencrypted}     The state you want to change your documents to
-
-    optional arguments:
-      --passphrase PASSPHRASE
-
 Enabling encryption
 -------------------
 
-Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
-
-(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
-
-.. code::
-
-    change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
+Enabling encryption is no longer supported.
 
 
 Disabling encryption
 --------------------
 
-Basic usage to enable encryption of your document store:
+Basic usage to disable encryption of your document store:
 
-(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
+(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
 
 .. code::
 
-    change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted
+    decrypt_documents [--passphrase SECR3TP4SSPHRA$E]
 
 
 .. _Pipenv: https://pipenv.pypa.io/en/latest/
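On the docker route, management commands run inside the webserver container, so decrypting an existing store would look roughly like this (a sketch; omit ``--passphrase`` if ``PAPERLESS_PASSPHRASE`` is already set in the environment):

    $ cd /path/to/paperless
    $ docker-compose run --rm webserver decrypt_documents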
@@ -84,6 +84,8 @@ to the filename.
    PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
 
 
+.. _advanced-matching:
+
 Matching tags, correspondents and document types
 ################################################
 
@@ -145,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
 tag is set to *Auto*, this neural network will examine your documents and
 automatically learn when to assign this tag.
 
-There are a couple caveats you need to keep in mind when using this feature:
+Paperless tries to hide much of the involved complexity with this approach.
+However, there are a couple caveats you need to keep in mind when using this
+feature:
 
 * Changes to your documents are not immediately reflected by the matching
   algorithm. The neural network needs to be *trained* on your documents after
@@ -165,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
   has the correspondent "Very obscure web shop I bought something five years
   ago", it will probably not assign this correspondent automatically if you buy
   something from them again. The more documents, the better.
+* Paperless also needs a reasonable amount of negative examples to decide when
+  not to assign a certain tag, correspondent or type. This will usually be the
+  case as you start filling up paperless with documents. Example: If all your
+  documents are either from "Webshop" and "Bank", paperless will assign one of
+  these correspondents to ANY new document, if both are set to automatic matching.
 
 Hooking into the consumption process
 ####################################
@@ -253,7 +262,7 @@ By default, paperless stores your documents in the media directory and renames them
 using the identifier which it has assigned to each document. You will end up getting
 files like ``0000123.pdf`` in your media directory. This isn't necessarily a bad
 thing, because you normally don't have to access these files manually. However, if
-you wish to name your files differently, you can do that by adjustng the
+you wish to name your files differently, you can do that by adjusting the
 ``PAPERLESS_FILENAME_FORMAT`` settings variable.
 
 This variable allows you to configure the filename (folders are allowed!) using
 | 
			
		||||
        my_new_shoes-0000004.pdf
 | 
			
		||||
 | 
			
		||||
Paperless appends the unique identifier of each document to the filename. This
 | 
			
		||||
avoides filename clashes.
 | 
			
		||||
avoids filename clashes.
 | 
			
		||||
 | 
			
		||||
.. danger::
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
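For illustration, a format string that sorts documents into per-correspondent folders could look like the line below; ``{correspondent}``, ``{created}`` and ``{title}`` are placeholders documented alongside the setting, and the unique identifier is still appended automatically:

    PAPERLESS_FILENAME_FORMAT={correspondent}/{created}-{title}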
@@ -94,7 +94,7 @@ Result object:
     }
 
 *   ``id``: the primary key of the found document
-*   ``highlights``: an object containing parseable highlights for the result.
+*   ``highlights``: an object containing parsable highlights for the result.
     See below.
 *   ``score``: The score assigned to the document. A higher score indicates a
     better match with the query. Search results are sorted descending by score.
@@ -5,6 +5,24 @@
 Changelog
 *********
 
+paperless-ng 0.9.3
+##################
+
+* Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
+  You have to specify your username.
+* Added a simple sanity checker that checks your documents for missing or orphaned files,
+  files with wrong checksums, inaccessible files, and documents with empty content.
+* It is no longer possible to encrypt your documents. For the time being, paperless will
+  continue to operate with already encrypted documents.
+* Fixes:
+
+  * Paperless now uses inotify again, since the watchdog was causing issues which I was not
+    aware of.
+  * Issue with the automatic classifier not working with only one tag.
+  * A couple issues with the search index being opened to eagerly.
+
+* Added lots of tests for various parts of the application.
+
 paperless-ng 0.9.2
 ##################
 
@@ -52,7 +70,7 @@ paperless-ng 0.9.0
 * **Added:** New frontend. Features:
 
   * Single page application: It's much more responsive than the django admin pages.
-  * Dashboard. Shows recently scanned documents, or todos, or other documents
+  * Dashboard. Shows recently scanned documents, or todo notes, or other documents
     at wish. Allows uploading of documents. Shows basic statistics.
   * Better document list with multiple display options.
   * Full text search with result highlighting, auto completion and scoring based
@@ -102,7 +120,7 @@ paperless-ng 0.9.0
 
 * **Modified [breaking]:** PostgreSQL:
 
-  * If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses postgresql instead of sqlite.
+  * If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses PostgreSQL instead of SQLite.
     Username, database and password all default to ``paperless`` if not specified.
 
 * **Modified [breaking]:** document_retagger management command rework. See
@@ -130,7 +148,7 @@ paperless-ng 0.9.0
     Certain language specifics such as umlauts may not get picked up properly.
   * ``PAPERLESS_DEBUG`` defaults to ``false``.
   * The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
-    sqlite.
+    SQLite.
   * ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
     ``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
   * ``PAPERLESS_OPTIMIZE_THUMBNAILS`` allows you to disable or enable thumbnail
@@ -138,8 +156,11 @@ paperless-ng 0.9.0
 
 * Many more small changes here and there. The usual stuff.
 
+Paperless
+#########
+
 2.7.0
-#####
+=====
 
 * `syntonym`_ submitted a pull request to catch IMAP connection errors `#475`_.
 * `Stéphane Brunner`_ added ``psycopg2`` to the Pipfile `#489`_.  He also fixed
@@ -156,7 +177,7 @@ paperless-ng 0.9.0
 
 
 2.6.1
-#####
+=====
 
 * We now have a logo, complete with a favicon :-)
 * Removed some problematic tests.
@@ -168,7 +189,7 @@ paperless-ng 0.9.0
 
 
 2.6.0
-#####
+=====
 
 * Allow an infinite number of logs to be deleted.  Thanks to `Ulli`_ for noting
   the problem in `#433`_.
@@ -189,7 +210,7 @@ paperless-ng 0.9.0
 
 
 2.5.0
-#####
+=====
 
 * **New dependency**: Paperless now optimises thumbnail generation with
   `optipng`_, so you'll need to install that somewhere in your PATH or declare
@@ -233,7 +254,7 @@ paperless-ng 0.9.0
 
 
 2.4.0
-#####
+=====
 
 * A new set of actions are now available thanks to `jonaswinkler`_'s very first
   pull request!  You can now do nifty things like tag documents in bulk, or set
@@ -254,7 +275,7 @@ paperless-ng 0.9.0
 
 
 2.3.0
-#####
+=====
 
 * Support for consuming plain text & markdown documents was added by
   `Joshua Taillon`_!  This was a long-requested feature, and it's addition is
@@ -272,14 +293,14 @@ paperless-ng 0.9.0
 
 
 2.2.1
-#####
+=====
 
 * `Kyle Lucy`_ reported a bug quickly after the release of 2.2.0 where we broke
   the ``DISABLE_LOGIN`` feature: `#392`_.
 
 
 2.2.0
-#####
+=====
 
 * Thanks to `dadosch`_, `Wolfgang Mader`_, and `Tim Brooks`_ this is the first
   version of Paperless that supports Django 2.0!  As a result of their hard
@@ -296,7 +317,7 @@ paperless-ng 0.9.0
 
 
 2.1.0
-#####
+=====
 
 * `Enno Lohmeier`_ added three simple features that make Paperless a lot more
   user (and developer) friendly:
@@ -315,7 +336,7 @@ paperless-ng 0.9.0
 
 
 2.0.0
-#####
+=====
 
 This is a big release as we've changed a core-functionality of Paperless: we no
 longer encrypt files with GPG by default.
@@ -347,7 +368,7 @@ Special thanks to `erikarvstedt`_, `matthewmoto`_, and `mcronce`_ who did the
 bulk of the work on this big change.
 
 1.4.0
-#####
+=====
 
 * `Quentin Dawans`_ has refactored the document consumer to allow for some
   command-line options.  Notably, you can now direct it to consume from a
@@ -382,7 +403,7 @@ bulk of the work on this big change.
   to some excellent work from `erikarvstedt`_ on `#351`_
 
 1.3.0
-#####
+=====
 
 * You can now run Paperless without a login, though you'll still have to create
   at least one user.  This is thanks to a pull-request from `matthewmoto`_:
@@ -405,7 +426,7 @@ bulk of the work on this big change.
   problem and helping me find where to fix it.
 
 1.2.0
-#####
+=====
 
 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
   and `Pit`_.  This new image is dramatically smaller than the Debian-based
@@ -424,7 +445,7 @@ bulk of the work on this big change.
   in the document text.
 
 1.1.0
-#####
+=====
 
 * Fix for `#283`_, a redirect bug which broke interactions with
   paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it.
@@ -434,7 +455,7 @@ bulk of the work on this big change.
   `Dan Panzarella`_
 
 1.0.0
-#####
+=====
 
 * Upgrade to Django 1.11.  **You'll need to run
   ``pip install -r requirements.txt`` after the usual ``git pull`` to
@@ -453,14 +474,14 @@ bulk of the work on this big change.
   `Lukas Winkler`_'s issue `#278`_
 
 0.8.0
-#####
+=====
 
 * Paperless can now run in a subdirectory on a host (``/paperless``), rather
   than always running in the root (``/``) thanks to `maphy-psd`_'s work on
   `#255`_.
 
 0.7.0
-#####
+=====
 
 * **Potentially breaking change**: As per `#235`_, Paperless will no longer
   automatically delete documents attached to correspondents when those
@@ -472,7 +493,7 @@ bulk of the work on this big change.
   `Kusti Skytén`_ for posting the correct solution in the Github issue.
 
 0.6.0
-#####
+=====
 
 * Abandon the shared-secret trick we were using for the POST API in favour
   of BasicAuth or Django session.
@@ -486,7 +507,7 @@ bulk of the work on this big change.
   the help with this feature.
 
 0.5.0
-#####
+=====
 
 * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
   thanks to `Jake Gysland`_'s patch `#220`_.
@@ -504,13 +525,13 @@ bulk of the work on this big change.
   * Amended the Django Admin configuration to have nice headers (`#230`_)
 
 0.4.1
-#####
+=====
 
 * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
   all-caps suffixes like ``.PDF``
 
 0.4.0
-#####
+=====
 
 * Introducing reminders.  See `#199`_ for more information, but the short
   explanation is that you can now attach simple notes & times to documents
@@ -520,7 +541,7 @@ bulk of the work on this big change.
   like to make use of this feature in his project.
 
 0.3.6
-#####
+=====
 
 * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
   correspondent or the tags for a document.
@@ -534,7 +555,7 @@ bulk of the work on this big change.
   documentation is on its way.
 
 0.3.5
-#####
+=====
 
 * A serious facelift for the documents listing page wherein we drop the
   tabular layout in favour of a tiled interface.
@@ -545,7 +566,7 @@ bulk of the work on this big change.
   consumption.
 
 0.3.4
-#####
+=====
 
 * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
   Note that you *can* use Django Suit with Paperless, but only in a
@@ -558,26 +579,26 @@ bulk of the work on this big change.
   API thanks to @thomasbrueggemann.  See `#179`_.
 
 0.3.3
-#####
+=====
 
 * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
 * Timezone, items per page, and default language are now all configurable,
   also thanks to @ekw.
 
 0.3.2
-#####
+=====
 
 * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
   user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
   arise.
 
 0.3.1
-#####
+=====
 
 * Added a default value for ``CONVERT_BINARY``
 
 0.3.0
-#####
+=====
 
 * Updated to using django-filter 1.x
 * Added some system checks so new users aren't confused by misconfigurations.
@@ -590,7 +611,7 @@ bulk of the work on this big change.
   ``PAPERLESS_SHARED_SECRET`` respectively instead.
 
 0.2.0
-#####
+=====
 
 * `#150`_: The media root is now a variable you can set in
   ``paperless.conf``.
@@ -618,7 +639,7 @@ bulk of the work on this big change.
   to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
 
 0.1.1
-#####
+=====
 
 * Potentially **Breaking Change**: All references to "sender" in the code
   have been renamed to "correspondent" to better reflect the nature of the
@@ -642,7 +663,7 @@ bulk of the work on this big change.
   to be imported but made unavailable.
 
 0.1.0
-#####
+=====
 
 * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
   `Tikitu de Jager`_ for this one, and especially to `Pit`_
@@ -661,14 +682,14 @@ bulk of the work on this big change.
 * Added tox with pep8 checking
 
 0.0.6
-#####
+=====
 
 * Added support for parallel OCR (significant work from `Pit`_)
 * Sped up the language detection (significant work from `Pit`_)
 * Added simple logging
 
 0.0.5
-#####
+=====
 
 * Added support for image files as documents (png, jpg, gif, tiff)
 * Added a crude means of HTTP POST for document imports
@@ -677,7 +698,7 @@ bulk of the work on this big change.
 * Documentation for the above as well as data migration
 
 0.0.4
-#####
+=====
 
 * Added automated tagging basted on keyword matching
 * Cleaned up the document listing page
@@ -685,19 +706,19 @@ bulk of the work on this big change.
 * Added ``pytz`` to the list of requirements
 
 0.0.3
-#####
+=====
 
 * Added basic tagging
 
 0.0.2
-#####
+=====
 
 * Added language detection
 * Added datestamps to ``document_exporter``.
 * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
 
 0.0.1
-#####
+=====
 
 * Initial release
 
@@ -35,22 +35,22 @@ PAPERLESS_DBHOST=<hostname>
 
 PAPERLESS_DBPORT=<port>
     Adjust port if necessary.
-    
+
     Default is 5432.
 
 PAPERLESS_DBNAME=<name>
     Database name in PostgreSQL.
-    
+
     Defaults to "paperless".
 
 PAPERLESS_DBUSER=<name>
     Database user in PostgreSQL.
-    
+
     Defaults to "paperless".
 
 PAPERLESS_DBPASS=<password>
     Database password for PostgreSQL.
-    
+
     Defaults to "paperless".
 
 
@@ -69,7 +69,7 @@ PAPERLESS_CONSUMPTION_DIR=<path>
     Defaults to "../consume", relative to the "src" directory.
 
 PAPERLESS_DATA_DIR=<path>
-    This is where paperless stores all its data (search index, sqlite database,
+    This is where paperless stores all its data (search index, SQLite database,
     classification model, etc).
 
     Defaults to "../data", relative to the "src" directory.
@@ -100,7 +100,7 @@ Hosting & Security
 ##################
 
 PAPERLESS_SECRET_KEY=<key>
-    Paperless uses this to make session tokens. If you exose paperless on the
+    Paperless uses this to make session tokens. If you expose paperless on the
     internet, you need to change this, since the default secret is well known.
 
     Use any sequence of characters. The more, the better. You don't need to
@@ -113,7 +113,7 @@ PAPERLESS_ALLOWED_HOSTS<comma-separated-list>
     really should set this value to the domain name you're using.  Failing to do
     so leaves you open to HTTP host header attacks:
     https://docs.djangoproject.com/en/3.1/topics/security/#host-header-validation
-    
+
     Just remember that this is a comma-separated list, so "example.com" is fine,
     as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
 
@@ -132,15 +132,25 @@ PAPERLESS_FORCE_SCRIPT_NAME=<path>
     .. note::
 
         I don't know if this works in paperless-ng. Probably not.
-    
+
     Defaults to none, which hosts paperless at "/".
 
 PAPERLESS_STATIC_URL=<path>
     Override the STATIC_URL here.  Unless you're hosting Paperless off a
     subdomain like /paperless/, you probably don't need to change this.
-    
+
     Defaults to "/static/".
 
+PAPERLESS_AUTO_LOGIN_USERNAME=<username>
+    Specify a username here so that paperless will automatically perform login
+    with the selected user.
+
+    .. danger::
+
+        Do not use this when exposing paperless on the internet. There are no
+        checks in place that would prevent you from doing this.
+
+    Defaults to none, which disables this feature.
+
 Software tweaks
 ###############
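In ``paperless.conf`` or ``docker-compose.env`` the new option is a single line; the username below is a placeholder and must match an existing user account:

    PAPERLESS_AUTO_LOGIN_USERNAME=your_username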
@@ -156,11 +166,11 @@ PAPERLESS_THREADS_PER_WORKER=<num>
     in parallel on a single document.
 
     .. caution::
-        
+
         Ensure that the product
-        
+
             PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
-        
+
         does not exceed your CPU core count or else paperless will be extremely slow.
         If you want paperless to process many documents in parallel, choose a high
         worker count. If you want paperless to process very large documents faster,
@@ -197,10 +207,10 @@ PAPERLESS_OCR_PAGES=<num>
 PAPERLESS_OCR_LANGUAGE=<lang>
     Customize the default language that tesseract will attempt to use when
     parsing documents. The default language is used whenever
-    
+
     * No language could be detected on a document
     * No tesseract data files are available for the detected language
-    
+
     It should be a 3-letter language code consistent with ISO
     639: https://www.loc.gov/standards/iso639-2/php/code_list.php
 
@@ -220,7 +230,7 @@ PAPERLESS_CONSUMER_POLLING=<num>
     specify a polling interval in seconds here, which will then cause paperless
     to periodically check your consumption directory for changes.
 
-    Defaults to 0, which disables polling and uses filesystem notifiactions.
+    Defaults to 0, which disables polling and uses filesystem notifications.
 
 PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
     When the consumer detects a duplicate document, it will not touch the
@@ -234,7 +244,7 @@ PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
     such cases, try setting this to a reasonably low value, like 32.  The
     default is to use whatever is necessary to do everything without writing to
     disk, and units are in megabytes.
-    
+
     For more information on how to use this value, you should search
     the web for "MAGICK_MEMORY_LIMIT".
 
@@ -245,7 +255,7 @@ PAPERLESS_CONVERT_TMPDIR=<path>
     /tmp as tmpfs, you should set this to a path that's on a physical disk, like
     /home/your_user/tmp or something.  ImageMagick will use this as scratch space
     when crunching through very large documents.
-    
+
     For more information on how to use this value, you should search
     the web for "MAGICK_TMPDIR".
 
@@ -264,7 +274,7 @@ PAPERLESS_CONVERT_DENSITY=<num>
     Default is 300.
 
 PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
-    Use optipng to optimize thumbnails. This usually reduces the sice of
+    Use optipng to optimize thumbnails. This usually reduces the size of
     thumbnails by about 20%, but uses considerable compute time during
     consumption.
 
@@ -282,7 +292,7 @@ PAPERLESS_FILENAME_DATE_ORDER=<format>
     Use this setting to enable checking the document filename for date
     information. The date order can be set to any option as specified in
     https://dateparser.readthedocs.io/en/latest/settings.html#date-order.
-    The filename will be checked first, and if nothing is found, the document 
+    The filename will be checked first, and if nothing is found, the document
     text will be checked as normal.
 
     Defaults to none, which disables this feature.
 
@@ -85,7 +85,7 @@ quoted, or triple-quoted string will do:
     problematic_string = 'This is a "string" with "quotes" in it'
 
 In HTML templates, please use double-quotes for tag attributes, and single
-quotes for arguments passed to Django tempalte tags:
+quotes for arguments passed to Django template tags:
 
 .. code:: html
 
@@ -17,7 +17,7 @@ is
 
 .. caution::
 
-    Dont mess with this folder. Don't change permissions and don't move
+    Do not mess with this folder. Don't change permissions and don't move
     files around manually. This folder is meant to be entirely managed by docker
     and paperless.
 
@@ -36,9 +36,9 @@ file extensions do not matter.
 
 **A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
 The long answer is that certain parts of
-Paperless will run very slow, such as the tesseract OCR. On Rasperry Pi,
+Paperless will run very slow, such as the tesseract OCR. On Raspberry Pi,
 try to OCR documents before feeding them into paperless so that paperless can
-reuse the text. The web interface should be alot snappier, since it runs
+reuse the text. The web interface should be a lot snappier, since it runs
 in your browser and paperless has to do much less work to serve the data.
 
 .. note::
 
@@ -8,7 +8,7 @@ Scanner recommendations
 As Paperless operates by watching a folder for new files, doesn't care what
 scanner you use, but sometimes finding a scanner that will write to an FTP,
 NFS, or SMB server can be difficult.  This page is here to help you find one
-that works right for you based on recommentations from other Paperless users.
+that works right for you based on recommendations from other Paperless users.
 
 +---------+----------------+-----+-----+-----+----------------+
 | Brand   | Model          | Supports        | Recommended By |
 
@@ -21,7 +21,7 @@ Extensive filtering mechanisms:
 
 .. image:: _static/screenshots/documents-filter.png
 
-Side-by-side editing of documents. Optmized for 1080p.
+Side-by-side editing of documents. Optimized for 1080p.
 
 .. image:: _static/screenshots/editing.png
 
@@ -85,7 +85,7 @@ Paperless consists of the following components:
         needs to do from time to time in order to operate properly.
 
     This allows paperless to process multiple documents from your consumption folder in parallel! On
-    a modern multicore system, consumption with full ocr is blazing fast.
+    a modern multi core system, consumption with full ocr is blazing fast.
 
     The task processor comes with a built-in admin interface that you can use to see whenever any of the
     tasks fail and inspect the errors (i.e., wrong email credentials, errors during consuming a specific
@@ -265,15 +265,17 @@ Migration to paperless-ng is then performed in a few simple steps:
     ``docker-compose.env`` to your needs.
     See `docker route`_ for details on which edits are advised.
 
-6.  Start paperless-ng.
+6.  In order to find your existing documents with the new search feature, you need
+    to invoke a one-time operation that will create the search index:
 
-    .. code:: bash
+    .. code:: shell-session
 
-        $ docker-compose up
+        $ docker-compose run --rm webserver document_index reindex
+
+    This will migrate your database and create the search index. After that,
+    paperless will take care of maintaining the index by itself.
+
+    If you see everything working (you should see some migrations getting
+    applied, for instance), you can gracefully stop paperless-ng with Ctrl-C
+    and then start paperless-ng as usual with
+
+7.  Start paperless-ng.
 
     .. code:: bash
 
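For bare-metal installs, the same one-time reindex goes through ``manage.py`` (a sketch; the path depends on where the release was unpacked):

    $ cd /path/to/paperless/src
    $ pipenv run python3 manage.py document_index reindex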
@@ -281,11 +283,11 @@ Migration to paperless-ng is then performed in a few simple steps:
 
     This will run paperless in the background and automatically start it on system boot.
 
-7.  Paperless installed a permanent redirect to ``admin/`` in your browser. This
+8.  Paperless installed a permanent redirect to ``admin/`` in your browser. This
     redirect is still in place and prevents access to the new UI. Clear
     browsing cache in order to fix this.
 
-8.  Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
+9.  Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
 
 
 .. _setup-sqlite_to_psql:
@@ -322,7 +324,7 @@ management commands as below.
             $ cd /path/to/paperless
             $ docker-compose run --rm webserver /bin/bash
 
-        This will lauch the container and initialize the PostgreSQL database.
+        This will launch the container and initialize the PostgreSQL database.
 
     b)  Without docker, open a shell in your virtual environment, switch to
         the ``src`` directory and create the database schema:
@@ -357,6 +359,35 @@ management commands as below.
 7.  Start paperless.
 
 
+Moving back to paperless
+========================
+
+Lets say you migrated to Paperless-ng and used it for a while, but decided that
+you don't like it and want to move back (If you do, send me a mail about what
+part you didn't like!), you can totally do that with a few simple steps.
+
+Paperless-ng modified the database schema slightly, however, these changes can
+be reverted while keeping your current data, so that your current data will
+be compatible with original Paperless.
+
+Execute this:
+
+.. code:: shell-session
+
+    $ cd /path/to/paperless
+    $ docker-compose run --rm webserver migrate documents 0023
+
+Or without docker:
+
+.. code:: shell-session
+
+    $ cd /path/to/paperless/src
+    $ python3 manage.py migrate documents 0023
+
+After that, you need to clear your cookies (Paperless-ng comes with updated
+dependencies that do cookie-processing differently) and probably your cache
+as well.
+
 .. _setup-less_powerful_devices:
 
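To verify that the rollback stopped at the intended migration, Django's built-in migration listing can be checked before and after (a sketch for the docker route):

    $ docker-compose run --rm webserver showmigrations documents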
@@ -372,7 +403,7 @@ configuring some options in paperless can help improve performance immensely:
 *   ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
     to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
     paperless will use 2 workers and 2 threads per worker. This may result in
-    slugish response times during consumption, so you might want to lower these
+    sluggish response times during consumption, so you might want to lower these
     settings (example: 2 workers and 1 thread to always have some computing power
     left for other tasks).
 *   Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
@@ -5,13 +5,13 @@ Usage Overview
 Paperless is an application that manages your personal documents. With
 the help of a document scanner (see :ref:`scanners`), paperless transforms
 your wieldy physical document binders into a searchable archive and
-provices many utilities for finding and managing your documents.
+provides many utilities for finding and managing your documents.
 
 
 Terms and definitions
 #####################
 
-Paperless esentially consists of two different parts for managing your
+Paperless essentially consists of two different parts for managing your
 documents:
 
 * The *consumer* watches a specified folder and adds all documents in that
@@ -30,12 +30,12 @@ Each document has a couple of fields that you can assign to them:
   tag, however, a single document can also have multiple tags. This is not
   possible with folders. The reason folders are not implemented in paperless
   is simply that tags are much more versatile than folders.
-* A *document type* is used to demarkate the type of a document such as letter,
+* A *document type* is used to demarcate the type of a document such as letter,
   bank statement, invoice, contract, etc. It is used to identify what a document
   is about.
 * The *date added* of a document is the date the document was scanned into
   paperless. You cannot and should not change this date.
-* The *date created* of a document is the date the document was intially issued.
+* The *date created* of a document is the date the document was initially issued.
   This can be the date you bought a product, the date you signed a contract, or
   the date a letter was sent to you.
 * The *archive serial number* (short: ASN) of a document is the identifier of
@@ -131,7 +131,7 @@ These are as follows:
 
     With the correct set of rules, you can completely automate your email documents.
     Create rules for every correspondent you receive digital documents from and
-    paperless will read them automatically. The default acion "mark as read" is
+    paperless will read them automatically. The default action "mark as read" is
     pretty tame and will not cause any damage or data loss whatsoever.
 
     You can also setup a special folder in your mail account for paperless and use
@@ -182,7 +182,7 @@ Processing of the physical documents
 ====================================
 
 Keep a physical inbox. Whenever you receive a document that you need to
-archive, put it into your inbox. Regulary, do the following for all documents
+archive, put it into your inbox. Regularly, do the following for all documents
 in your inbox:
 
 1.  For each document, decide if you need to keep the document in physical
@@ -217,18 +217,24 @@ Once you have scanned in a document, proceed in paperless as follows.
 
 1.  If the document has an ASN, assign the ASN to the document.
 2.  Assign a correspondent to the document (i.e., your employer, bank, etc)
-    This isnt strictly necessary but helps in finding a document when you need
+    This isn't strictly necessary but helps in finding a document when you need
     it.
 3.  Assign a document type (i.e., invoice, bank statement, etc) to the document
-    This isnt strictly necessary but helps in finding a document when you need
+    This isn't strictly necessary but helps in finding a document when you need
     it.
 4.  Assign a proper title to the document (the name of an item you bought, the
     subject of the letter, etc)
-5.  Check that the date of the document is corrent. Paperless tries to read
+5.  Check that the date of the document is correct. Paperless tries to read
     the date from the content of the document, but this fails sometimes if the
     OCR is bad or multiple dates appear on the document.
 6.  Remove inbox tags from the documents.
 
+.. hint::
+
+    You can setup manual matching rules for your correspondents and tags and
+    paperless will assign them automatically. After consuming a couple documents,
+    you can even ask paperless to *learn* when to assign tags and correspondents
+    by itself. For details on this feature, see :ref:`advanced-matching`.
+
 Task management
 ===============
 
@@ -29,6 +29,7 @@
 #PAPERLESS_CORS_ALLOWED_HOSTS=localhost:8080,example.com,localhost:8000
 #PAPERLESS_FORCE_SCRIPT_NAME=
 #PAPERLESS_STATIC_URL=/static/
+#PAPERLESS_AUTO_LOGIN_USERNAME=
 
 # Software tweaks
 
@@ -1,5 +1,19 @@
 #!/bin/bash
 
+# Release checklist
+# - wait for travis build.
+# adjust src/paperless/version.py
+# changelog in the documentation
+# adjust versions in docker/hub/*
+# If docker-compose was modified: all compose files are the same.
+
+# Steps:
+# run release script "dev", push
+# if it works: new tag, merge into master
+# on master: make release "lastest", push
+# on master: make release "version-tag", push
+# publish release files
+
 set -e
 
 
@@ -23,7 +23,7 @@ import { TagEditDialogComponent } from './components/manage/tag-list/tag-edit-dialog/tag-edit-dialog.component';
 import { DocumentTypeEditDialogComponent } from './components/manage/document-type-list/document-type-edit-dialog/document-type-edit-dialog.component';
 import { TagComponent } from './components/common/tag/tag.component';
 import { SearchComponent } from './components/search/search.component';
-import { ResultHightlightComponent } from './components/search/result-hightlight/result-hightlight.component';
+import { ResultHighlightComponent } from './components/search/result-highlight/result-highlight.component';
 import { PageHeaderComponent } from './components/common/page-header/page-header.component';
 import { AppFrameComponent } from './components/app-frame/app-frame.component';
 import { ToastsComponent } from './components/common/toasts/toasts.component';
@@ -65,7 +65,7 @@ import { WidgetFrameComponent } from './components/dashboard/widgets/widget-frame/widget-frame.component';
     DocumentTypeEditDialogComponent,
     TagComponent,
     SearchComponent,
-    ResultHightlightComponent,
+    ResultHighlightComponent,
     PageHeaderComponent,
     AppFrameComponent,
     ToastsComponent,
 
@@ -11,7 +11,7 @@
           <h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5>
         </div>
         <p class="card-text">
-          <app-result-hightlight *ngIf="getDetailsAsHighlight()" class="result-content" [highlights]="getDetailsAsHighlight()"></app-result-hightlight>
+          <app-result-highlight *ngIf="getDetailsAsHighlight()" class="result-content" [highlights]="getDetailsAsHighlight()"></app-result-highlight>
           <span *ngIf="getDetailsAsString()" class="result-content">{{getDetailsAsString()}}</span>
         </p>
 
@@ -1,20 +1,20 @@
import { ComponentFixture, TestBed } from '@angular/core/testing';

import { ResultHightlightComponent } from './result-hightlight.component';
import { ResultHighlightComponent } from './result-highlight.component';

describe('ResultHightlightComponent', () => {
  let component: ResultHightlightComponent;
  let fixture: ComponentFixture<ResultHightlightComponent>;
describe('ResultHighlightComponent', () => {
  let component: ResultHighlightComponent;
  let fixture: ComponentFixture<ResultHighlightComponent>;

  beforeEach(async () => {
    await TestBed.configureTestingModule({
      declarations: [ ResultHightlightComponent ]
      declarations: [ ResultHighlightComponent ]
    })
    .compileComponents();
  });

  beforeEach(() => {
    fixture = TestBed.createComponent(ResultHightlightComponent);
    fixture = TestBed.createComponent(ResultHighlightComponent);
    component = fixture.componentInstance;
    fixture.detectChanges();
  });

@@ -2,11 +2,11 @@ import { Component, Input, OnInit } from '@angular/core';
import { SearchHitHighlight } from 'src/app/data/search-result';

@Component({
  selector: 'app-result-hightlight',
  templateUrl: './result-hightlight.component.html',
  styleUrls: ['./result-hightlight.component.scss']
  selector: 'app-result-highlight',
  templateUrl: './result-highlight.component.html',
  styleUrls: ['./result-highlight.component.scss']
})
export class ResultHightlightComponent implements OnInit {
export class ResultHighlightComponent implements OnInit {

  constructor() { }

@@ -1 +1,2 @@
from .checks import changed_password_check
# this is here so that django finds the checks.
from .checks import *
@@ -4,12 +4,13 @@ import os
import pickle
import re

from django.conf import settings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.utils.multiclass import type_of_target

from documents.models import Document, MatchingModel
from paperless import settings


class IncompatibleClassifierVersionError(Exception):

@@ -27,7 +28,7 @@ def preprocess_content(content):

class DocumentClassifier(object):

    FORMAT_VERSION = 5
    FORMAT_VERSION = 6

    def __init__(self):
        # mtime of the model file on disk. used to prevent reloading when

@@ -54,6 +55,8 @@ class DocumentClassifier(object):
                        "Cannot load classifier, incompatible versions.")
                else:
                    if self.classifier_version > 0:
                        # Don't be confused by this check. It's simply here
                        # so that we won't log anything on initial reload.
                        logger.info("Classifier updated on disk, "
                                    "reloading classifier models")
                    self.data_hash = pickle.load(f)

@@ -122,9 +125,14 @@ class DocumentClassifier(object):
        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])

        num_tags = len(labels_tags_unique)

        # subtract 1 since -1 (null) is also part of the classes.
        num_correspondents = len(set(labels_correspondent)) - 1
        num_document_types = len(set(labels_document_type)) - 1

        # union with {-1} accounts for cases where all documents have
        # correspondents and types assigned, so -1 isn't part of labels_x, which
        # it usually is.
        num_correspondents = len(set(labels_correspondent) | {-1}) - 1
        num_document_types = len(set(labels_document_type) | {-1}) - 1

        logging.getLogger(__name__).debug(
            "{} documents, {} tag(s), {} correspondent(s), "
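
A quick illustration of why the union with {-1} matters (the label values here
are made up for the sketch; only the counting logic mirrors the hunk above):

    # -1 stands for "no correspondent assigned".
    labels_with_none = [3, -1, 7]   # one document has no correspondent
    labels_all_set = [3, 7, 7]      # every document has one assigned

    # Plain "- 1" is correct in the first case but undercounts the second,
    # where -1 never occurs in the labels:
    assert len(set(labels_with_none)) - 1 == 2
    assert len(set(labels_all_set)) - 1 == 1    # wrong: there are 2 classes
    # The union makes -1 always part of the set, so "- 1" is always correct:
    assert len(set(labels_all_set) | {-1}) - 1 == 2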
@@ -145,12 +153,23 @@ class DocumentClassifier(object):
        )
        data_vectorized = self.data_vectorizer.fit_transform(data)

        self.tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)

        # Step 3: train the classifiers
        if num_tags > 0:
            logging.getLogger(__name__).debug("Training tags classifier...")

            if num_tags == 1:
                # Special case where only one tag has auto:
                # Fallback to binary classification.
                labels_tags = [label[0] if len(label) == 1 else -1
                               for label in labels_tags]
                self.tags_binarizer = LabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags).ravel()
            else:
                self.tags_binarizer = MultiLabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags)

            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
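
A minimal sketch of the one-tag fallback above (runnable with scikit-learn;
tag id 12 is an arbitrary example value):

    from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

    labels_tags = [[12], [], [12]]  # only one tag is auto-matched

    # Multi-label form: a single indicator column per document.
    MultiLabelBinarizer().fit_transform(labels_tags)   # [[1], [0], [1]]

    # Fallback: collapse to "tag id or -1" and binarize, then flatten so
    # MLPClassifier sees an ordinary binary target vector.
    flat = [l[0] if len(l) == 1 else -1 for l in labels_tags]  # [12, -1, 12]
    LabelBinarizer().fit_transform(flat).ravel()       # [1 0 1]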
@@ -222,6 +241,16 @@ class DocumentClassifier(object):
            X = self.data_vectorizer.transform([preprocess_content(content)])
            y = self.tags_classifier.predict(X)
            tags_ids = self.tags_binarizer.inverse_transform(y)[0]
            return tags_ids
            if type_of_target(y).startswith('multilabel'):
                # the usual case when there are multiple tags.
                return list(tags_ids)
            elif type_of_target(y) == 'binary' and tags_ids != -1:
                # This is for when we have binary classification with only one
                # tag and the result is to assign this tag.
                return [tags_ids]
            else:
                # Usually binary as well with -1 as the result, but we're
                # going to catch everything else here as well.
                return []
        else:
            return []

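For reference, what type_of_target() reports for the two shapes the classifier
can produce here (a sketch with made-up arrays):

    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    type_of_target(np.array([[0, 1, 1]]))  # 'multilabel-indicator'
    type_of_target(np.array([1]))          # 'binary'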

@@ -8,7 +8,6 @@ from django.conf import settings
from django.db import transaction
from django.utils import timezone

from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin

@@ -40,17 +39,6 @@ class Consumer(LoggingMixin):
            raise ConsumerError("Cannot consume {}: It is not a file".format(
                self.path))

    def pre_check_consumption_dir(self):
        if not settings.CONSUMPTION_DIR:
            raise ConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set.")

        if not os.path.isdir(settings.CONSUMPTION_DIR):
            raise ConsumerError(
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))

    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()

@@ -92,7 +80,6 @@ class Consumer(LoggingMixin):
        # Make sure that preconditions for consuming the file are met.

        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
        self.pre_check_duplicate()

@@ -208,10 +195,7 @@ class Consumer(LoggingMixin):
        created = file_info.created or date or timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG
        else:
            storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        with open(self.path, "rb") as f:
            document = Document.objects.create(

@@ -260,8 +244,4 @@ class Consumer(LoggingMixin):
    def _write(self, document, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
                if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
                    write_file.write(read_file.read())
                    return
                self.log("debug", "Encrypting")
                write_file.write(GnuPG.encrypted(read_file))
                write_file.write(read_file.read())

@@ -64,15 +64,15 @@ def get_schema():


def open_index(recreate=False):
    if exists_in(settings.INDEX_DIR) and not recreate:
        return open_dir(settings.INDEX_DIR)
    else:
        # TODO: this is not thread safe. If 2 instances try to create the index
        #  at the same time, this fails. This currently prevents parallel
        #  tests.
        if not os.path.isdir(settings.INDEX_DIR):
            os.makedirs(settings.INDEX_DIR, exist_ok=True)
        return create_in(settings.INDEX_DIR, get_schema())
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR)
    except Exception as e:
        logger.error(f"Error while opening the index: {e}, recreating.")

    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)
    return create_in(settings.INDEX_DIR, get_schema())


def update_document(writer, doc):

@@ -1,9 +1,14 @@
import logging
import uuid

from django.conf import settings


class PaperlessHandler(logging.Handler):
    def emit(self, record):
        if settings.DISABLE_DBHANDLER:
            return

        # We have to do the import here or Django will barf when it tries to
        # load this because the apps aren't loaded at that point
        from .models import Log

@@ -17,16 +17,6 @@ class Command(BaseCommand):

    def add_arguments(self, parser):

        parser.add_argument(
            "from",
            choices=("gpg", "unencrypted"),
            help="The state you want to change your documents from"
        )
        parser.add_argument(
            "to",
            choices=("gpg", "unencrypted"),
            help="The state you want to change your documents to"
        )
        parser.add_argument(
            "--passphrase",
            help="If PAPERLESS_PASSPHRASE isn't set already, you need to "

@@ -50,11 +40,6 @@ class Command(BaseCommand):
        except KeyboardInterrupt:
            return

        if options["from"] == options["to"]:
            raise CommandError(
                'The "from" and "to" values can\'t be the same.'
            )

        passphrase = options["passphrase"] or settings.PASSPHRASE
        if not passphrase:
            raise CommandError(

@@ -62,10 +47,7 @@ class Command(BaseCommand):
                "by declaring it in your environment or your config."
            )

        if options["from"] == "gpg" and options["to"] == "unencrypted":
            self.__gpg_to_unencrypted(passphrase)
        elif options["from"] == "unencrypted" and options["to"] == "gpg":
            self.__unencrypted_to_gpg(passphrase)
        self.__gpg_to_unencrypted(passphrase)

    @staticmethod
    def __gpg_to_unencrypted(passphrase):

@@ -79,42 +61,28 @@ class Command(BaseCommand):
                document).encode('utf-8'), "green"))

            old_paths = [document.source_path, document.thumbnail_path]

            raw_document = GnuPG.decrypted(document.source_file, passphrase)
            raw_thumb = GnuPG.decrypted(document.thumbnail_file, passphrase)

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            ext = os.path.splitext(document.filename)[1]

            if not ext == '.gpg':
                raise CommandError(
                    f"Abort: encrypted file {document.source_path} does not "
                    f"end with .gpg")

            document.filename = os.path.splitext(document.filename)[0]

            with open(document.source_path, "wb") as f:
                f.write(raw_document)

            with open(document.thumbnail_path, "wb") as f:
                f.write(raw_thumb)

            document.save(update_fields=("storage_type",))

            for path in old_paths:
                os.unlink(path)

    @staticmethod
    def __unencrypted_to_gpg(passphrase):

        unencrypted_files = Document.objects.filter(
            storage_type=Document.STORAGE_TYPE_UNENCRYPTED)

        for document in unencrypted_files:

            print(coloured("Encrypting {}".format(document), "green"))

            old_paths = [document.source_path, document.thumbnail_path]
            with open(document.source_path, "rb") as raw_document:
                with open(document.thumbnail_path, "rb") as raw_thumb:
                    document.storage_type = Document.STORAGE_TYPE_GPG
                    with open(document.source_path, "wb") as f:
                        f.write(GnuPG.encrypted(raw_document, passphrase))
                    with open(document.thumbnail_path, "wb") as f:
                        f.write(GnuPG.encrypted(raw_thumb, passphrase))

            document.save(update_fields=("storage_type",))
            document.save(update_fields=("storage_type", "filename"))

            for path in old_paths:
                os.unlink(path)

@@ -1,11 +1,11 @@
import logging
import os
from time import sleep

from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

try:

@@ -13,25 +13,54 @@ try:
except ImportError:
    INotify = flags = None

logger = logging.getLogger(__name__)


def _consume(file):
    try:
        if os.path.isfile(file):
            async_task("documents.tasks.consume_file",
                       file,
                       task_name=os.path.basename(file)[:100])
        else:
            logger.debug(
                f"Not consuming file {file}: File has moved.")

    except Exception as e:
        # Catch all so that the consumer won't crash.
        # This is also what the test case is listening for to check for
        # errors.
        logger.error(
            "Error while consuming document: {}".format(e))


def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
    mtime = -1
    current_try = 0
    while current_try < num_tries:
        try:
            new_mtime = os.stat(file).st_mtime
        except FileNotFoundError:
            logger.debug(f"File {file} moved while waiting for it to remain "
                         f"unmodified.")
            return
        if new_mtime == mtime:
            _consume(file)
            return
        mtime = new_mtime
        sleep(wait_time)
        current_try += 1

    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")


class Handler(FileSystemEventHandler):

    def _consume(self, file):
        if os.path.isfile(file):
            try:
                async_task("documents.tasks.consume_file",
                           file,
                           task_name=os.path.basename(file)[:100])
            except Exception as e:
                # Catch all so that the consumer won't crash.
                logging.getLogger(__name__).error(
                    "Error while consuming document: {}".format(e))

    def on_created(self, event):
        self._consume(event.src_path)
        _consume_wait_unmodified(event.src_path)

    def on_moved(self, event):
        self._consume(event.src_path)
        _consume_wait_unmodified(event.dest_path)


class Command(BaseCommand):

@@ -40,12 +69,15 @@ class Command(BaseCommand):
    consumption directory.
    """

    # This is here primarily for the tests and is irrelevant in production.
    stop_flag = False

    def __init__(self, *args, **kwargs):

        self.verbosity = 0
        self.logger = logging.getLogger(__name__)

        BaseCommand.__init__(self, *args, **kwargs)
        self.observer = None

    def add_arguments(self, parser):
        parser.add_argument(

@@ -54,38 +86,66 @@ class Command(BaseCommand):
            nargs="?",
            help="The consumption directory."
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]
        directory = options["directory"]

        logging.getLogger(__name__).info(
            "Starting document consumer at {}".format(
                directory
            )
        parser.add_argument(
            "--oneshot",
            action="store_true",
            help="Run only once."
        )

        # Consume all files as this is not done initially by the watchdog
        for entry in os.scandir(directory):
            if entry.is_file():
                async_task("documents.tasks.consume_file",
                           entry.path,
                           task_name=os.path.basename(entry.path)[:100])
    def handle(self, *args, **options):
        directory = options["directory"]

        # Start the watchdog. Woof!
        if settings.CONSUMER_POLLING > 0:
            logging.getLogger(__name__).info(
                "Using polling instead of file system notifications.")
            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        if not directory:
            raise CommandError(
                "CONSUMPTION_DIR does not appear to be set."
            )

        if not os.path.isdir(directory):
            raise CommandError(
                f"Consumption directory {directory} does not exist")

        for entry in os.scandir(directory):
            _consume(entry.path)

        if options["oneshot"]:
            return

        if settings.CONSUMER_POLLING == 0 and INotify:
            self.handle_inotify(directory)
        else:
            observer = Observer()
        event_handler = Handler()
        observer.schedule(event_handler, directory, recursive=True)
        observer.start()
            self.handle_polling(directory)

        logger.debug("Consumer exiting.")

    def handle_polling(self, directory):
        logging.getLogger(__name__).info(
            f"Polling directory for changes: {directory}")
        self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        self.observer.schedule(Handler(), directory, recursive=False)
        self.observer.start()
        try:
            while observer.is_alive():
                observer.join(1)
            while self.observer.is_alive():
                self.observer.join(1)
                if self.stop_flag:
                    self.observer.stop()
        except KeyboardInterrupt:
            observer.stop()
        observer.join()
            self.observer.stop()
        self.observer.join()

    def handle_inotify(self, directory):
        logging.getLogger(__name__).info(
            f"Using inotify to watch directory for changes: {directory}")

        inotify = INotify()
        descriptor = inotify.add_watch(
            directory, flags.CLOSE_WRITE | flags.MOVED_TO)
        try:
            while not self.stop_flag:
                for event in inotify.read(timeout=1000, read_delay=1000):
                    file = os.path.join(directory, event.name)
                    _consume(file)
        except KeyboardInterrupt:
            pass

        inotify.rm_watch(descriptor)
        inotify.close()
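
The same inotify pattern in isolation (a sketch, not part of the commit; it
assumes inotify_simple is installed and a /tmp/consume directory exists): only
react to files that were fully written (CLOSE_WRITE) or moved into place
(MOVED_TO), which avoids consuming half-written files.

    import os
    from inotify_simple import INotify, flags

    inotify = INotify()
    wd = inotify.add_watch("/tmp/consume", flags.CLOSE_WRITE | flags.MOVED_TO)
    try:
        while True:
            # read() blocks up to timeout (ms); read_delay batches rapid events.
            for event in inotify.read(timeout=1000, read_delay=1000):
                print("ready:", os.path.join("/tmp/consume", event.name))
    except KeyboardInterrupt:
        inotify.rm_watch(wd)
        inotify.close()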

@@ -22,13 +22,6 @@ class Command(Renderable, BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument("target")
        parser.add_argument(
            "--legacy",
            action="store_true",
            help="Don't try to export all of the document data, just dump the "
                 "original document files out in a format that makes "
                 "re-consuming them easy."
        )

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

@@ -44,10 +37,7 @@ class Command(Renderable, BaseCommand):
        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if options["legacy"]:
            self.dump_legacy()
        else:
            self.dump()
        self.dump()

    def dump(self):

@@ -102,33 +92,3 @@ class Command(Renderable, BaseCommand):

        with open(os.path.join(self.target, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)

    def dump_legacy(self):

        for document in Document.objects.all():

            target = os.path.join(
                self.target, self._get_legacy_file_name(document))

            print("Exporting: {}".format(target))

            with open(target, "wb") as f:
                f.write(GnuPG.decrypted(document.source_file))
                t = int(time.mktime(document.created.timetuple()))
                os.utime(target, times=(t, t))

    @staticmethod
    def _get_legacy_file_name(doc):

        if not doc.correspondent and not doc.title:
            return os.path.basename(doc.source_path)

        created = doc.created.strftime("%Y%m%d%H%M%SZ")
        tags = ",".join([t.slug for t in doc.tags.all()])

        if tags:
            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)

        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)

@@ -82,8 +82,6 @@ class Command(Renderable, BaseCommand):
    def _import_files_from_manifest(self):

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG

        for record in self.manifest:

@@ -105,23 +103,8 @@ class Command(Renderable, BaseCommand):

            create_source_path_directory(document.source_path)

            if settings.PASSPHRASE:

                with open(document_path, "rb") as unencrypted:
                    with open(document.source_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            doc_file, document.source_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))

                with open(thumbnail_path, "rb") as unencrypted:
                    with open(document.thumbnail_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            thumb_file, document.thumbnail_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))

            else:
                print(f"Moving {document_path} to {document.source_path}")
                shutil.copy(document_path, document.source_path)
                shutil.copy(thumbnail_path, document.thumbnail_path)
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)

            document.save()

@@ -5,23 +5,6 @@ from django.db import migrations, models
import django.db.models.deletion


def make_index(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()
    print()
    try:
        print("  --> Creating document index...")
        from whoosh.writing import AsyncWriter
        from documents import index
        ix = index.open_index(recreate=True)
        with AsyncWriter(ix) as writer:
            for document in documents:
                index.update_document(writer, document)
    except ImportError:
        # index may not be relevant anymore
        print("  --> Cannot create document index.")


def logs_set_default_group(apps, schema_editor):
    Log = apps.get_model('documents', 'Log')
    for log in Log.objects.all():

@@ -99,8 +82,4 @@ class Migration(migrations.Migration):
            code=django.db.migrations.operations.special.RunPython.noop,
            reverse_code=logs_set_default_group
        ),
        migrations.RunPython(
            code=make_index,
            reverse_code=django.db.migrations.operations.special.RunPython.noop,
        ),
    ]

26  src/documents/migrations/1004_sanity_check_schedule.py  Normal file
@@ -0,0 +1,26 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53

from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule


def add_schedules(apps, schema_editor):
    schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)


def remove_schedules(apps, schema_editor):
    Schedule.objects.filter(func='documents.tasks.sanity_check').delete()


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1003_mime_types'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(add_schedules, remove_schedules)
    ]
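
Roughly what the schedule() call above persists, expressed as the equivalent
ORM call (a sketch; django_q must be installed and migrated, and schedule()
may set additional defaults on the row):

    from django_q.models import Schedule

    Schedule.objects.create(
        func='documents.tasks.sanity_check',
        name='Perform sanity check',
        schedule_type=Schedule.WEEKLY,
    )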

@@ -230,6 +230,7 @@ class Document(models.Model):

    @property
    def file_type(self):
        # TODO: this is not stable across python versions
        return mimetypes.guess_extension(str(self.mime_type))

    @property
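
The TODO above refers to mimetypes picking different extensions when several
map to one type (illustrative; the exact result depends on the interpreter
version and the OS mime registry):

    import mimetypes
    mimetypes.guess_extension("image/jpeg")  # '.jpg' on some versions, '.jpe' on others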

94  src/documents/sanity_checker.py  Normal file
@@ -0,0 +1,94 @@
import hashlib
import os

from django.conf import settings

from documents.models import Document


class SanityMessage:
    message = None


class SanityWarning(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"Warning: {self.message}"


class SanityError(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"ERROR: {self.message}"


class SanityFailedError(Exception):

    def __init__(self, messages):
        self.messages = messages

    def __str__(self):
        message_string = "\n".join([str(m) for m in self.messages])
        return (
            f"The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")


def check_sanity():
    messages = []

    present_files = []
    for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
        for f in files:
            present_files.append(os.path.normpath(os.path.join(root, f)))

    for doc in Document.objects.all():
        # Check thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.thumbnail_path))
            try:
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))

        # Check document
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.source_path))
            checksum = None
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))

            if checksum and not checksum == doc.checksum:
                messages.append(SanityError(
                    f"Checksum mismatch of document {doc.pk}. "
                    f"Stored: {doc.checksum}, actual: {checksum}."
                ))

        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))

    for extra_file in present_files:
        messages.append(SanityWarning(
            f"Orphaned file in media dir: {extra_file}"
        ))

    return messages
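
How the checker is consumed (this mirrors the sanity_check task added to
documents/tasks.py later in this commit):

    from documents import sanity_checker
    from documents.sanity_checker import SanityFailedError

    messages = sanity_checker.check_sanity()
    if messages:
        raise SanityFailedError(messages)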

@@ -93,14 +93,11 @@ class DocumentSerializer(serializers.ModelSerializer):
            "document_type_id",
            "title",
            "content",
            "mime_type",
            "tags",
            "tags_id",
            "checksum",
            "created",
            "modified",
            "added",
            "file_name",
            "archive_serial_number"
        )

@@ -3,11 +3,12 @@ import logging
from django.conf import settings
from whoosh.writing import AsyncWriter

from documents import index
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError


def index_optimize():

@@ -74,3 +75,12 @@ def consume_file(path,
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")


def sanity_check():
    messages = sanity_checker.check_sanity()

    if len(messages) > 0:
        raise SanityFailedError(messages)
    else:
        return "No issues detected."

BIN  src/documents/tests/samples/originals/0000001.pdf  Normal file (binary file not shown)
BIN  src/documents/tests/samples/originals/0000002.pdf.gpg  Normal file (binary file not shown)
BIN  src/documents/tests/samples/simple.pdf  Normal file (binary file not shown)
BIN  src/documents/tests/samples/simple.zip  Normal file (binary file not shown)
BIN  src/documents/tests/samples/thumb/0000001.png  Normal file (after: 7.7 KiB)
BIN  src/documents/tests/samples/thumb/0000002.png.gpg  Normal file (binary file not shown)
@@ -1,40 +1,24 @@
import os
import shutil
import tempfile
from unittest import mock

from django.contrib.auth.models import User
from django.test import override_settings
from pathvalidate import ValidationError
from rest_framework.test import APITestCase

from documents import index
from documents.models import Document, Correspondent, DocumentType, Tag
from documents.tests.utils import DirectoriesMixin


class DocumentApiTest(APITestCase):
class DocumentApiTest(DirectoriesMixin, APITestCase):

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")

        os.makedirs(self.originals_dir, exist_ok=True)
        os.makedirs(self.thumbnail_dir, exist_ok=True)

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumbnail_dir
        ).enable()
        super(DocumentApiTest, self).setUp()

        user = User.objects.create_superuser(username="temp_admin")
        self.client.force_login(user=user)

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)

    def testDocuments(self):

        response = self.client.get("/api/documents/").data
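
The DirectoriesMixin used here lives in documents/tests/utils.py and is not
part of this commit's visible diff; a sketch of what such a mixin typically
does (the attribute names match how the tests below use it, the body itself is
an assumption):

    import os
    import shutil
    import tempfile
    from types import SimpleNamespace

    from django.test import override_settings

    class DirectoriesMixin:
        def setUp(self):
            super().setUp()
            media = tempfile.mkdtemp()
            self.dirs = SimpleNamespace(
                media_dir=media,
                originals_dir=os.path.join(media, "documents", "originals"),
                thumbnail_dir=os.path.join(media, "documents", "thumbnails"),
            )
            os.makedirs(self.dirs.originals_dir, exist_ok=True)
            os.makedirs(self.dirs.thumbnail_dir, exist_ok=True)
            # Point Django at the temp dirs for the duration of the test.
            self._overrides = override_settings(
                MEDIA_ROOT=media,
                ORIGINALS_DIR=self.dirs.originals_dir,
                THUMBNAIL_DIR=self.dirs.thumbnail_dir,
            )
            self._overrides.enable()

        def tearDown(self):
            super().tearDown()
            self._overrides.disable()
            shutil.rmtree(self.dirs.media_dir, ignore_errors=True)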

@@ -87,7 +71,7 @@ class DocumentApiTest(APITestCase):

    def test_document_actions(self):

        _, filename = tempfile.mkstemp(dir=self.originals_dir)
        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)

        content = b"This is a test"
        content_thumbnail = b"thumbnail content"

@@ -97,7 +81,7 @@ class DocumentApiTest(APITestCase):

        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")

        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
        with open(os.path.join(self.dirs.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))

@@ -179,6 +163,109 @@ class DocumentApiTest(APITestCase):
        results = response.data['results']
        self.assertEqual(len(results), 3)

    def test_search_no_query(self):
        response = self.client.get("/api/search/")
        results = response.data['results']

        self.assertEqual(len(results), 0)

    def test_search(self):
        d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
        d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
        d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
        with index.open_index(False).writer() as writer:
            # Note to future self: there is a reason we don't use a model signal handler to update the index: some operations edit many documents at once
            # (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer.
            # That's why we can't open the writer in a model on_save handler or something.
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)
        response = self.client.get("/api/search/?query=bank")
        results = response.data['results']
        self.assertEqual(response.data['count'], 3)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 3)

        response = self.client.get("/api/search/?query=september")
        results = response.data['results']
        self.assertEqual(response.data['count'], 1)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 1)

        response = self.client.get("/api/search/?query=statement")
        results = response.data['results']
        self.assertEqual(response.data['count'], 2)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 2)

        response = self.client.get("/api/search/?query=sfegdfg")
        results = response.data['results']
        self.assertEqual(response.data['count'], 0)
        self.assertEqual(response.data['page'], 0)
        self.assertEqual(response.data['page_count'], 0)
        self.assertEqual(len(results), 0)

    def test_search_multi_page(self):
        with index.open_index(False).writer() as writer:
            for i in range(55):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
                index.update_document(writer, doc)

        # This is here so that we test that no document gets returned twice (might happen if the paging is not working)
        seen_ids = []

        for i in range(1, 6):
            response = self.client.get(f"/api/search/?query=content&page={i}")
            results = response.data['results']
            self.assertEqual(response.data['count'], 55)
            self.assertEqual(response.data['page'], i)
            self.assertEqual(response.data['page_count'], 6)
            self.assertEqual(len(results), 10)

            for result in results:
                self.assertNotIn(result['id'], seen_ids)
                seen_ids.append(result['id'])

        response = self.client.get(f"/api/search/?query=content&page=6")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)

        for result in results:
            self.assertNotIn(result['id'], seen_ids)
            seen_ids.append(result['id'])

        response = self.client.get(f"/api/search/?query=content&page=7")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)

    def test_search_invalid_page(self):
        with index.open_index(False).writer() as writer:
            for i in range(15):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
                index.update_document(writer, doc)

        first_page = self.client.get(f"/api/search/?query=content&page=1").data
        second_page = self.client.get(f"/api/search/?query=content&page=2").data
        should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data
        should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data
        should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data
        should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data

        self.assertDictEqual(first_page, should_be_first_page_1)
        self.assertDictEqual(first_page, should_be_first_page_2)
        self.assertDictEqual(first_page, should_be_first_page_3)
        self.assertDictEqual(first_page, should_be_first_page_4)
        self.assertNotEqual(len(first_page['results']), len(second_page['results']))

    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
@ -215,3 +302,42 @@ class DocumentApiTest(APITestCase):
 | 
			
		||||
        self.assertEqual(response.status_code, 200)
 | 
			
		||||
        self.assertEqual(response.data['documents_total'], 3)
 | 
			
		||||
        self.assertEqual(response.data['documents_inbox'], 1)
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.forms.async_task")
 | 
			
		||||
    def test_upload(self, m):
 | 
			
		||||
 | 
			
		||||
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
 | 
			
		||||
            response = self.client.post("/api/documents/post_document/", {"document": f})
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(response.status_code, 200)
 | 
			
		||||
 | 
			
		||||
        m.assert_called_once()
 | 
			
		||||
 | 
			
		||||
        args, kwargs = m.call_args
 | 
			
		||||
        self.assertEqual(kwargs['override_filename'], "simple.pdf")
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.forms.async_task")
 | 
			
		||||
    def test_upload_invalid_form(self, m):
 | 
			
		||||
 | 
			
		||||
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
 | 
			
		||||
            response = self.client.post("/api/documents/post_document/", {"documenst": f})
 | 
			
		||||
        self.assertEqual(response.status_code, 400)
 | 
			
		||||
        m.assert_not_called()
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.forms.async_task")
 | 
			
		||||
    def test_upload_invalid_file(self, m):
 | 
			
		||||
 | 
			
		||||
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f:
 | 
			
		||||
            response = self.client.post("/api/documents/post_document/", {"document": f})
 | 
			
		||||
        self.assertEqual(response.status_code, 400)
 | 
			
		||||
        m.assert_not_called()
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.forms.async_task")
 | 
			
		||||
    @mock.patch("documents.forms.validate_filename")
 | 
			
		||||
    def test_upload_invalid_filename(self, validate_filename, async_task):
 | 
			
		||||
        validate_filename.side_effect = ValidationError()
 | 
			
		||||
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
 | 
			
		||||
            response = self.client.post("/api/documents/post_document/", {"document": f})
 | 
			
		||||
        self.assertEqual(response.status_code, 400)
 | 
			
		||||
 | 
			
		||||
        async_task.assert_not_called()
 | 
			
		||||
 | 
			
		||||
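The page=7 request above still reports page 6 because the search endpoint clamps out-of-range pages to the last available page. A minimal sketch of that clamping, assuming 10 hits per page as the test counts imply (helper name hypothetical, not part of this diff):

import math

def clamp_page(page, hit_count, page_size=10):
    # 55 hits at 10 per page -> 6 pages; anything past the end maps to 6,
    # anything below 1 maps to 1 (mirroring the SearchView guard further down).
    page_count = max(1, math.ceil(hit_count / page_size))
    return min(max(page, 1), page_count)

assert clamp_page(7, 55) == 6
assert clamp_page(-7868, 55) == 1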
@@ -1,24 +1,29 @@
import tempfile
from time import sleep
from unittest import mock

from django.test import TestCase, override_settings

from documents.classifier import DocumentClassifier
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestClassifier(TestCase):
class TestClassifier(DirectoriesMixin, TestCase):

    def setUp(self):

        super(TestClassifier, self).setUp()
        self.classifier = DocumentClassifier()

    def generate_test_data(self):
        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")
        self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO)
        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
        self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO)

        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
@@ -59,8 +64,8 @@ class TestClassifier(TestCase):
        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
        self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk])
        self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk])
        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)

@@ -71,6 +76,44 @@ class TestClassifier(TestCase):
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

    def testVersionIncreased(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()

        current_ver = DocumentClassifier.FORMAT_VERSION
        with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
            # assure that we won't load old classifiers.
            self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload)

            self.classifier.save_classifier()

            # assure that we can load the classifier after saving it.
            classifier2.reload()

    def testReload(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()
        classifier2.reload()
        v1 = classifier2.classifier_version

        # change the classifier after some time.
        sleep(1)
        self.classifier.save_classifier()

        classifier2.reload()
        v2 = classifier2.classifier_version
        self.assertNotEqual(v1, v2)

    @override_settings(DATA_DIR=tempfile.mkdtemp())
    def testSaveClassifier(self):

@@ -83,3 +126,112 @@ class TestClassifier(TestCase):
        new_classifier = DocumentClassifier()
        new_classifier.reload()
        self.assertFalse(new_classifier.train())

    def test_one_correspondent_predict(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)

    def test_one_correspondent_predict_manydocs(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)
        self.assertIsNone(self.classifier.predict_correspondent(doc2.content))

    def test_one_type_predict(self):
        dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
                                            checksum="A", document_type=dt)

        self.classifier.train()
        self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)

    def test_one_type_predict_manydocs(self):
        dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
                                            checksum="A", document_type=dt)

        doc2 = Document.objects.create(title="doc1", content="this is a document from c2",
                                            checksum="B")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)
        self.assertIsNone(self.classifier.predict_document_type(doc2.content))

    def test_one_tag_predict(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")

        doc1.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])

    def test_one_tag_predict_unassigned(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")

        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [])

    def test_two_tags_predict_singledoc(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)

        doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")

        doc4.tags.add(t1)
        doc4.tags.add(t2)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk])

    def test_two_tags_predict(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B")
        doc3 = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C")
        doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")

        doc1.tags.add(t1)
        doc2.tags.add(t2)

        doc4.tags.add(t1)
        doc4.tags.add(t2)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [t2.pk])
        self.assertListEqual(self.classifier.predict_tags(doc3.content), [])
        self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk])

    def test_one_tag_predict_multi(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")

        doc1.tags.add(t1)
        doc2.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [t1.pk])

    def test_one_tag_predict_multi_2(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")

        doc1.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
@@ -1,12 +1,12 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock

from django.test import TestCase, override_settings

from .utils import DirectoriesMixin
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
@@ -408,26 +408,16 @@ def fake_magic_from_file(file, mime=False):


@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase):
class TestConsumer(DirectoriesMixin, TestCase):

    def make_dummy_parser(self, path, logging_group):
        return DummyParser(path, logging_group, self.scratch_dir)
        return DummyParser(path, logging_group, self.dirs.scratch_dir)

    def make_faulty_parser(self, path, logging_group):
        return FaultyParser(path, logging_group, self.scratch_dir)
        return FaultyParser(path, logging_group, self.dirs.scratch_dir)

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        ).enable()
        super(TestConsumer, self).setUp()

        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()
@@ -441,13 +431,8 @@ class TestConsumer(TestCase):

        self.consumer = Consumer()

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
        return f

    def testNormalOperation(self):
@@ -516,26 +501,6 @@ class TestConsumer(TestCase):

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Consumption directory asd does not exist")
            return

        self.fail("Should throw exception")

    def testDuplicates(self):
        self.consumer.try_consume_file(self.get_test_file())


@@ -2,7 +2,7 @@ import logging
import uuid
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from ..models import Log

@@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase):
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")

    @override_settings(DISABLE_DBHANDLER=False)
    def test_that_it_saves_at_all(self):

        kw = {"group": uuid.uuid4()}
@@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase):
            self.logger.critical("This is a critical message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 5)

    @override_settings(DISABLE_DBHANDLER=False)
    def test_groups(self):

        kw1 = {"group": uuid.uuid4()}

210  src/documents/tests/test_management_consumer.py  Normal file
@@ -0,0 +1,210 @@
import filecmp
import os
import shutil
from threading import Thread
from time import sleep
from unittest import mock

from django.conf import settings
from django.core.management import call_command, CommandError
from django.test import override_settings, TestCase

from documents.consumer import ConsumerError
from documents.management.commands import document_consumer
from documents.tests.utils import DirectoriesMixin


class ConsumerThread(Thread):

    def __init__(self):
        super().__init__()
        self.cmd = document_consumer.Command()

    def run(self) -> None:
        self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False)

    def stop(self):
        # Consumer checks this every second.
        self.cmd.stop_flag = True


def chunked(size, source):
    for i in range(0, len(source), size):
        yield source[i:i+size]


class TestConsumer(DirectoriesMixin, TestCase):

    sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")

    def setUp(self) -> None:
        super(TestConsumer, self).setUp()
        self.t = None
        patcher = mock.patch("documents.management.commands.document_consumer.async_task")
        self.task_mock = patcher.start()
        self.addCleanup(patcher.stop)

    def t_start(self):
        self.t = ConsumerThread()
        self.t.start()
        # give the consumer some time to do initial work
        sleep(1)

    def tearDown(self) -> None:
        if self.t:
            # set the stop flag
            self.t.stop()
            # wait for the consumer to exit.
            self.t.join()

        super(TestConsumer, self).tearDown()

    def wait_for_task_mock_call(self):
        n = 0
        while n < 100:
            if self.task_mock.call_count > 0:
                # give task_mock some time to finish and raise errors
                sleep(1)
                return
            n += 1
            sleep(0.1)
        self.fail("async_task was never called")

    # A bogus async_task that will simply check the file for
    # completeness and raise an exception otherwise.
    def bogus_task(self, func, filename, **kwargs):
        eq = filecmp.cmp(filename, self.sample_file, shallow=False)
        if not eq:
            print("Consumed an INVALID file.")
            raise ConsumerError("Incomplete File READ FAILED")
        else:
            print("Consumed a perfectly valid file.")

    def slow_write_file(self, target, incomplete=False):
        with open(self.sample_file, 'rb') as f:
            pdf_bytes = f.read()

        if incomplete:
            pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100]

        with open(target, 'wb') as f:
            # this will take 2 seconds, since the file is about 20k.
            print("Start writing file.")
            for b in chunked(1000, pdf_bytes):
                f.write(b)
                sleep(0.1)
            print("file completed.")

    def test_consume_file(self):
        self.t_start()

        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

    @override_settings(CONSUMER_POLLING=1)
    def test_consume_file_polling(self):
        self.test_consume_file()

    def test_consume_existing_file(self):
        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)

        self.t_start()
        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

    @override_settings(CONSUMER_POLLING=1)
    def test_consume_existing_file_polling(self):
        self.test_consume_existing_file()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_pdf(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")

        self.slow_write_file(fname)

        self.wait_for_task_mock_call()

        error_logger.assert_not_called()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)

    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_pdf_polling(self):
        self.test_slow_write_pdf()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_and_move(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.~df")
        fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf")

        self.slow_write_file(fname)
        shutil.move(fname, fname2)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname2)

        error_logger.assert_not_called()

    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_and_move_polling(self):
        self.test_slow_write_and_move()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_incomplete(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        self.slow_write_file(fname, incomplete=True)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)

        # assert that we have an error logged with this invalid file.
        error_logger.assert_called_once()

    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_incomplete_polling(self):
        self.test_slow_write_incomplete()

    @override_settings(CONSUMPTION_DIR="does_not_exist")
    def test_consumption_directory_invalid(self):

        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')

    @override_settings(CONSUMPTION_DIR="")
    def test_consumption_directory_unset(self):

        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
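The slow-write tests above only pass if the command dispatches a file to async_task once it has been fully written. A minimal sketch of the stability check they exercise (hypothetical helper; the real logic lives in document_consumer.Command, which also polls the stop_flag that ConsumerThread.stop() sets):

import os
from time import sleep

def wait_until_stable(path, interval=0.5):
    # Consider the file complete once its size stops changing
    # between two consecutive polls.
    last_size = -1
    while True:
        size = os.path.getsize(path)
        if size == last_size:
            return
        last_size = size
        sleep(interval)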
56  src/documents/tests/test_management_decrypt.py  Normal file
@@ -0,0 +1,56 @@
import hashlib
import json
import os
import shutil
import tempfile
from unittest import mock

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent


class TestDecryptDocuments(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test"
    )
    @mock.patch("documents.management.commands.decrypt_documents.input")
    def test_decrypt(self, m):

        media_dir = tempfile.mkdtemp()
        originals_dir = os.path.join(media_dir, "documents", "originals")
        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
        os.makedirs(originals_dir, exist_ok=True)
        os.makedirs(thumb_dir, exist_ok=True)

        override_settings(
            ORIGINALS_DIR=originals_dir,
            THUMBNAIL_DIR=thumb_dir,
            PASSPHRASE="test"
        ).enable()

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "thumb", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg"))

        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        call_command('decrypt_documents')

        doc = Document.objects.get(id=2)

        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(doc.filename, "0000002.pdf")
        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png")))
        self.assertTrue(os.path.isfile(doc.thumbnail_path))

        with doc.source_file as f:
            checksum = hashlib.md5(f.read()).hexdigest()
            self.assertEqual(checksum, doc.checksum)

53  src/documents/tests/test_management_exporter.py  Normal file
@@ -0,0 +1,53 @@
import hashlib
import json
import os
import tempfile

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent


class TestExporter(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test"
    )
    def test_exporter(self):
        file = os.path.join(os.path.dirname(__file__), "samples", "originals", "0000001.pdf")

        with open(file, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()

        Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
        Correspondent.objects.create(name="c")

        target = tempfile.mkdtemp()

        call_command('document_exporter', target)

        with open(os.path.join(target, "manifest.json")) as f:
            manifest = json.load(f)

        self.assertEqual(len(manifest), 5)

        for element in manifest:
            if element['model'] == 'documents.document':
                fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME])
                self.assertTrue(os.path.exists(fname))
                self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))

                with open(fname, "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")

        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
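For reference, the shape of one manifest element the loop above assumes (keys given via the document_exporter constants; all values illustrative, not taken from this diff):

# {"model": "documents.document",
#  "fields": {"checksum": "<md5 of the exported file>", ...},
#  document_exporter.EXPORTER_FILE_NAME: "0000001.pdf",
#  document_exporter.EXPORTER_THUMBNAIL_NAME: "0000001-thumb.png"}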
58  src/documents/tests/test_management_retagger.py  Normal file
@@ -0,0 +1,58 @@
from django.core.management import call_command
from django.test import TestCase

from documents.models import Document, Tag, Correspondent, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestRetagger(DirectoriesMixin, TestCase):

    def make_models(self):
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document")
        self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")

        self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
        self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)

        self.correspondent_first = Correspondent.objects.create(
            name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
        self.correspondent_second = Correspondent.objects.create(
            name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY)

        self.doctype_first = DocumentType.objects.create(
            name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY)
        self.doctype_second = DocumentType.objects.create(
            name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)

    def get_updated_docs(self):
        return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C")

    def setUp(self) -> None:
        super(TestRetagger, self).setUp()
        self.make_models()

    def test_add_tags(self):
        call_command('document_retagger', '--tags')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.tags.count(), 1)
        self.assertEqual(d_second.tags.count(), 1)
        self.assertEqual(d_unrelated.tags.count(), 0)

        self.assertEqual(d_first.tags.first(), self.tag_first)
        self.assertEqual(d_second.tags.first(), self.tag_second)

    def test_add_type(self):
        call_command('document_retagger', '--document_type')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.document_type, self.doctype_first)
        self.assertEqual(d_second.document_type, self.doctype_second)

    def test_add_correspondent(self):
        call_command('document_retagger', '--correspondent')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.correspondent, self.correspondent_first)
        self.assertEqual(d_second.correspondent, self.correspondent_second)
@@ -1,3 +1,5 @@
import shutil
import tempfile
from random import randint

from django.contrib.admin.models import LogEntry
@@ -215,6 +217,13 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
        self.doc_contains = Document.objects.create(
            content="I contain the keyword.", mime_type="application/pdf")

        self.index_dir = tempfile.mkdtemp()
        # TODO: we should not need the index here.
        override_settings(INDEX_DIR=self.index_dir).enable()

    def tearDown(self) -> None:
        shutil.rmtree(self.index_dir, ignore_errors=True)

    def test_tag_applied_any(self):
        t1 = Tag.objects.create(
            name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)

59  src/documents/tests/utils.py  Normal file
@@ -0,0 +1,59 @@
import os
import shutil
import tempfile
from collections import namedtuple

from django.test import override_settings


def setup_directories():

    dirs = namedtuple("Dirs", ())

    dirs.data_dir = tempfile.mkdtemp()
    dirs.scratch_dir = tempfile.mkdtemp()
    dirs.media_dir = tempfile.mkdtemp()
    dirs.consumption_dir = tempfile.mkdtemp()
    dirs.index_dir = os.path.join(dirs.data_dir, "index")
    dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
    dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")

    os.makedirs(dirs.index_dir, exist_ok=True)
    os.makedirs(dirs.originals_dir, exist_ok=True)
    os.makedirs(dirs.thumbnail_dir, exist_ok=True)

    override_settings(
        DATA_DIR=dirs.data_dir,
        SCRATCH_DIR=dirs.scratch_dir,
        MEDIA_ROOT=dirs.media_dir,
        ORIGINALS_DIR=dirs.originals_dir,
        THUMBNAIL_DIR=dirs.thumbnail_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        INDEX_DIR=dirs.index_dir,
        MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
    ).enable()

    return dirs


def remove_dirs(dirs):
    shutil.rmtree(dirs.media_dir, ignore_errors=True)
    shutil.rmtree(dirs.data_dir, ignore_errors=True)
    shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
    shutil.rmtree(dirs.consumption_dir, ignore_errors=True)


class DirectoriesMixin:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dirs = None

    def setUp(self) -> None:
        self.dirs = setup_directories()
        super(DirectoriesMixin, self).setUp()

    def tearDown(self) -> None:
        super(DirectoriesMixin, self).tearDown()
        remove_dirs(self.dirs)
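As a usage sketch, any TestCase can opt into this temporary directory layout by listing the mixin first (hypothetical test class, not part of this diff):

import os

from django.test import TestCase

from documents.tests.utils import DirectoriesMixin


class MyTest(DirectoriesMixin, TestCase):

    def test_dirs_exist(self):
        # setup_directories() ran in setUp(), so the per-test paths
        # exist and the matching settings are already overridden.
        self.assertTrue(os.path.isdir(self.dirs.scratch_dir))
        self.assertTrue(os.path.isdir(self.dirs.consumption_dir))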
@@ -149,13 +149,25 @@ class DocumentViewSet(RetrieveModelMixin,
        else:
            return HttpResponseBadRequest(str(form.errors))

    @action(methods=['get'], detail=True)
    def metadata(self, request, pk=None):
        try:
            doc = Document.objects.get(pk=pk)
            return Response({
                "paperless__checksum": doc.checksum,
                "paperless__mime_type": doc.mime_type,
                "paperless__filename": doc.filename,
            })
        except Document.DoesNotExist:
            raise Http404()

    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
            response = self.file_response(pk, "inline")
            return response
        except FileNotFoundError:
            raise Http404("Document source file does not exist")
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()

    @action(methods=['get'], detail=True)
    @cache_control(public=False, max_age=315360000)
@@ -163,15 +175,15 @@ class DocumentViewSet(RetrieveModelMixin,
        try:
            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
                                content_type='image/png')
        except FileNotFoundError:
            raise Http404("Document thumbnail does not exist")
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()

    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        try:
            return self.file_response(pk, "attachment")
        except FileNotFoundError:
            raise Http404("Document source file does not exist")
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()


class LogViewSet(ReadOnlyModelViewSet):
@@ -190,7 +202,9 @@ class SearchView(APIView):

    permission_classes = (IsAuthenticated,)

    ix = index.open_index()
    def __init__(self, *args, **kwargs):
        super(SearchView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()

    def add_infos_to_hit(self, r):
        doc = Document.objects.get(id=r['id'])
@@ -210,6 +224,9 @@ class SearchView(APIView):
            except (ValueError, TypeError):
                page = 1

            if page < 1:
                page = 1

            with index.query_page(self.ix, query, page) as result_page:
                return Response(
                    {'count': len(result_page),
@@ -229,7 +246,9 @@ class SearchAutoCompleteView(APIView):

    permission_classes = (IsAuthenticated,)

    ix = index.open_index()
    def __init__(self, *args, **kwargs):
        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()

    def get(self, request, format=None):
        if 'term' in request.query_params:
@@ -1,8 +1,19 @@
from django.conf import settings
from django.contrib.auth.models import User
from django.utils.deprecation import MiddlewareMixin
from rest_framework import authentication


class AutoLoginMiddleware(MiddlewareMixin):

    def process_request(self, request):
        try:
            request.user = User.objects.get(
                username=settings.AUTO_LOGIN_USERNAME)
        except User.DoesNotExist:
            pass


class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
    """ This class is here to provide authentication to the angular dev server
        during development. This is disabled in production.
@@ -144,6 +144,15 @@ TEMPLATES = [
# Security                                                                    #
###############################################################################

AUTO_LOGIN_USERNAME = os.getenv("PAPERLESS_AUTO_LOGIN_USERNAME")

if AUTO_LOGIN_USERNAME:
    _index = MIDDLEWARE.index('django.contrib.auth.middleware.AuthenticationMiddleware')
    # This overrides everything the auth middleware is doing but still allows
    # regular login in case the provided user does not exist.
    MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware')

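For illustration, the auto-login path is switched on purely by the environment variable read above (assumed shell usage; any Django entry point works the same way):

# PAPERLESS_AUTO_LOGIN_USERNAME=admin python3 manage.py runserver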
if DEBUG:
    X_FRAME_OPTIONS = ''
    # this should really be 'allow-from uri' but it's not supported in any major
@@ -241,6 +250,8 @@ USE_TZ = True
# Logging                                                                     #
###############################################################################

DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER")

LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,

@@ -1 +1 @@
__version__ = (0, 9, 2)
__version__ = (0, 9, 3)

@@ -0,0 +1,2 @@
# this is here so that django finds the checks.
from .checks import *

25  src/paperless_tesseract/checks.py  Normal file
@@ -0,0 +1,25 @@
import subprocess

from django.conf import settings
from django.core.checks import Error, register


def get_tesseract_langs():
    with subprocess.Popen(['tesseract', '--list-langs'],
                          stdout=subprocess.PIPE) as p:
        stdout, stderr = p.communicate()

    return stdout.decode().strip().split("\n")[1:]


@register()
def check_default_language_available(app_configs, **kwargs):
    langs = get_tesseract_langs()

    if settings.OCR_LANGUAGE not in langs:
        return [Error(
            f"The default ocr language {settings.OCR_LANGUAGE} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
    else:
        return []
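get_tesseract_langs() drops the first line of the output because `tesseract --list-langs` prints a header before the language codes (output shape assumed, not verbatim):

# $ tesseract --list-langs
# List of available languages (2):
# deu
# eng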
@@ -3,10 +3,9 @@ exclude = migrations, paperless/settings.py, .tox, */tests/*

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all
addopts = --pythonwarnings=all --cov --cov-report=html -n auto
env =
  PAPERLESS_SECRET=paperless
  PAPERLESS_EMAIL_SECRET=paperless
  PAPERLESS_DISABLE_DBHANDLER=true


[coverage:run]