mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 08:12:34 -04:00 
			
		
		
		
	
						commit
						f5e0a89a3f
					
				
							
								
								
									
										7
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -57,7 +57,9 @@ docs/_build/ | |||||||
| target/ | target/ | ||||||
| 
 | 
 | ||||||
| # Stored PDFs | # Stored PDFs | ||||||
| media/* | media/documents/*.gpg | ||||||
|  | media/documents/thumbnails/*.gpg | ||||||
|  | media/documents/originals/*.gpg | ||||||
| 
 | 
 | ||||||
| # Sqlite database | # Sqlite database | ||||||
| db.sqlite3 | db.sqlite3 | ||||||
| @ -68,8 +70,9 @@ db.sqlite3 | |||||||
| # Other stuff that doesn't belong | # Other stuff that doesn't belong | ||||||
| virtualenv | virtualenv | ||||||
| .vagrant | .vagrant | ||||||
|  | docker-compose.yml | ||||||
|  | docker-compose.env | ||||||
| 
 | 
 | ||||||
| # Used for development | # Used for development | ||||||
| scripts/import-for-development | scripts/import-for-development | ||||||
| environment | environment | ||||||
| 
 |  | ||||||
|  | |||||||
							
								
								
									
										18
									
								
								.travis.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								.travis.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | |||||||
|  | language: python | ||||||
|  | 
 | ||||||
|  | sudo: false | ||||||
|  | 
 | ||||||
|  | matrix: | ||||||
|  |     include: | ||||||
|  |         - python: 3.4 | ||||||
|  |           env: TOXENV=py34 | ||||||
|  |         - python: 3.5 | ||||||
|  |           env: TOXENV=py35 | ||||||
|  |         - python: 3.5 | ||||||
|  |           env: TOXENV=pep8 | ||||||
|  | 
 | ||||||
|  | install: | ||||||
|  |     - pip install --requirement requirements.txt | ||||||
|  |     - pip install tox | ||||||
|  | 
 | ||||||
|  | script: tox -c src/tox.ini | ||||||
							
								
								
									
										46
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,46 @@ | |||||||
|  | FROM python:3.5.1 | ||||||
|  | MAINTAINER Pit Kleyersburg <pitkley@googlemail.com> | ||||||
|  | 
 | ||||||
|  | # Install dependencies | ||||||
|  | RUN apt-get update \ | ||||||
|  |     && apt-get install -y --no-install-recommends \ | ||||||
|  |         sudo \ | ||||||
|  |         tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \ | ||||||
|  |     && rm -rf /var/lib/apt/lists/* | ||||||
|  | 
 | ||||||
|  | # Install python dependencies | ||||||
|  | RUN mkdir -p /usr/src/paperless | ||||||
|  | WORKDIR /usr/src/paperless | ||||||
|  | COPY requirements.txt /usr/src/paperless/ | ||||||
|  | RUN pip install --no-cache-dir -r requirements.txt | ||||||
|  | 
 | ||||||
|  | # Copy application | ||||||
|  | RUN mkdir -p /usr/src/paperless/src | ||||||
|  | RUN mkdir -p /usr/src/paperless/data | ||||||
|  | RUN mkdir -p /usr/src/paperless/media | ||||||
|  | COPY src/ /usr/src/paperless/src/ | ||||||
|  | COPY data/ /usr/src/paperless/data/ | ||||||
|  | COPY media/ /usr/src/paperless/media/ | ||||||
|  | 
 | ||||||
|  | # Set consumption directory | ||||||
|  | ENV PAPERLESS_CONSUMPTION_DIR /consume | ||||||
|  | RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR | ||||||
|  | 
 | ||||||
|  | # Migrate database | ||||||
|  | WORKDIR /usr/src/paperless/src | ||||||
|  | RUN ./manage.py migrate | ||||||
|  | 
 | ||||||
|  | # Create user | ||||||
|  | RUN groupadd -g 1000 paperless \ | ||||||
|  |     && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ | ||||||
|  |     && chown -Rh paperless:paperless /usr/src/paperless | ||||||
|  | 
 | ||||||
|  | # Setup entrypoint | ||||||
|  | COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh | ||||||
|  | RUN chmod 755 /sbin/docker-entrypoint.sh | ||||||
|  | 
 | ||||||
|  | # Mount volumes | ||||||
|  | VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"] | ||||||
|  | 
 | ||||||
|  | ENTRYPOINT ["/sbin/docker-entrypoint.sh"] | ||||||
|  | CMD ["--help"] | ||||||
| @ -3,6 +3,7 @@ Paperless | |||||||
| 
 | 
 | ||||||
| |Documentation| | |Documentation| | ||||||
| |Chat| | |Chat| | ||||||
|  | |Travis| | ||||||
| 
 | 
 | ||||||
| Scan, index, and archive all of your paper documents | Scan, index, and archive all of your paper documents | ||||||
| 
 | 
 | ||||||
| @ -55,6 +56,7 @@ powerful tools. | |||||||
| 
 | 
 | ||||||
| * `ImageMagick`_ converts the images between colour and greyscale. | * `ImageMagick`_ converts the images between colour and greyscale. | ||||||
| * `Tesseract`_ does the character recognition. | * `Tesseract`_ does the character recognition. | ||||||
|  | * `Unpaper`_ despeckles and deskews the scanned image. | ||||||
| * `GNU Privacy Guard`_ is used as the encryption backend. | * `GNU Privacy Guard`_ is used as the encryption backend. | ||||||
| * `Python 3`_ is the language of the project. | * `Python 3`_ is the language of the project. | ||||||
| 
 | 
 | ||||||
| @ -92,6 +94,7 @@ home. | |||||||
| .. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail | .. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail | ||||||
| .. _ImageMagick: http://imagemagick.org/ | .. _ImageMagick: http://imagemagick.org/ | ||||||
| .. _Tesseract: https://github.com/tesseract-ocr | .. _Tesseract: https://github.com/tesseract-ocr | ||||||
|  | .. _Unpaper: https://www.flameeyes.eu/projects/unpaper | ||||||
| .. _GNU Privacy Guard: https://gnupg.org/ | .. _GNU Privacy Guard: https://gnupg.org/ | ||||||
| .. _Python 3: https://python.org/ | .. _Python 3: https://python.org/ | ||||||
| .. _Pillow: https://pypi.python.org/pypi/pillowfight/ | .. _Pillow: https://pypi.python.org/pypi/pillowfight/ | ||||||
| @ -105,4 +108,5 @@ home. | |||||||
| .. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg | .. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg | ||||||
|    :alt: Join the chat at https://gitter.im/danielquinn/paperless |    :alt: Join the chat at https://gitter.im/danielquinn/paperless | ||||||
|    :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge |    :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge | ||||||
| 
 | .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master | ||||||
|  |    :target: https://travis-ci.org/danielquinn/paperless | ||||||
|  | |||||||
							
								
								
									
										15
									
								
								docker-compose.env.example
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								docker-compose.env.example
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,15 @@ | |||||||
|  | # Environment variables to set for Paperless | ||||||
|  | # Commented out variables will be replaced by a default within Paperless. | ||||||
|  | 
 | ||||||
|  | # Passphrase Paperless uses to encrypt and decrypt your documents | ||||||
|  | PAPERLESS_PASSPHRASE=CHANGE_ME | ||||||
|  | 
 | ||||||
|  | # The number of threads to use for text recognition | ||||||
|  | # PAPERLESS_OCR_THREADS=4 | ||||||
|  | 
 | ||||||
|  | # Additional languages to install for text recognition | ||||||
|  | # PAPERLESS_OCR_LANGUAGES=deu ita | ||||||
|  | 
 | ||||||
|  | # You can change the default user and group id to a custom one | ||||||
|  | # USERMAP_UID=1000 | ||||||
|  | # USERMAP_GID=1000 | ||||||
							
								
								
									
										37
									
								
								docker-compose.yml.example
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								docker-compose.yml.example
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,37 @@ | |||||||
|  | version: '2' | ||||||
|  | 
 | ||||||
|  | services: | ||||||
|  |     webserver: | ||||||
|  |         image: paperless | ||||||
|  |         ports: | ||||||
|  |             # You can adapt the port you want Paperless to listen on by | ||||||
|  |             # modifying the part before the `:`. | ||||||
|  |             - "8000:8000" | ||||||
|  |         volumes: | ||||||
|  |             - data:/usr/src/paperless/data | ||||||
|  |             - media:/usr/src/paperless/media | ||||||
|  |         env_file: docker-compose.env | ||||||
|  |         environment: | ||||||
|  |             - PAPERLESS_OCR_LANGUAGES= | ||||||
|  |         command: ["runserver", "0.0.0.0:8000"] | ||||||
|  | 
 | ||||||
|  |     consumer: | ||||||
|  |         image: paperless | ||||||
|  |         volumes: | ||||||
|  |             - data:/usr/src/paperless/data | ||||||
|  |             - media:/usr/src/paperless/media | ||||||
|  |             # You have to adapt the local path you want the consumption | ||||||
|  |             # directory to mount to by modifying the part before the ':'. | ||||||
|  |             - /path/to/arbitrary/place:/consume | ||||||
|  |             # Likewise, you can add a local path to mount a directory for | ||||||
|  |             # exporting. This is not strictly needed for paperless to | ||||||
|  |             # function, only if you're exporting your files: uncomment | ||||||
|  |             # it and fill in a local path if you know you're going to | ||||||
|  |             # want to export your documents. | ||||||
|  |             # - /path/to/another/arbitrary/place:/export | ||||||
|  |         env_file: docker-compose.env | ||||||
|  |         command: ["document_consumer"] | ||||||
|  | 
 | ||||||
|  | volumes: | ||||||
|  |     data: | ||||||
|  |     media: | ||||||
							
								
								
									
										18
									
								
								docs/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								docs/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | |||||||
|  | FROM python:3.5.1 | ||||||
|  | MAINTAINER Pit Kleyersburg <pitkley@googlemail.com> | ||||||
|  | 
 | ||||||
|  | # Install Sphinx and Pygments | ||||||
|  | RUN pip install Sphinx Pygments | ||||||
|  | 
 | ||||||
|  | # Setup directories, copy data | ||||||
|  | RUN mkdir /build | ||||||
|  | COPY . /build | ||||||
|  | WORKDIR /build/docs | ||||||
|  | 
 | ||||||
|  | # Build documentation | ||||||
|  | RUN make html | ||||||
|  | 
 | ||||||
|  | # Start webserver | ||||||
|  | WORKDIR /build/docs/_build/html | ||||||
|  | EXPOSE 8000/tcp | ||||||
|  | CMD ["python3", "-m", "http.server"] | ||||||
							
								
								
									
										23
									
								
								docs/api.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								docs/api.rst
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,23 @@ | |||||||
|  | .. _api: | ||||||
|  | 
 | ||||||
|  | The REST API | ||||||
|  | ############ | ||||||
|  | 
 | ||||||
|  | Paperless makes use of the `Django REST Framework`_ standard API interface | ||||||
|  | because of its inherent awesomeness.  Conveniently, the system is also | ||||||
|  | self-documenting, so to learn more about the access points, schema, what's | ||||||
|  | accepted and what isn't, you need only visit ``/api`` on your local Paperless | ||||||
|  | installation. | ||||||
|  | 
 | ||||||
|  | .. _Django REST Framework: http://django-rest-framework.org/ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _api-uploading: | ||||||
|  | 
 | ||||||
|  | Uploading | ||||||
|  | --------- | ||||||
|  | 
 | ||||||
|  | File uploads in an API are hard and so far as I've been able to tell, there's | ||||||
|  | no standard way of accepting them, so rather than crowbar file uploads into the | ||||||
|  | REST API and endure that headache, I've left that process to a simple HTTP | ||||||
|  | POST, documented on the :ref:`consumption page <consumption-http>`. | ||||||
| @ -1,10 +1,51 @@ | |||||||
| Changelog | Changelog | ||||||
| ######### | ######### | ||||||
| 
 | 
 | ||||||
|  | * 0.1.1 | ||||||
|  | 
 | ||||||
|  |   * Potentially **Breaking Change**: All references to "sender" in the code | ||||||
|  |     have been renamed to "correspondent" to better reflect the nature of the | ||||||
|  |     property (one could quite reasonably scan a document before sending it to | ||||||
|  |     someone.) | ||||||
|  |   * `#67`_: Rewrote the document exporter and added a new importer that allows | ||||||
|  |     for full metadata retention without depending on the file name and | ||||||
|  |     modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_, | ||||||
|  |     `Florian Jung`_, and `Christopher Luu`_ for their code snippets and | ||||||
|  |     contributing conversation that led to this change. | ||||||
|  |   * `#20`_: Added *unpaper* support to help in cleaning up the scanned image | ||||||
|  |     before it's OCR'd.  Thanks to `Pit`_ for this one. | ||||||
|  |   * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI. | ||||||
|  |   * `#68`_: Added support for using a proper config file at | ||||||
|  |     ``/etc/paperless.conf`` and modified the systemd unit files to use it. | ||||||
|  |   * Refactored the Vagrant installation process to use environment variables | ||||||
|  |     rather than asking the user to modify ``settings.py``. | ||||||
|  |   * `#44`_: Harmonise environment variable names with constant names. | ||||||
|  |   * `#60`_: Setup logging to actually use the Python native logging framework. | ||||||
|  |   * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images | ||||||
|  |     to be imported but made unavailable. | ||||||
|  | 
 | ||||||
|  | * 0.1.0 | ||||||
|  | 
 | ||||||
|  |   * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and | ||||||
|  |     `Tikitu de Jager`_ for this one, and especially to `Pit`_ | ||||||
|  |     who spearheaded this effort. | ||||||
|  |   * A simple REST API is in place, but it should be considered unstable. | ||||||
|  |   * Cleaned up the consumer to use temporary directories instead of a single | ||||||
|  |     scratch space.  (Thanks `Pit`_) | ||||||
|  |   * Improved the efficiency of the consumer by parsing pages more intelligently | ||||||
|  |     and introducing a threaded OCR process (thanks again `Pit`_). | ||||||
|  |   * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_. | ||||||
|  |   * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by | ||||||
|  |     `Pit`_. | ||||||
|  |   * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) | ||||||
|  |   * `#54`_: Documented the re-tagger (`zedster`_) | ||||||
|  |   * `#57`_: Make sure file is preserved on import failure (`darkmatter`_) | ||||||
|  |   * Added tox with pep8 checking | ||||||
|  | 
 | ||||||
| * 0.0.6 | * 0.0.6 | ||||||
| 
 | 
 | ||||||
|   * Added support for parallel OCR (significant work from pitkley) |   * Added support for parallel OCR (significant work from `Pit`_) | ||||||
|   * Sped up the language detection (significant work from pitkley) |   * Sped up the language detection (significant work from `Pit`_) | ||||||
|   * Added simple logging |   * Added simple logging | ||||||
| 
 | 
 | ||||||
| * 0.0.5 | * 0.0.5 | ||||||
| @ -35,3 +76,26 @@ Changelog | |||||||
| * 0.0.1 | * 0.0.1 | ||||||
| 
 | 
 | ||||||
|   * Initial release |   * Initial release | ||||||
|  | 
 | ||||||
|  | .. _Brian Conn: https://github.com/TheConnMan | ||||||
|  | .. _Christopher Luu: https://github.com/nuudles | ||||||
|  | .. _Florian Jung: https://github.com/the01 | ||||||
|  | .. _Tikitu de Jager: https://github.com/tikitu | ||||||
|  | .. _Paul: https://github.com/polo2ro | ||||||
|  | .. _Pit: https://github.com/pitkley | ||||||
|  | .. _Wayne Werner: https://github.com/waynew | ||||||
|  | .. _darkmatter: https://github.com/darkmatter | ||||||
|  | .. _zedster: https://github.com/zedster | ||||||
|  | 
 | ||||||
|  | .. _#20: https://github.com/danielquinn/paperless/issues/20 | ||||||
|  | .. _#44: https://github.com/danielquinn/paperless/issues/44 | ||||||
|  | .. _#45: https://github.com/danielquinn/paperless/issues/45 | ||||||
|  | .. _#47: https://github.com/danielquinn/paperless/issues/47 | ||||||
|  | .. _#48: https://github.com/danielquinn/paperless/issues/48 | ||||||
|  | .. _#53: https://github.com/danielquinn/paperless/issues/53 | ||||||
|  | .. _#54: https://github.com/danielquinn/paperless/issues/54 | ||||||
|  | .. _#57: https://github.com/danielquinn/paperless/issues/57 | ||||||
|  | .. _#60: https://github.com/danielquinn/paperless/issues/60 | ||||||
|  | .. _#67: https://github.com/danielquinn/paperless/issues/67 | ||||||
|  | .. _#68: https://github.com/danielquinn/paperless/issues/68 | ||||||
|  | .. _#71: https://github.com/danielquinn/paperless/issues/71 | ||||||
|  | |||||||
| @ -40,14 +40,14 @@ follow the :ref:`consumer <utilities-consumer>` instructions to get it running. | |||||||
| A Note on File Naming | A Note on File Naming | ||||||
| --------------------- | --------------------- | ||||||
| 
 | 
 | ||||||
| Any document you put into the consumption directory will be consumed, but if you | Any document you put into the consumption directory will be consumed, but if | ||||||
| name the file right, it'll automatically set some values in the database for | you name the file right, it'll automatically set some values in the database | ||||||
| you.  This is the logic the consumer follows: | for you.  This is the logic the consumer follows: | ||||||
| 
 | 
 | ||||||
| 1. Try to find the sender, title, and tags in the file name following the | 1. Try to find the correspondent, title, and tags in the file name following | ||||||
|    pattern: ``Sender - Title - tag,tag,tag.pdf``. |    the pattern: ``Correspondent - Title - tag,tag,tag.pdf``. | ||||||
| 2. If that doesn't work, try to find the sender and title in the file name | 2. If that doesn't work, try to find the correspondent and title in the file | ||||||
|    following the pattern:  ``Sender - Title.pdf``. |    name following the pattern:  ``Correspondent - Title.pdf``. | ||||||
| 3. If that doesn't work, just assume that the name of the file is the title. | 3. If that doesn't work, just assume that the name of the file is the title. | ||||||
| 
 | 
 | ||||||
| So given the above, the following examples would work as you'd expect: | So given the above, the following examples would work as you'd expect: | ||||||
| @ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running: | |||||||
|    the configured email account every 10 minutes for something new and pull down |    the configured email account every 10 minutes for something new and pull down | ||||||
|    whatever it finds. |    whatever it finds. | ||||||
| 4. Send yourself an email!  Note that the subject is treated as the file name, | 4. Send yourself an email!  Note that the subject is treated as the file name, | ||||||
|    so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get |    so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll | ||||||
|    what you expect.  Also, you must include the aforementioned secret string in |    get what you expect.  Also, you must include the aforementioned secret | ||||||
|    every email so the fetcher knows that it's safe to import. |    string in every email so the fetcher knows that it's safe to import. | ||||||
| 5. After a few minutes, the consumer will poll your mailbox, pull down the | 5. After a few minutes, the consumer will poll your mailbox, pull down the | ||||||
|    message, and place the attachment in the consumption directory with the |    message, and place the attachment in the consumption directory with the | ||||||
|    appropriate name.  A few minutes later, the consumer will import it like any |    appropriate name.  A few minutes later, the consumer will import it like any | ||||||
| @ -111,23 +111,22 @@ So, with all that in mind, here's what you do to get it running: | |||||||
| HTTP POST | HTTP POST | ||||||
| ========= | ========= | ||||||
| 
 | 
 | ||||||
| Currently, the API is limited to only handling file uploads, it doesn't do tags | You can also submit a document via HTTP POST.  It doesn't do tags yet, and the | ||||||
| yet, and the URL schema isn't concrete, but it's a start.  It's also not much of | URL schema isn't concrete, but it's a start. | ||||||
| a real API, it's just a URL that accepts an HTTP POST. |  | ||||||
| 
 | 
 | ||||||
| To push your document to *Paperless*, send an HTTP POST to the server with the | To push your document to Paperless, send an HTTP POST to the server with the | ||||||
| following name/value pairs: | following name/value pairs: | ||||||
| 
 | 
 | ||||||
| * ``sender``: The name of the document's sender.  Note that there are | * ``correspondent``: The name of the document's correspondent.  Note that there | ||||||
|   restrictions on what characters you can use here.  Specifically, alphanumeric |   are restrictions on what characters you can use here.  Specifically, | ||||||
|   characters, `-`, `,`, `.`, and `'` are ok, everything else is out.  You also |   alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is | ||||||
|   can't use the sequence ` - ` (space, dash, space). |   out.  You also can't use the sequence ` - ` (space, dash, space). | ||||||
| * ``title``: The title of the document.  The rules for characters are the same | * ``title``: The title of the document.  The rules for characters are the same | ||||||
|   here as the sender. |   here as the correspondent. | ||||||
| * ``signature``: For security reasons, we have the sender send a signature using | * ``signature``: For security reasons, we have the correspondent send a | ||||||
|   a "shared secret" method to make sure that random strangers don't start |   signature using a "shared secret" method to make sure that random strangers | ||||||
|   uploading stuff to your server.  The means of generating this signature is |   don't start uploading stuff to your server.  The means of generating this | ||||||
|   defined below. |   signature is defined below. | ||||||
| 
 | 
 | ||||||
| Specify ``enctype="multipart/form-data"``, and then POST your file with::: | Specify ``enctype="multipart/form-data"``, and then POST your file with::: | ||||||
| 
 | 
 | ||||||
| @ -146,12 +145,12 @@ verification. | |||||||
| 
 | 
 | ||||||
| In the case of *Paperless*, you configure the server with the secret by setting | In the case of *Paperless*, you configure the server with the secret by setting | ||||||
| ``UPLOAD_SHARED_SECRET``.  Then on your client, you generate your signature by | ``UPLOAD_SHARED_SECRET``.  Then on your client, you generate your signature by | ||||||
| concatenating the sender, title, and the secret, and then using sha256 to | concatenating the correspondent, title, and the secret, and then using sha256 | ||||||
| generate a hexdigest. | to generate a hexdigest. | ||||||
| 
 | 
 | ||||||
| If you're using Python, this is what that looks like: | If you're using Python, this is what that looks like: | ||||||
| 
 | 
 | ||||||
| .. code:: python | .. code:: python | ||||||
| 
 | 
 | ||||||
|     from hashlib import sha256 |     from hashlib import sha256 | ||||||
|     signature = sha256(sender + title + secret).hexdigest() |     signature = sha256(correspondent + title + secret).hexdigest() | ||||||
|  | |||||||
| @ -30,6 +30,7 @@ Contents | |||||||
|    requirements |    requirements | ||||||
|    setup |    setup | ||||||
|    consumption |    consumption | ||||||
|  |    api | ||||||
|    utilities |    utilities | ||||||
|    migrating |    migrating | ||||||
|    changelog |    changelog | ||||||
|  | |||||||
| @ -4,31 +4,10 @@ Migrating, Updates, and Backups | |||||||
| =============================== | =============================== | ||||||
| 
 | 
 | ||||||
| As *Paperless* is still under active development, there's a lot that can change | As *Paperless* is still under active development, there's a lot that can change | ||||||
| as software updates roll out.  The thing you just need to remember for all of | as software updates roll out.  You should backup often, so if anything goes | ||||||
| this is that for the most part, **the database is expendable** so long as you | wrong during an update, you at least have a means of restoring to something | ||||||
| have your files.  This is because the file name of the exported files includes | usable.  Thankfully, there are automated ways of backing up, restoring, and | ||||||
| the name of the sender, the title, and the tags (if any) on each file. | updating the software. | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| .. _migrating-updates: |  | ||||||
| 
 |  | ||||||
| Updates |  | ||||||
| ------- |  | ||||||
| 
 |  | ||||||
| For the most part, all you have to do to update *Paperless* is run ``git pull`` |  | ||||||
| on the directory containing the project files, and then use Django's ``migrate`` |  | ||||||
| command to execute any database schema updates that might have been rolled in |  | ||||||
| as part of the update: |  | ||||||
| 
 |  | ||||||
| .. code:: bash |  | ||||||
| 
 |  | ||||||
|     $ cd /path/to/project |  | ||||||
|     $ git pull |  | ||||||
|     $ cd src |  | ||||||
|     $ ./manage.py migrate |  | ||||||
| 
 |  | ||||||
| Note that it's possible (even likely) that while ``git pull`` may update some |  | ||||||
| files, the ``migrate`` step may not update anything.  This is totally normal. |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| .. _migrating-backup: | .. _migrating-backup: | ||||||
| @ -38,20 +17,8 @@ Backing Up | |||||||
| 
 | 
 | ||||||
| So you're bored of this whole project, or you want to make a remote backup of | So you're bored of this whole project, or you want to make a remote backup of | ||||||
| the unencrypted files for whatever reason.  This is easy to do, simply use the | the unencrypted files for whatever reason.  This is easy to do, simply use the | ||||||
| :ref:`exporter <utilities-exporter>` to dump your documents out into an | :ref:`exporter <utilities-exporter>` to dump your documents and database out | ||||||
| arbitrary directory. | into an arbitrary directory. | ||||||
| 
 |  | ||||||
| Additionally however, you'll need to back up the tags themselves.  The file |  | ||||||
| names contain the tag names, but you still need to define the tags and their |  | ||||||
| matching algorithms in the database for things to work properly.  We do this |  | ||||||
| with Django's ``dumpdata`` command, which produces JSON output. |  | ||||||
| 
 |  | ||||||
| .. code:: bash |  | ||||||
| 
 |  | ||||||
|     $ cd /path/to/project |  | ||||||
|     $ cd src |  | ||||||
|     $ ./manage.py document_export /path/to/arbitrary/place/ |  | ||||||
|     $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| .. _migrating-restoring: | .. _migrating-restoring: | ||||||
| @ -66,7 +33,7 @@ create an empty database (just follow the | |||||||
| ``tags.json`` file you created as part of your backup.  Lastly, copy your | ``tags.json`` file you created as part of your backup.  Lastly, copy your | ||||||
| exported documents into the consumption directory and start up the consumer. | exported documents into the consumption directory and start up the consumer. | ||||||
| 
 | 
 | ||||||
| .. code:: bash | .. code-block:: shell-session | ||||||
| 
 | 
 | ||||||
|     $ cd /path/to/project |     $ cd /path/to/project | ||||||
|     $ rm data/db.sqlite3  # Delete the database |     $ rm data/db.sqlite3  # Delete the database | ||||||
| @ -77,3 +44,60 @@ exported documents into the consumption directory and start up the consumer. | |||||||
|     $ cp /path/to/exported/docs/* /path/to/consumption/dir/ |     $ cp /path/to/exported/docs/* /path/to/consumption/dir/ | ||||||
|     $ ./manage.py document_consumer |     $ ./manage.py document_consumer | ||||||
| 
 | 
 | ||||||
|  | Importing your data if you are :ref:`using Docker <setup-installation-docker>` | ||||||
|  | is almost as simple: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |     # Stop and remove your current containers | ||||||
|  |     $ docker-compose stop | ||||||
|  |     $ docker-compose rm -f | ||||||
|  | 
 | ||||||
|  |     # Recreate them, add the superuser | ||||||
|  |     $ docker-compose up -d | ||||||
|  |     $ docker-compose run --rm webserver createsuperuser | ||||||
|  | 
 | ||||||
|  |     # Load the tags | ||||||
|  |     $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - | ||||||
|  | 
 | ||||||
|  |     # Load your exported documents into the consumption directory | ||||||
|  |     # (How you do this highly depends on how you have set this up) | ||||||
|  |     $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ | ||||||
|  | 
 | ||||||
|  | After loading the documents into the consumption directory the consumer will | ||||||
|  | immediately start consuming the documents. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _migrating-updates: | ||||||
|  | 
 | ||||||
|  | Updates | ||||||
|  | ------- | ||||||
|  | 
 | ||||||
|  | For the most part, all you have to do to update *Paperless* is run ``git pull`` | ||||||
|  | on the directory containing the project files, and then use Django's ``migrate`` | ||||||
|  | command to execute any database schema updates that might have been rolled in | ||||||
|  | as part of the update: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |     $ cd /path/to/project | ||||||
|  |     $ git pull | ||||||
|  |     $ cd src | ||||||
|  |     $ ./manage.py migrate | ||||||
|  | 
 | ||||||
|  | Note that it's possible (even likely) that while ``git pull`` may update some | ||||||
|  | files, the ``migrate`` step may not update anything.  This is totally normal. | ||||||
|  | 
 | ||||||
|  | If you are :ref:`using Docker <setup-installation-docker>` the update process | ||||||
|  | requires only one additional step: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |     $ cd /path/to/project | ||||||
|  |     $ git pull | ||||||
|  |     $ docker build -t paperless . | ||||||
|  |     $ docker-compose up -d | ||||||
|  |     $ docker-compose run --rm webserver migrate | ||||||
|  | 
 | ||||||
|  | If ``git pull`` doesn't report any changes, there is no need to continue with | ||||||
|  | the remaining steps. | ||||||
|  | |||||||
| @ -10,11 +10,13 @@ should work) that has the following software installed on it: | |||||||
| * `GNU Privacy Guard`_ | * `GNU Privacy Guard`_ | ||||||
| * `Tesseract`_ | * `Tesseract`_ | ||||||
| * `Imagemagick`_ | * `Imagemagick`_ | ||||||
|  | * `unpaper`_ | ||||||
| 
 | 
 | ||||||
| .. _Python3: https://python.org/ | .. _Python3: https://python.org/ | ||||||
| .. _GNU Privacy Guard: https://gnupg.org | .. _GNU Privacy Guard: https://gnupg.org | ||||||
| .. _Tesseract: https://github.com/tesseract-ocr | .. _Tesseract: https://github.com/tesseract-ocr | ||||||
| .. _Imagemagick: http://imagemagick.org/ | .. _Imagemagick: http://imagemagick.org/ | ||||||
|  | .. _unpaper: https://www.flameeyes.eu/projects/unpaper | ||||||
| 
 | 
 | ||||||
| Notably, you should confirm how you access your Python3 installation.  Many | Notably, you should confirm how you access your Python3 installation.  Many | ||||||
| Linux distributions will install Python3 in parallel to Python2, using the names | Linux distributions will install Python3 in parallel to Python2, using the names | ||||||
| @ -101,3 +103,16 @@ you'd like to generate your own docs locally, you'll need to: | |||||||
|     $ pip install sphinx |     $ pip install sphinx | ||||||
| 
 | 
 | ||||||
| and then cd into the ``docs`` directory and type ``make html``. | and then cd into the ``docs`` directory and type ``make html``. | ||||||
|  | 
 | ||||||
|  | If you are using Docker, you can use the following commands to build the | ||||||
|  | documentation and run a webserver serving it on `port 8001`_: | ||||||
|  | 
 | ||||||
|  | .. code:: bash | ||||||
|  | 
 | ||||||
|  |     $ pwd | ||||||
|  |     /path/to/paperless | ||||||
|  | 
 | ||||||
|  |     $ docker build -t paperless:docs -f docs/Dockerfile . | ||||||
|  |     $ docker run --rm -it -p "8001:8000" paperless:docs | ||||||
|  | 
 | ||||||
|  | .. _port 8001: http://127.0.0.1:8001 | ||||||
|  | |||||||
							
								
								
									
										215
									
								
								docs/setup.rst
									
									
									
									
									
								
							
							
						
						
									
										215
									
								
								docs/setup.rst
									
									
									
									
									
								
							| @ -37,11 +37,19 @@ or just download the tarball and go that route: | |||||||
| Installation & Configuration | Installation & Configuration | ||||||
| ---------------------------- | ---------------------------- | ||||||
| 
 | 
 | ||||||
| You can go two routes with setting up and running Paperless.  The *Vagrant* | You can go multiple routes with setting up and running Paperless. The `Vagrant | ||||||
| route is quick & easy, but means you're running a VM which comes with memory | route`_ is quick & easy, but means you're running a VM which comes with memory | ||||||
| consumption etc.  Alternatively the standard, "bare metal" approach is a little | consumption etc. We also `support Docker`_, which you can use natively under | ||||||
| more complicated. | Linux and in a VM with `Docker Machine`_ (this guide was written for native | ||||||
|  | Docker usage under Linux, you might have to adapt it for Docker Machine.) | ||||||
|  | Alternatively the standard, `bare metal`_ approach is a little more complicated, | ||||||
|  | but worth it because it makes things easier should you want to contribute some | ||||||
|  | code back. | ||||||
| 
 | 
 | ||||||
|  | .. _Vagrant route: setup-installation-vagrant_ | ||||||
|  | .. _support Docker: setup-installation-docker_ | ||||||
|  | .. _bare metal: setup-installation-standard_ | ||||||
|  | .. _Docker Machine: https://docs.docker.com/machine/ | ||||||
| 
 | 
 | ||||||
| .. _setup-installation-standard: | .. _setup-installation-standard: | ||||||
| 
 | 
 | ||||||
| @ -91,33 +99,188 @@ Vagrant Method | |||||||
| 2. Run ``vagrant up``.  An instance will start up for you.  When it's ready and | 2. Run ``vagrant up``.  An instance will start up for you.  When it's ready and | ||||||
|    provisioned... |    provisioned... | ||||||
| 3. Run ``vagrant ssh`` and once inside your new vagrant box, edit | 3. Run ``vagrant ssh`` and once inside your new vagrant box, edit | ||||||
|    ``/opt/paperless/src/paperless/settings.py`` and set the values for: |    ``/etc/paperless.conf`` and set the values for: | ||||||
|     * ``CONSUMPTION_DIR``: this is where your documents will be dumped to be |     * ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be | ||||||
|       consumed by Paperless. |       dumped to be consumed by Paperless. | ||||||
|     * ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt |     * ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to | ||||||
|       the original document.  The default value attempts to source the |       encrypt/decrypt the original document. | ||||||
|       passphrase from the environment, so if you don't set it to a static value |     * ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming | ||||||
|       here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the |       documents from mail or via the API.  If you don't use either, leaving it | ||||||
|       command line whenever invoking the consumer or webserver. |       blank is just fine. | ||||||
| 4. Initialise the database with ``/opt/paperless/src/manage.py migrate``. | 4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again.  This | ||||||
| 5. Still inside your vagrant box, create a user for your Paperless instance with |    updates the environment to make use of the changes you made to the config | ||||||
|    ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to |    file. | ||||||
|  | 5. Initialise the database with ``/opt/paperless/src/manage.py migrate``. | ||||||
|  | 6. Still inside your vagrant box, create a user for your Paperless instance | ||||||
|  |    with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to | ||||||
|    create your user. |    create your user. | ||||||
| 6. Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. | 7. Start the webserver with | ||||||
|    You should now be able to visit your (empty) `Paperless webserver`_ at |    ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be | ||||||
|    ``172.28.128.4:8000``.  You can login with the user/pass you created in #5. |    able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``. | ||||||
| 7. In a separate window, run ``vagrant ssh`` again, but this time once inside |    You can login with the user/pass you created in #6. | ||||||
|  | 8. In a separate window, run ``vagrant ssh`` again, but this time once inside | ||||||
|    your vagrant instance, you should start the consumer script with |    your vagrant instance, you should start the consumer script with | ||||||
|    ``/opt/paperless/src/manage.py document_consumer``. |    ``/opt/paperless/src/manage.py document_consumer``. | ||||||
| 8. Scan something.  Put it in the ``CONSUMPTION_DIR``. | 9. Scan something.  Put it in the ``CONSUMPTION_DIR``. | ||||||
| 9. Wait a few minutes | 10. Wait a few minutes | ||||||
| 10. Visit the document list on your webserver, and it should be there, indexed | 11. Visit the document list on your webserver, and it should be there, indexed | ||||||
|     and downloadable. |     and downloadable. | ||||||
| 
 | 
 | ||||||
| .. _Vagrant: https://vagrantup.com/ | .. _Vagrant: https://vagrantup.com/ | ||||||
| .. _Paperless server: http://172.28.128.4:8000 | .. _Paperless server: http://172.28.128.4:8000 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | .. _setup-installation-docker: | ||||||
|  | 
 | ||||||
|  | Docker Method | ||||||
|  | ............. | ||||||
|  | 
 | ||||||
|  | 1. Install `Docker`_. | ||||||
|  | 
 | ||||||
|  |    .. caution:: | ||||||
|  | 
 | ||||||
|  |       As mentioned earlier, this guide assumes that you use Docker natively | ||||||
|  |       under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows, | ||||||
|  |       you will have to adapt IP addresses, volume-mounting, command execution | ||||||
|  |       and maybe more. | ||||||
|  | 
 | ||||||
|  | 2. Install `docker-compose`_. [#compose]_ | ||||||
|  | 
 | ||||||
|  |    .. caution:: | ||||||
|  | 
 | ||||||
|  |        If you want to use the included ``docker-compose.yml.example`` file, you | ||||||
|  |        need to have at least Docker version **1.10.0** and docker-compose | ||||||
|  |        version **1.6.0**. | ||||||
|  | 
 | ||||||
|  |        See the `Docker installation guide`_ on how to install the current | ||||||
|  |        version of Docker for your operating system or Linux distribution of | ||||||
|  |        choice. To get an up-to-date version of docker-compose, follow the | ||||||
|  |        `docker-compose installation guide`_ if your package repository doesn't | ||||||
|  |        include it. | ||||||
|  | 
 | ||||||
|  |        .. _Docker installation guide: https://docs.docker.com/engine/installation/ | ||||||
|  |        .. _docker-compose installation guide: https://docs.docker.com/compose/install/ | ||||||
|  | 
 | ||||||
|  | 3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and | ||||||
|  |    a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be | ||||||
|  |    editing both these files: taking a copy ensures that you can ``git pull`` to | ||||||
|  |    receive updates without risking merge conflicts with your modified versions | ||||||
|  |    of the configuration files. | ||||||
|  | 4. Modify ``docker-compose.yml`` to your preferences, following the instructions | ||||||
|  |    in comments in the file. The only change that is a hard requirement is to | ||||||
|  |    specify where the consumption directory should mount. | ||||||
|  | 5. Modify ``docker-compose.env`` and adapt the following environment variables: | ||||||
|  | 
 | ||||||
|  |    ``PAPERLESS_PASSPHRASE`` | ||||||
|  |      This is the passphrase Paperless uses to encrypt/decrypt the original | ||||||
|  |      document. | ||||||
|  | 
 | ||||||
|  |    ``PAPERLESS_OCR_THREADS`` | ||||||
|  |      This is the number of threads the OCR process will spawn to process | ||||||
|  |      document pages in parallel. If the variable is not set, Python determines | ||||||
|  |      the core-count of your CPU and uses that value. | ||||||
|  | 
 | ||||||
|  |    ``PAPERLESS_OCR_LANGUAGES`` | ||||||
|  |      If you want the OCR to recognize other languages in addition to the default | ||||||
|  |      English, set this parameter to a space separated list of three-letter | ||||||
|  |      language-codes after `ISO 639-2/T`_. For a list of available languages -- | ||||||
|  |      including their three letter codes -- see the `Debian packagelist`_. | ||||||
|  | 
 | ||||||
|  |    ``USERMAP_UID`` and ``USERMAP_GID`` | ||||||
|  |      If you want to mount the consumption volume (directory ``/consume`` within | ||||||
|  |      the containers) to a host-directory -- which you probably want to do -- | ||||||
|  |      access rights might be an issue. The default user and group ``paperless`` | ||||||
|  |      in the containers have an id of 1000. The containers will enforce that the | ||||||
|  |      owning group of the consumption directory will be ``paperless`` to be able | ||||||
|  |      to delete consumed documents. If your host-system has a group with an id of | ||||||
|  |      1000 and you don't want this group to have access rights to the consumption | ||||||
|  |      directory, you can use ``USERMAP_GID`` to change the id in the container | ||||||
|  |      and thus the one of the consumption directory. Furthermore, you can change | ||||||
|  |      the id of the default user as well using ``USERMAP_UID``. | ||||||
|  | 
 | ||||||
|  | 6. Run ``docker-compose up -d``. This will create and start the necessary | ||||||
|  |    containers. | ||||||
|  | 7. To be able to login, you will need a super user. To create it, execute the | ||||||
|  |    following command: | ||||||
|  | 
 | ||||||
|  |    .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |        $ docker-compose run --rm webserver createsuperuser | ||||||
|  | 
 | ||||||
|  |    This will prompt you to set a username (default ``paperless``), an optional | ||||||
|  |    e-mail address and finally a password. | ||||||
|  | 8. The default ``docker-compose.yml`` exports the webserver on your local port | ||||||
|  |    8000. If you haven't adapted this, you should now be able to visit your | ||||||
|  |    `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the | ||||||
|  |    user and password you just created. | ||||||
|  | 9. Add files to the consumption directory the way you prefer to. Following are two | ||||||
|  |    possible options: | ||||||
|  | 
 | ||||||
|  |    1. Mount the consumption directory to a local host path by modifying your | ||||||
|  |       ``docker-compose.yml``: | ||||||
|  | 
 | ||||||
|  |       .. code-block:: diff | ||||||
|  | 
 | ||||||
|  |          diff --git a/docker-compose.yml b/docker-compose.yml | ||||||
|  |          --- a/docker-compose.yml | ||||||
|  |          +++ b/docker-compose.yml | ||||||
|  |          @@ -17,9 +18,8 @@ services: | ||||||
|  |                   volumes: | ||||||
|  |                       - paperless-data:/usr/src/paperless/data | ||||||
|  |                       - paperless-media:/usr/src/paperless/media | ||||||
|  |          -            - /consume | ||||||
|  |          +            - /local/path/you/choose:/consume | ||||||
|  | 
 | ||||||
|  |       .. danger:: | ||||||
|  | 
 | ||||||
|  |           While the consumption container will ensure at startup that it can | ||||||
|  |           **delete** a consumed file from a host-mounted directory, it might not | ||||||
|  |           be able to **read** the document in the first place if the access | ||||||
|  |           rights to the file are incorrect. | ||||||
|  | 
 | ||||||
|  |           Make sure that the documents you put into the consumption directory | ||||||
|  |           will either be readable by everyone (``chmod o+r file.pdf``) or | ||||||
|  |           readable by the default user or group id 1000 (or the one you have set | ||||||
|  |           with ``USERMAP_UID`` or ``USERMAP_GID`` respectively). | ||||||
|  | 
 | ||||||
|  |    2. Use ``docker cp`` to copy your files directly into the container: | ||||||
|  | 
 | ||||||
|  |       .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |          $ # Identify your containers | ||||||
|  |          $ docker-compose ps | ||||||
|  |                  Name                       Command                State     Ports | ||||||
|  |          ------------------------------------------------------------------------- | ||||||
|  |          paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0 | ||||||
|  |          paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0 | ||||||
|  | 
 | ||||||
|  |          $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume | ||||||
|  | 
 | ||||||
|  |       ``docker cp`` is a one-shot-command, just like ``cp``. This means that | ||||||
|  |       every time you want to consume a new document, you will have to execute | ||||||
|  |       ``docker cp`` again. You can of course automate this process, but option 1 | ||||||
|  |       is generally the preferred one. | ||||||
|  | 
 | ||||||
|  |       .. danger:: | ||||||
|  | 
 | ||||||
|  |           ``docker cp`` will change the owning user and group of a copied file | ||||||
|  |           to the acting user at the destination, which will be ``root``. | ||||||
|  | 
 | ||||||
|  |           You therefore need to ensure that the documents you want to copy into | ||||||
|  |           the container are readable by everyone (``chmod o+r file.pdf``) before | ||||||
|  |           copying them. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _Docker: https://www.docker.com/ | ||||||
|  | .. _docker-compose: https://docs.docker.com/compose/install/ | ||||||
|  | .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes | ||||||
|  | .. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- | ||||||
|  | 
 | ||||||
|  | .. [#compose] You of course don't have to use docker-compose, but it | ||||||
|  |    simplifies deployment immensely. If you know your way around Docker, feel | ||||||
|  |    free to tinker around without using compose! | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| .. _making-things-a-little-more-permanent: | .. _making-things-a-little-more-permanent: | ||||||
| 
 | 
 | ||||||
| Making Things a Little more Permanent | Making Things a Little more Permanent | ||||||
| @ -126,5 +289,9 @@ Making Things a Little more Permanent | |||||||
| Once you've tested things and are happy with the work flow, you can automate the | Once you've tested things and are happy with the work flow, you can automate the | ||||||
| process of starting the webserver and consumer automatically.  If you're running | process of starting the webserver and consumer automatically.  If you're running | ||||||
| on a bare metal system that's using Systemd, you can use the service unit files | on a bare metal system that's using Systemd, you can use the service unit files | ||||||
| in the ``scripts`` directory to set this up.  If you're on a SysV or other | in the ``scripts`` directory to set this up.  If you're on another startup | ||||||
| startup system (like the Vagrant box), then you're currently on your own. | system or are using a Vagrant box, then you're currently on your own. If you are | ||||||
|  | using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to | ||||||
|  | have the containers automatically start with the Docker daemon. | ||||||
|  | 
 | ||||||
|  | .. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart | ||||||
|  | |||||||
| @ -26,7 +26,7 @@ How to Use It | |||||||
| 
 | 
 | ||||||
| The webserver is started via the ``manage.py`` script: | The webserver is started via the ``manage.py`` script: | ||||||
| 
 | 
 | ||||||
| .. code:: bash | .. code-block:: shell-session | ||||||
| 
 | 
 | ||||||
|     $ /path/to/paperless/src/manage.py runserver |     $ /path/to/paperless/src/manage.py runserver | ||||||
| 
 | 
 | ||||||
| @ -64,7 +64,7 @@ How to Use It | |||||||
| 
 | 
 | ||||||
| The consumer is started via the ``manage.py`` script: | The consumer is started via the ``manage.py`` script: | ||||||
| 
 | 
 | ||||||
| .. code:: bash | .. code-block:: shell-session | ||||||
| 
 | 
 | ||||||
|     $ /path/to/paperless/src/manage.py document_consumer |     $ /path/to/paperless/src/manage.py document_consumer | ||||||
| 
 | 
 | ||||||
| @ -95,13 +95,110 @@ How to Use It | |||||||
| 
 | 
 | ||||||
| This too is done via the ``manage.py`` script: | This too is done via the ``manage.py`` script: | ||||||
| 
 | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |     $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ | ||||||
|  | 
 | ||||||
|  | This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you | ||||||
|  | to do with as you please.  The files are accompanied with a special file, | ||||||
|  | ``manifest.json`` which can be used to | ||||||
|  | :ref:`import the files <utilities-importer>` at a later date if you wish. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _utilities-exporter-howto-docker: | ||||||
|  | 
 | ||||||
|  | Docker | ||||||
|  | ______ | ||||||
|  | 
 | ||||||
|  | If you are :ref:`using Docker <setup-installation-docker>`, running the | ||||||
|  | exporter is almost as easy.  To mount a volume for exports, follow the | ||||||
|  | instructions in the ``docker-compose.yml.example`` file for the ``/export`` | ||||||
|  | volume (making the changes in your own ``docker-compose.yml`` file, of course). | ||||||
|  | Once you have the volume mounted, the command to run an export is: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |    $ docker-compose run --rm consumer document_exporter /export | ||||||
|  | 
 | ||||||
|  | If you prefer to use ``docker run`` directly, supplying the necessary commandline | ||||||
|  | options: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |    $ # Identify your containers | ||||||
|  |    $ docker-compose ps | ||||||
|  |            Name                       Command                State     Ports | ||||||
|  |    ------------------------------------------------------------------------- | ||||||
|  |    paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0 | ||||||
|  |    paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0 | ||||||
|  | 
 | ||||||
|  |    $ # Make sure to replace your passphrase and remove or adapt the id mapping | ||||||
|  |    $ docker run --rm \ | ||||||
|  |        --volumes-from paperless_data_1 \ | ||||||
|  |        --volume /path/to/arbitrary/place:/export \ | ||||||
|  |        -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ | ||||||
|  |        -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ | ||||||
|  |        paperless document_exporter /export | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _utilities-importer: | ||||||
|  | 
 | ||||||
|  | The Importer | ||||||
|  | ------------ | ||||||
|  | 
 | ||||||
|  | Looking to transfer Paperless data from one instance to another, or just want | ||||||
|  | to restore from a backup?  This is your go-to toy. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _utilities-importer-howto: | ||||||
|  | 
 | ||||||
|  | How to Use It | ||||||
|  | ............. | ||||||
|  | 
 | ||||||
|  | The importer works just like the exporter.  You point it at a directory, and | ||||||
|  | the script does the rest of the work: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |     $ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/ | ||||||
|  | 
 | ||||||
|  | Docker | ||||||
|  | ______ | ||||||
|  | 
 | ||||||
|  | Assuming that you've already gone through the steps above in the | ||||||
|  | :ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing | ||||||
|  | to do is just re-use the ``/export`` path you already set up: | ||||||
|  | 
 | ||||||
|  | .. code-block:: shell-session | ||||||
|  | 
 | ||||||
|  |    $ docker-compose run --rm consumer document_importer /export | ||||||
|  | 
 | ||||||
|  | Similarly, if you're not using docker-compose, you can adjust the export | ||||||
|  | instructions above to do the import. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _utilities-retagger: | ||||||
|  | 
 | ||||||
|  | The Re-tagger | ||||||
|  | ------------- | ||||||
|  | 
 | ||||||
|  | Say you've imported a few hundred documents and now want to introduce a tag | ||||||
|  | and apply its matching to all of the currently-imported docs.  This problem is | ||||||
|  | common enough that there's a tool for it. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. _utilities-retagger-howto: | ||||||
|  | 
 | ||||||
|  | How to Use It | ||||||
|  | ............. | ||||||
|  | 
 | ||||||
|  | This too is done via the ``manage.py`` script: | ||||||
|  | 
 | ||||||
| .. code:: bash | .. code:: bash | ||||||
| 
 | 
 | ||||||
|     $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere |     $ /path/to/paperless/src/manage.py document_retagger | ||||||
| 
 | 
 | ||||||
| This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with | That's it.  It'll loop over all of the documents in your database and attempt | ||||||
| as you please.  The naming scheme on export is identical to that used for | to match all of your tags to them.  If one matches, it'll be applied.  And | ||||||
| import, so should you can now safely delete the entire project directly, | don't worry, you can run this as often as you like, it won't double-tag | ||||||
| database, encrypted PDFs and all, and later create it all again simply by | a document. | ||||||
| running the consumer again and dumping all of these files into |  | ||||||
| ``CONSUMPTION_DIR``. |  | ||||||
|  | |||||||
							
								
								
									
										33
									
								
								paperless.conf.example
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								paperless.conf.example
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,33 @@ | |||||||
|  | # Sample paperless.conf | ||||||
|  | # Copy this file to /etc/paperless.conf and modify it to suit your needs. | ||||||
|  | 
 | ||||||
|  | # This is where your documents should go to be consumed.  Make sure that it exists | ||||||
|  | # and that the user running the paperless service can read/write its contents | ||||||
|  | # before you start Paperless. | ||||||
|  | PAPERLESS_CONSUMPTION_DIR="" | ||||||
|  | 
 | ||||||
|  | # These values are required if you want paperless to check a particular email | ||||||
|  | # box every 10 minutes and attempt to consume documents from there.  If you | ||||||
|  | # don't define a HOST, mail checking will just be disabled. | ||||||
|  | PAPERLESS_CONSUME_MAIL_HOST="" | ||||||
|  | PAPERLESS_CONSUME_MAIL_PORT="" | ||||||
|  | PAPERLESS_CONSUME_MAIL_USER="" | ||||||
|  | PAPERLESS_CONSUME_MAIL_PASS="" | ||||||
|  | 
 | ||||||
|  | # You must have a passphrase in order for Paperless to work at all.  If you set | ||||||
|  | # this to "", GnuPG will "encrypt" your PDF by writing it out as a zero-byte | ||||||
|  | # file. | ||||||
|  | # | ||||||
|  | # The passphrase you use here will be used when storing your documents in | ||||||
|  | # Paperless, but you can always export them in an unencrypted format by using | ||||||
|  | # document exporter.  See the documentation for more information. | ||||||
|  | # | ||||||
|  | # One final note about the passphrase.  Once you've consumed a document with | ||||||
|  | # one passphrase, DON'T CHANGE IT.  Paperless assumes this to be a constant and | ||||||
|  | # can't properly export documents that were encrypted with an old passphrase if | ||||||
|  | # you've since changed it to a new one. | ||||||
|  | PAPERLESS_PASSPHRASE="secret" | ||||||
|  | 
 | ||||||
|  | # If you intend to consume documents either via HTTP POST or by email, you must | ||||||
|  | # have a shared secret here. | ||||||
|  | PAPERLESS_SHARED_SECRET="" | ||||||
| @ -1,8 +1,10 @@ | |||||||
| Django==1.9 | Django==1.9.2 | ||||||
| django-extensions==1.6.1 | django-extensions==1.6.1 | ||||||
|  | djangorestframework==3.3.2 | ||||||
|  | python-dotenv==0.3.0 | ||||||
| filemagic==1.6 | filemagic==1.6 | ||||||
| langdetect==1.0.5 | langdetect==1.0.5 | ||||||
| Pillow==3.0.0 | Pillow==3.1.1 | ||||||
| pyocr==0.3.1 | pyocr==0.3.1 | ||||||
| python-dateutil==2.4.2 | python-dateutil==2.4.2 | ||||||
| python-gnupg==0.3.8 | python-gnupg==0.3.8 | ||||||
|  | |||||||
							
								
								
									
										74
									
								
								scripts/docker-entrypoint.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								scripts/docker-entrypoint.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,74 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | set -e | ||||||
|  | 
 | ||||||
|  | # Source: https://github.com/sameersbn/docker-gitlab/ | ||||||
|  | map_uidgid() { | ||||||
|  |     USERMAP_ORIG_UID=$(id -u paperless) | ||||||
|  |     USERMAP_ORIG_GID=$(id -g paperless) | ||||||
|  |     USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}} | ||||||
|  |     USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} | ||||||
|  |     if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then | ||||||
|  |         echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" | ||||||
|  |         groupmod -g ${USERMAP_GID} paperless | ||||||
|  |         sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd | ||||||
|  |     fi | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | set_permissions() { | ||||||
|  |     # Set permissions for consumption directory | ||||||
|  |     chgrp paperless "$PAPERLESS_CONSUMPTION_DIR" | ||||||
|  |     chmod g+x "$PAPERLESS_CONSUMPTION_DIR" | ||||||
|  | 
 | ||||||
|  |     # Set permissions for application directory | ||||||
|  |     chown -Rh paperless:paperless /usr/src/paperless | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | initialize() { | ||||||
|  |     map_uidgid | ||||||
|  |     set_permissions | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | install_languages() { | ||||||
|  |     local langs="$1" | ||||||
|  |     read -ra langs <<<"$langs" | ||||||
|  | 
 | ||||||
|  |     # Check that it is not empty | ||||||
|  |     if [ ${#langs[@]} -eq 0 ]; then | ||||||
|  |         return | ||||||
|  |     fi | ||||||
|  | 
 | ||||||
|  |     # Update apt-lists | ||||||
|  |     apt-get update | ||||||
|  | 
 | ||||||
|  |     # Loop over languages to be installed | ||||||
|  |     for lang in "${langs[@]}"; do | ||||||
|  |         pkg="tesseract-ocr-$lang" | ||||||
|  |         if dpkg -s "$pkg" 2>&1 > /dev/null; then | ||||||
|  |             continue | ||||||
|  |         fi | ||||||
|  | 
 | ||||||
|  |         if ! apt-cache show "$pkg" 2>&1 > /dev/null; then | ||||||
|  |             continue | ||||||
|  |         fi | ||||||
|  | 
 | ||||||
|  |         apt-get install "$pkg" | ||||||
|  |     done | ||||||
|  | 
 | ||||||
|  |     # Remove apt lists | ||||||
|  |     rm -rf /var/lib/apt/lists/* | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | if [[ "$1" != "/"* ]]; then | ||||||
|  |     initialize | ||||||
|  | 
 | ||||||
|  |     # Install additional languages if specified | ||||||
|  |     if [ ! -z "$PAPERLESS_OCR_LANGUAGES"  ]; then | ||||||
|  |         install_languages "$PAPERLESS_OCR_LANGUAGES" | ||||||
|  |     fi | ||||||
|  | 
 | ||||||
|  |     exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@" | ||||||
|  | fi | ||||||
|  | 
 | ||||||
|  | exec "$@" | ||||||
|  | 
 | ||||||
| @ -2,10 +2,9 @@ | |||||||
| Description=Paperless consumer | Description=Paperless consumer | ||||||
| 
 | 
 | ||||||
| [Service] | [Service] | ||||||
| EnvironmentFile=/etc/conf.d/paperless |  | ||||||
| User=paperless | User=paperless | ||||||
| Group=paperless | Group=paperless | ||||||
| ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY | ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer | ||||||
| 
 | 
 | ||||||
| [Install] | [Install] | ||||||
| WantedBy=multi-user.target | WantedBy=multi-user.target | ||||||
|  | |||||||
| @ -2,7 +2,6 @@ | |||||||
| Description=Paperless webserver | Description=Paperless webserver | ||||||
| 
 | 
 | ||||||
| [Service] | [Service] | ||||||
| EnvironmentFile=/etc/conf.d/paperless |  | ||||||
| User=paperless | User=paperless | ||||||
| Group=paperless | Group=paperless | ||||||
| ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000 | ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000 | ||||||
|  | |||||||
| @ -1,13 +1,31 @@ | |||||||
| #!/bin/bash | #!/bin/bash | ||||||
| 
 | 
 | ||||||
| # install packages | # Install packages | ||||||
| sudo apt-get update | apt-get update | ||||||
| sudo apt-get build-dep -y python-imaging | apt-get build-dep -y python-imaging | ||||||
| sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev | apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev | ||||||
| sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git | apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git | ||||||
| sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick | apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper | ||||||
| 
 | 
 | ||||||
| # setup python project | # Python dependencies | ||||||
| pushd /opt/paperless | pip3 install -r /opt/paperless/requirements.txt | ||||||
| sudo pip3 install -r requirements.txt | 
 | ||||||
| popd | # Create the environment file | ||||||
|  | cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf | ||||||
|  | chmod 0640 /etc/paperless.conf | ||||||
|  | chown root:vagrant /etc/paperless.conf | ||||||
|  | 
 | ||||||
|  | # Create the consumption directory | ||||||
|  | mkdir /home/vagrant/consumption | ||||||
|  | chown vagrant:vagrant /home/vagrant/consumption | ||||||
|  | 
 | ||||||
|  | echo " | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | Now follow the remaining steps in the Vagrant section of the setup | ||||||
|  | documentation to complete the process: | ||||||
|  | 
 | ||||||
|  | http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | " | ||||||
|  | |||||||
| @ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group | |||||||
| from django.core.urlresolvers import reverse | from django.core.urlresolvers import reverse | ||||||
| from django.templatetags.static import static | from django.templatetags.static import static | ||||||
| 
 | 
 | ||||||
| from .models import Sender, Tag, Document | from .models import Correspondent, Tag, Document, Log | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MonthListFilter(admin.SimpleListFilter): | class MonthListFilter(admin.SimpleListFilter): | ||||||
| @ -45,39 +45,73 @@ class DocumentAdmin(admin.ModelAdmin): | |||||||
|             "all": ("paperless.css",) |             "all": ("paperless.css",) | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|     search_fields = ("sender__name", "title", "content") |     search_fields = ("correspondent__name", "title", "content") | ||||||
|     list_display = ("created", "sender", "title", "tags_", "document") |     list_display = ("created_", "correspondent", "title", "tags_", "document") | ||||||
|     list_filter = ("tags", "sender", MonthListFilter) |     list_filter = ("tags", "correspondent", MonthListFilter) | ||||||
|     list_per_page = 25 |     list_per_page = 25 | ||||||
| 
 | 
 | ||||||
|  |     def created_(self, obj): | ||||||
|  |         return obj.created.date().strftime("%Y-%m-%d") | ||||||
|  | 
 | ||||||
|     def tags_(self, obj): |     def tags_(self, obj): | ||||||
|         r = "" |         r = "" | ||||||
|         for tag in obj.tags.all(): |         for tag in obj.tags.all(): | ||||||
|             r += '<a class="tag" style="background-color: {};" href="{}">{}</a>'.format( |             colour = tag.get_colour_display() | ||||||
|                 tag.get_colour_display(), |             r += self._html_tag( | ||||||
|                 "{}?tags__id__exact={}".format( |                 "a", | ||||||
|                     reverse("admin:documents_document_changelist"), |                 tag.slug, | ||||||
|                     tag.pk |                 **{ | ||||||
|                 ), |                     "class": "tag", | ||||||
|                 tag.slug |                     "style": "background-color: {};".format(colour), | ||||||
|  |                     "href": "{}?tags__id__exact={}".format( | ||||||
|  |                         reverse("admin:documents_document_changelist"), | ||||||
|  |                         tag.pk | ||||||
|  |                     ) | ||||||
|  |                 } | ||||||
|             ) |             ) | ||||||
|         return r |         return r | ||||||
|     tags_.allow_tags = True |     tags_.allow_tags = True | ||||||
| 
 | 
 | ||||||
|     def document(self, obj): |     def document(self, obj): | ||||||
|         return '<a href="{}">' \ |         return self._html_tag( | ||||||
|                  '<img src="{}" width="22" height="22" alt="{} icon" title="{}">' \ |             "a", | ||||||
|                '</a>'.format( |             self._html_tag( | ||||||
|                     obj.download_url, |                 "img", | ||||||
|                     static("documents/img/{}.png".format(obj.file_type)), |                 src=static("documents/img/{}.png".format(obj.file_type)), | ||||||
|                     obj.file_type, |                 width=22, | ||||||
|                     obj.file_name |                 height=22, | ||||||
|                 ) |                 alt=obj.file_type, | ||||||
|  |                 title=obj.file_name | ||||||
|  |             ), | ||||||
|  |             href=obj.download_url | ||||||
|  |         ) | ||||||
|     document.allow_tags = True |     document.allow_tags = True | ||||||
| 
 | 
 | ||||||
| admin.site.register(Sender) |     @staticmethod | ||||||
|  |     def _html_tag(kind, inside=None, **kwargs): | ||||||
|  | 
 | ||||||
|  |         attributes = [] | ||||||
|  |         for lft, rgt in kwargs.items(): | ||||||
|  |             attributes.append('{}="{}"'.format(lft, rgt)) | ||||||
|  | 
 | ||||||
|  |         if inside is not None: | ||||||
|  |             return "<{kind} {attributes}>{inside}</{kind}>".format( | ||||||
|  |                 kind=kind, attributes=" ".join(attributes), inside=inside) | ||||||
|  | 
 | ||||||
|  |         return "<{} {}/>".format(kind, " ".join(attributes)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class LogAdmin(admin.ModelAdmin): | ||||||
|  | 
 | ||||||
|  |     list_display = ("message", "level", "component") | ||||||
|  |     list_filter = ("level", "component",) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | admin.site.register(Correspondent) | ||||||
| admin.site.register(Tag, TagAdmin) | admin.site.register(Tag, TagAdmin) | ||||||
| admin.site.register(Document, DocumentAdmin) | admin.site.register(Document, DocumentAdmin) | ||||||
|  | admin.site.register(Log, LogAdmin) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # Unless we implement multi-user, these default registrations don't make sense. | # Unless we implement multi-user, these default registrations don't make sense. | ||||||
| admin.site.unregister(Group) | admin.site.unregister(Group) | ||||||
|  | |||||||
| @ -1,5 +1,8 @@ | |||||||
| import datetime | import datetime | ||||||
|  | import logging | ||||||
| import tempfile | import tempfile | ||||||
|  | import uuid | ||||||
|  | 
 | ||||||
| from multiprocessing.pool import Pool | from multiprocessing.pool import Pool | ||||||
| 
 | 
 | ||||||
| import itertools | import itertools | ||||||
| @ -17,20 +20,14 @@ from PIL import Image | |||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from django.template.defaultfilters import slugify | from django.template.defaultfilters import slugify | ||||||
|  | from pyocr.tesseract import TesseractError | ||||||
| 
 | 
 | ||||||
| from logger.models import Log |  | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| 
 | 
 | ||||||
| from .models import Sender, Tag, Document | from .models import Correspondent, Tag, Document, Log | ||||||
| from .languages import ISO639 | from .languages import ISO639 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def image_to_string(args): |  | ||||||
|     self, png, lang = args |  | ||||||
|     with Image.open(os.path.join(self.SCRATCH, png)) as f: |  | ||||||
|         return self.OCR.image_to_string(f, lang=lang) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class OCRError(Exception): | class OCRError(Exception): | ||||||
|     pass |     pass | ||||||
| 
 | 
 | ||||||
| @ -42,8 +39,8 @@ class ConsumerError(Exception): | |||||||
| class Consumer(object): | class Consumer(object): | ||||||
|     """ |     """ | ||||||
|     Loop over every file found in CONSUMPTION_DIR and: |     Loop over every file found in CONSUMPTION_DIR and: | ||||||
|       1. Convert it to a greyscale png |       1. Convert it to a greyscale pnm | ||||||
|       2. Use tesseract on the png |       2. Use tesseract on the pnm | ||||||
|       3. Encrypt and store the document in the MEDIA_ROOT |       3. Encrypt and store the document in the MEDIA_ROOT | ||||||
|       4. Store the OCR'd text in the database |       4. Store the OCR'd text in the database | ||||||
|       5. Delete the document and image(s) |       5. Delete the document and image(s) | ||||||
| @ -51,28 +48,29 @@ class Consumer(object): | |||||||
| 
 | 
 | ||||||
|     SCRATCH = settings.SCRATCH_DIR |     SCRATCH = settings.SCRATCH_DIR | ||||||
|     CONVERT = settings.CONVERT_BINARY |     CONVERT = settings.CONVERT_BINARY | ||||||
|  |     UNPAPER = settings.UNPAPER_BINARY | ||||||
|     CONSUME = settings.CONSUMPTION_DIR |     CONSUME = settings.CONSUMPTION_DIR | ||||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None |     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||||
| 
 | 
 | ||||||
|     OCR = pyocr.get_available_tools()[0] |  | ||||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE |     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||||
| 
 | 
 | ||||||
|     REGEX_TITLE = re.compile( |     REGEX_TITLE = re.compile( | ||||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", |         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||||
|         flags=re.IGNORECASE |         flags=re.IGNORECASE | ||||||
|     ) |     ) | ||||||
|     REGEX_SENDER_TITLE = re.compile( |     REGEX_CORRESPONDENT_TITLE = re.compile( | ||||||
|         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", |         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||||
|         flags=re.IGNORECASE |         flags=re.IGNORECASE | ||||||
|     ) |     ) | ||||||
|     REGEX_SENDER_TITLE_TAGS = re.compile( |     REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( | ||||||
|         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", |         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||||
|         flags=re.IGNORECASE |         flags=re.IGNORECASE | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, verbosity=1): |     def __init__(self): | ||||||
| 
 | 
 | ||||||
|         self.verbosity = verbosity |         self.logger = logging.getLogger(__name__) | ||||||
|  |         self.logging_group = None | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             os.makedirs(self.SCRATCH) |             os.makedirs(self.SCRATCH) | ||||||
| @ -92,6 +90,12 @@ class Consumer(object): | |||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
|                 "Consumption directory {} does not exist".format(self.CONSUME)) |                 "Consumption directory {} does not exist".format(self.CONSUME)) | ||||||
| 
 | 
 | ||||||
|  |     def log(self, level, message): | ||||||
|  |         getattr(self.logger, level)(message, extra={ | ||||||
|  |             "group": self.logging_group, | ||||||
|  |             "component": Log.COMPONENT_CONSUMER | ||||||
|  |         }) | ||||||
|  | 
 | ||||||
|     def consume(self): |     def consume(self): | ||||||
| 
 | 
 | ||||||
|         for doc in os.listdir(self.CONSUME): |         for doc in os.listdir(self.CONSUME): | ||||||
| @ -110,122 +114,156 @@ class Consumer(object): | |||||||
|             if self._is_ready(doc): |             if self._is_ready(doc): | ||||||
|                 continue |                 continue | ||||||
| 
 | 
 | ||||||
|             Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) |             self.logging_group = uuid.uuid4() | ||||||
|  | 
 | ||||||
|  |             self.log("info", "Consuming {}".format(doc)) | ||||||
| 
 | 
 | ||||||
|             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) |             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) | ||||||
|             pngs = self._get_greyscale(tempdir, doc) |             imgs = self._get_greyscale(tempdir, doc) | ||||||
|  |             thumbnail = self._get_thumbnail(tempdir, doc) | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|                 text = self._get_ocr(pngs) |                 text = self._get_ocr(imgs) | ||||||
|                 self._store(text, doc) |                 self._store(text, doc, thumbnail) | ||||||
|             except OCRError: |             except OCRError as e: | ||||||
|                 self._ignore.append(doc) |                 self._ignore.append(doc) | ||||||
|                 Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) |                 self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) | ||||||
|  |                 self._cleanup_tempdir(tempdir) | ||||||
|                 continue |                 continue | ||||||
|             finally: |             else: | ||||||
|                 self._cleanup(tempdir, doc) |                 self._cleanup_tempdir(tempdir) | ||||||
|  |                 self._cleanup_doc(doc) | ||||||
| 
 | 
 | ||||||
|     def _get_greyscale(self, tempdir, doc): |     def _get_greyscale(self, tempdir, doc): | ||||||
|  |         """ | ||||||
|  |         Greyscale images are easier for Tesseract to OCR | ||||||
|  |         """ | ||||||
| 
 | 
 | ||||||
|         Log.debug( |         self.log("info", "Generating greyscale image from {}".format(doc)) | ||||||
|             "Generating greyscale image from {}".format(doc), |  | ||||||
|             Log.COMPONENT_CONSUMER |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|         png = os.path.join(tempdir, "convert-%04d.jpg") |  | ||||||
| 
 | 
 | ||||||
|  |         # Convert PDF to multiple PNMs | ||||||
|  |         pnm = os.path.join(tempdir, "convert-%04d.pnm") | ||||||
|         subprocess.Popen(( |         subprocess.Popen(( | ||||||
|             self.CONVERT, "-density", "300", "-depth", "8", |             self.CONVERT, "-density", "300", "-depth", "8", | ||||||
|             "-type", "grayscale", doc, png |             "-type", "grayscale", doc, pnm | ||||||
|         )).wait() |         )).wait() | ||||||
| 
 | 
 | ||||||
|         pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")] |         # Get a list of converted images | ||||||
|         return sorted(filter(lambda f: os.path.isfile(f), pngs)) |         pnms = [] | ||||||
|  |         for f in os.listdir(tempdir): | ||||||
|  |             if f.endswith(".pnm"): | ||||||
|  |                 pnms.append(os.path.join(tempdir, f)) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |         # Run unpaper in parallel on converted images | ||||||
|     def _guess_language(text): |         with Pool(processes=self.THREADS) as pool: | ||||||
|  |             pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) | ||||||
|  | 
 | ||||||
|  |         # Return list of converted images, processed with unpaper | ||||||
|  |         pnms = [] | ||||||
|  |         for f in os.listdir(tempdir): | ||||||
|  |             if f.endswith(".unpaper.pnm"): | ||||||
|  |                 pnms.append(os.path.join(tempdir, f)) | ||||||
|  | 
 | ||||||
|  |         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||||
|  | 
 | ||||||
|  |     def _get_thumbnail(self, tempdir, doc): | ||||||
|  |         """ | ||||||
|  |         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         self.log("info", "Generating the thumbnail") | ||||||
|  | 
 | ||||||
|  |         subprocess.Popen(( | ||||||
|  |             self.CONVERT, | ||||||
|  |             "-scale", "500x5000", | ||||||
|  |             "-alpha", "remove", | ||||||
|  |             doc, | ||||||
|  |             os.path.join(tempdir, "convert-%04d.png") | ||||||
|  |         )).wait() | ||||||
|  | 
 | ||||||
|  |         return os.path.join(tempdir, "convert-0000.png") | ||||||
|  | 
 | ||||||
|  |     def _guess_language(self, text): | ||||||
|         try: |         try: | ||||||
|             guess = langdetect.detect(text) |             guess = langdetect.detect(text) | ||||||
|             Log.debug( |             self.log("debug", "Language detected: {}".format(guess)) | ||||||
|                 "Language detected: {}".format(guess), |  | ||||||
|                 Log.COMPONENT_CONSUMER |  | ||||||
|             ) |  | ||||||
|             return guess |             return guess | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             Log.warning( |             self.log("warning", "Language detection error: {}".format(e)) | ||||||
|                 "Language detection error: {}".format(e), Log.COMPONENT_MAIL) |  | ||||||
| 
 | 
 | ||||||
|     def _get_ocr(self, pngs): |     def _get_ocr(self, imgs): | ||||||
|         """ |         """ | ||||||
|         Attempts to do the best job possible OCR'ing the document based on |         Attempts to do the best job possible OCR'ing the document based on | ||||||
|         simple language detection trial & error. |         simple language detection trial & error. | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         if not pngs: |         if not imgs: | ||||||
|             raise OCRError |             raise OCRError("No images found") | ||||||
| 
 | 
 | ||||||
|         Log.debug("OCRing the document", Log.COMPONENT_CONSUMER) |         self.log("info", "OCRing the document") | ||||||
| 
 | 
 | ||||||
|         # Since the division gets rounded down by int, this calculation works |         # Since the division gets rounded down by int, this calculation works | ||||||
|         # for every edge-case, i.e. 1 |         # for every edge-case, i.e. 1 | ||||||
|         middle = int(len(pngs) / 2) |         middle = int(len(imgs) / 2) | ||||||
|         raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE) |         raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) | ||||||
| 
 | 
 | ||||||
|         guessed_language = self._guess_language(raw_text) |         guessed_language = self._guess_language(raw_text) | ||||||
| 
 | 
 | ||||||
|         if not guessed_language or guessed_language not in ISO639: |         if not guessed_language or guessed_language not in ISO639: | ||||||
|             Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER) |             self.log("warning", "Language detection failed!") | ||||||
|             if settings.FORGIVING_OCR: |             if settings.FORGIVING_OCR: | ||||||
|                 Log.warning( |                 self.log( | ||||||
|                     "As FORGIVING_OCR is enabled, we're going to make the best " |                     "warning", | ||||||
|                     "with what we have.", |                     "As FORGIVING_OCR is enabled, we're going to make the " | ||||||
|                     Log.COMPONENT_CONSUMER |                     "best with what we have." | ||||||
|                 ) |                 ) | ||||||
|                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) |                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||||
|                 return raw_text |                 return raw_text | ||||||
|             raise OCRError |             raise OCRError("Language detection failed") | ||||||
| 
 | 
 | ||||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: |         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||||
|             raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) |             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||||
|             return raw_text |             return raw_text | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             return self._ocr(pngs, ISO639[guessed_language]) |             return self._ocr(imgs, ISO639[guessed_language]) | ||||||
|         except pyocr.pyocr.tesseract.TesseractError: |         except pyocr.pyocr.tesseract.TesseractError: | ||||||
|             if settings.FORGIVING_OCR: |             if settings.FORGIVING_OCR: | ||||||
|                 Log.warning( |                 self.log( | ||||||
|  |                     "warning", | ||||||
|                     "OCR for {} failed, but we're going to stick with what " |                     "OCR for {} failed, but we're going to stick with what " | ||||||
|                     "we've got since FORGIVING_OCR is enabled.".format( |                     "we've got since FORGIVING_OCR is enabled.".format( | ||||||
|                         guessed_language |                         guessed_language | ||||||
|                     ), |                     ) | ||||||
|                     Log.COMPONENT_CONSUMER |  | ||||||
|                 ) |                 ) | ||||||
|                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) |                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||||
|                 return raw_text |                 return raw_text | ||||||
|             raise OCRError |             raise OCRError( | ||||||
|  |                 "The guessed language is not available in this instance of " | ||||||
|  |                 "Tesseract." | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|     def _assemble_ocr_sections(self, pngs, middle, text): |     def _assemble_ocr_sections(self, imgs, middle, text): | ||||||
|         """ |         """ | ||||||
|         Given a `middle` value and the text that middle page represents, we OCR |         Given a `middle` value and the text that middle page represents, we OCR | ||||||
|         the remainder of the document and return the whole thing. |         the remainder of the document and return the whole thing. | ||||||
|         """ |         """ | ||||||
|         text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text |         text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text | ||||||
|         text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE) |         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||||
|         return text |         return text | ||||||
| 
 | 
 | ||||||
|     def _ocr(self, pngs, lang): |     def _ocr(self, imgs, lang): | ||||||
|         """ |         """ | ||||||
|         Performs a single OCR attempt. |         Performs a single OCR attempt. | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         if not pngs: |         if not imgs: | ||||||
|             return "" |             return "" | ||||||
| 
 | 
 | ||||||
|         Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER) |         self.log("info", "Parsing for {}".format(lang)) | ||||||
| 
 | 
 | ||||||
|         with Pool(processes=self.THREADS) as pool: |         with Pool(processes=self.THREADS) as pool: | ||||||
|             r = pool.map( |             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||||
|                 image_to_string, itertools.product([self], pngs, [lang])) |  | ||||||
|             r = " ".join(r) |             r = " ".join(r) | ||||||
| 
 | 
 | ||||||
|         # Strip out excess white space to allow matching to go smoother |         # Strip out excess white space to allow matching to go smoother | ||||||
| @ -233,16 +271,18 @@ class Consumer(object): | |||||||
| 
 | 
 | ||||||
|     def _guess_attributes_from_name(self, parseable): |     def _guess_attributes_from_name(self, parseable): | ||||||
|         """ |         """ | ||||||
|         We use a crude naming convention to make handling the sender, title, and |         We use a crude naming convention to make handling the correspondent, | ||||||
|         tags easier: |         title, and tags easier: | ||||||
|           "<sender> - <title> - <tags>.<suffix>" |           "<correspondent> - <title> - <tags>.<suffix>" | ||||||
|           "<sender> - <title>.<suffix>" |           "<correspondent> - <title>.<suffix>" | ||||||
|           "<title>.<suffix>" |           "<title>.<suffix>" | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         def get_sender(sender_name): |         def get_correspondent(correspondent_name): | ||||||
|             return Sender.objects.get_or_create( |             return Correspondent.objects.get_or_create( | ||||||
|                 name=sender_name, defaults={"slug": slugify(sender_name)})[0] |                 name=correspondent_name, | ||||||
|  |                 defaults={"slug": slugify(correspondent_name)} | ||||||
|  |             )[0] | ||||||
| 
 | 
 | ||||||
|         def get_tags(tags): |         def get_tags(tags): | ||||||
|             r = [] |             r = [] | ||||||
| @ -251,40 +291,47 @@ class Consumer(object): | |||||||
|                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) |                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) | ||||||
|             return tuple(r) |             return tuple(r) | ||||||
| 
 | 
 | ||||||
|         # First attempt: "<sender> - <title> - <tags>.<suffix>" |         def get_suffix(suffix): | ||||||
|         m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable) |             suffix = suffix.lower() | ||||||
|  |             if suffix == "jpeg": | ||||||
|  |                 return "jpg" | ||||||
|  |             return suffix | ||||||
|  | 
 | ||||||
|  |         # First attempt: "<correspondent> - <title> - <tags>.<suffix>" | ||||||
|  |         m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) | ||||||
|         if m: |         if m: | ||||||
|             return ( |             return ( | ||||||
|                 get_sender(m.group(1)), |                 get_correspondent(m.group(1)), | ||||||
|                 m.group(2), |                 m.group(2), | ||||||
|                 get_tags(m.group(3)), |                 get_tags(m.group(3)), | ||||||
|                 m.group(4) |                 get_suffix(m.group(4)) | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|         # Second attempt: "<sender> - <title>.<suffix>" |         # Second attempt: "<correspondent> - <title>.<suffix>" | ||||||
|         m = re.match(self.REGEX_SENDER_TITLE, parseable) |         m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) | ||||||
|         if m: |         if m: | ||||||
|             return get_sender(m.group(1)), m.group(2), (), m.group(3) |             return ( | ||||||
|  |                 get_correspondent(m.group(1)), | ||||||
|  |                 m.group(2), | ||||||
|  |                 (), | ||||||
|  |                 get_suffix(m.group(3)) | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|         # That didn't work, so we assume sender and tags are None |         # That didn't work, so we assume correspondent and tags are None | ||||||
|         m = re.match(self.REGEX_TITLE, parseable) |         m = re.match(self.REGEX_TITLE, parseable) | ||||||
|         return None, m.group(1), (), m.group(2) |         return None, m.group(1), (), get_suffix(m.group(2)) | ||||||
| 
 | 
 | ||||||
|     def _store(self, text, doc): |     def _store(self, text, doc, thumbnail): | ||||||
| 
 | 
 | ||||||
|         sender, title, tags, file_type = self._guess_attributes_from_name(doc) |         sender, title, tags, file_type = self._guess_attributes_from_name(doc) | ||||||
|         tags = list(tags) |         relevant_tags = set(list(Tag.match_all(text)) + list(tags)) | ||||||
| 
 |  | ||||||
|         lower_text = text.lower() |  | ||||||
|         relevant_tags = set( |  | ||||||
|             [t for t in Tag.objects.all() if t.matches(lower_text)] + tags) |  | ||||||
| 
 | 
 | ||||||
|         stats = os.stat(doc) |         stats = os.stat(doc) | ||||||
| 
 | 
 | ||||||
|         Log.debug("Saving record to database", Log.COMPONENT_CONSUMER) |         self.log("debug", "Saving record to database") | ||||||
| 
 | 
 | ||||||
|         document = Document.objects.create( |         document = Document.objects.create( | ||||||
|             sender=sender, |             correspondent=sender, | ||||||
|             title=title, |             title=title, | ||||||
|             content=text, |             content=text, | ||||||
|             file_type=file_type, |             file_type=file_type, | ||||||
| @ -296,22 +343,29 @@ class Consumer(object): | |||||||
| 
 | 
 | ||||||
|         if relevant_tags: |         if relevant_tags: | ||||||
|             tag_names = ", ".join([t.slug for t in relevant_tags]) |             tag_names = ", ".join([t.slug for t in relevant_tags]) | ||||||
|             Log.debug( |             self.log("debug", "Tagging with {}".format(tag_names)) | ||||||
|                 "Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER) |  | ||||||
|             document.tags.add(*relevant_tags) |             document.tags.add(*relevant_tags) | ||||||
| 
 | 
 | ||||||
|  |         # Encrypt and store the actual document | ||||||
|         with open(doc, "rb") as unencrypted: |         with open(doc, "rb") as unencrypted: | ||||||
|             with open(document.source_path, "wb") as encrypted: |             with open(document.source_path, "wb") as encrypted: | ||||||
|                 Log.debug("Encrypting", Log.COMPONENT_CONSUMER) |                 self.log("debug", "Encrypting the document") | ||||||
|                 encrypted.write(GnuPG.encrypted(unencrypted)) |                 encrypted.write(GnuPG.encrypted(unencrypted)) | ||||||
| 
 | 
 | ||||||
|     def _cleanup(self, tempdir, doc): |         # Encrypt and store the thumbnail | ||||||
|         # Remove temporary directory recursively |         with open(thumbnail, "rb") as unencrypted: | ||||||
|         Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER) |             with open(document.thumbnail_path, "wb") as encrypted: | ||||||
|         shutil.rmtree(tempdir) |                 self.log("debug", "Encrypting the thumbnail") | ||||||
|  |                 encrypted.write(GnuPG.encrypted(unencrypted)) | ||||||
| 
 | 
 | ||||||
|         # Remove doc |         self.log("info", "Completed") | ||||||
|         Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) | 
 | ||||||
|  |     def _cleanup_tempdir(self, d): | ||||||
|  |         self.log("debug", "Deleting directory {}".format(d)) | ||||||
|  |         shutil.rmtree(d) | ||||||
|  | 
 | ||||||
|  |     def _cleanup_doc(self, doc): | ||||||
|  |         self.log("debug", "Deleting document {}".format(doc)) | ||||||
|         os.unlink(doc) |         os.unlink(doc) | ||||||
| 
 | 
 | ||||||
|     def _is_ready(self, doc): |     def _is_ready(self, doc): | ||||||
| @ -329,3 +383,23 @@ class Consumer(object): | |||||||
|         self.stats[doc] = t |         self.stats[doc] = t | ||||||
| 
 | 
 | ||||||
|         return False |         return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def image_to_string(args): | ||||||
|  |     img, lang = args | ||||||
|  |     ocr = pyocr.get_available_tools()[0] | ||||||
|  |     with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: | ||||||
|  |         if ocr.can_detect_orientation(): | ||||||
|  |             try: | ||||||
|  |                 orientation = ocr.detect_orientation(f, lang=lang) | ||||||
|  |                 f = f.rotate(orientation["angle"], expand=1) | ||||||
|  |             except TesseractError: | ||||||
|  |                 pass | ||||||
|  |         return ocr.image_to_string(f, lang=lang) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def run_unpaper(args): | ||||||
|  |     unpaper, pnm = args | ||||||
|  |     subprocess.Popen(( | ||||||
|  |         unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm") | ||||||
|  |     )).wait() | ||||||
|  | |||||||
| @ -8,13 +8,13 @@ from time import mktime | |||||||
| from django import forms | from django import forms | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| 
 | 
 | ||||||
| from .models import Document, Sender | from .models import Document, Correspondent | ||||||
| from .consumer import Consumer | from .consumer import Consumer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class UploadForm(forms.Form): | class UploadForm(forms.Form): | ||||||
| 
 | 
 | ||||||
|     SECRET = settings.UPLOAD_SHARED_SECRET |     SECRET = settings.SHARED_SECRET | ||||||
|     TYPE_LOOKUP = { |     TYPE_LOOKUP = { | ||||||
|         "application/pdf": Document.TYPE_PDF, |         "application/pdf": Document.TYPE_PDF, | ||||||
|         "image/png": Document.TYPE_PNG, |         "image/png": Document.TYPE_PNG, | ||||||
| @ -23,31 +23,36 @@ class UploadForm(forms.Form): | |||||||
|         "image/tiff": Document.TYPE_TIF, |         "image/tiff": Document.TYPE_TIF, | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     sender = forms.CharField( |     correspondent = forms.CharField( | ||||||
|         max_length=Sender._meta.get_field("name").max_length, required=False) |         max_length=Correspondent._meta.get_field("name").max_length, | ||||||
|  |         required=False | ||||||
|  |     ) | ||||||
|     title = forms.CharField( |     title = forms.CharField( | ||||||
|         max_length=Document._meta.get_field("title").max_length, required=False) |         max_length=Document._meta.get_field("title").max_length, | ||||||
|  |         required=False | ||||||
|  |     ) | ||||||
|     document = forms.FileField() |     document = forms.FileField() | ||||||
|     signature = forms.CharField(max_length=256) |     signature = forms.CharField(max_length=256) | ||||||
| 
 | 
 | ||||||
|     def clean_sender(self): |     def clean_correspondent(self): | ||||||
|         """ |         """ | ||||||
|         I suppose it might look cleaner to use .get_or_create() here, but that |         I suppose it might look cleaner to use .get_or_create() here, but that | ||||||
|         would also allow someone to fill up the db with bogus senders before all |         would also allow someone to fill up the db with bogus correspondents | ||||||
|         validation was met. |         before all validation was met. | ||||||
|         """ |         """ | ||||||
|         sender = self.cleaned_data.get("sender") |         corresp = self.cleaned_data.get("correspondent") | ||||||
|         if not sender: |         if not corresp: | ||||||
|             return None |             return None | ||||||
|         if not Sender.SAFE_REGEX.match(sender) or " - " in sender: |         if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp: | ||||||
|             raise forms.ValidationError("That sender name is suspicious.") |             raise forms.ValidationError( | ||||||
|         return sender |                 "That correspondent name is suspicious.") | ||||||
|  |         return corresp | ||||||
| 
 | 
 | ||||||
|     def clean_title(self): |     def clean_title(self): | ||||||
|         title = self.cleaned_data.get("title") |         title = self.cleaned_data.get("title") | ||||||
|         if not title: |         if not title: | ||||||
|             return None |             return None | ||||||
|         if not Sender.SAFE_REGEX.match(title) or " - " in title: |         if not Correspondent.SAFE_REGEX.match(title) or " - " in title: | ||||||
|             raise forms.ValidationError("That title is suspicious.") |             raise forms.ValidationError("That title is suspicious.") | ||||||
| 
 | 
 | ||||||
|     def clean_document(self): |     def clean_document(self): | ||||||
| @ -59,10 +64,10 @@ class UploadForm(forms.Form): | |||||||
|         return document, self.TYPE_LOOKUP[file_type] |         return document, self.TYPE_LOOKUP[file_type] | ||||||
| 
 | 
 | ||||||
|     def clean(self): |     def clean(self): | ||||||
|         sender = self.clened_data("sender") |         corresp = self.clened_data("correspondent") | ||||||
|         title = self.cleaned_data("title") |         title = self.cleaned_data("title") | ||||||
|         signature = self.cleaned_data("signature") |         signature = self.cleaned_data("signature") | ||||||
|         if sha256(sender + title + self.SECRET).hexdigest() == signature: |         if sha256(corresp + title + self.SECRET).hexdigest() == signature: | ||||||
|             return True |             return True | ||||||
|         return False |         return False | ||||||
| 
 | 
 | ||||||
| @ -73,13 +78,15 @@ class UploadForm(forms.Form): | |||||||
|         form do that as well.  Think of it as a poor-man's queue server. |         form do that as well.  Think of it as a poor-man's queue server. | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         sender = self.clened_data("sender") |         correspondent = self.clened_data("correspondent") | ||||||
|         title = self.cleaned_data("title") |         title = self.cleaned_data("title") | ||||||
|         document, file_type = self.cleaned_data.get("document") |         document, file_type = self.cleaned_data.get("document") | ||||||
| 
 | 
 | ||||||
|         t = int(mktime(datetime.now())) |         t = int(mktime(datetime.now())) | ||||||
|         file_name = os.path.join( |         file_name = os.path.join( | ||||||
|             Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type)) |             Consumer.CONSUME, | ||||||
|  |             "{} - {}.{}".format(correspondent, title, file_type) | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         with open(file_name, "wb") as f: |         with open(file_name, "wb") as f: | ||||||
|             f.write(document) |             f.write(document) | ||||||
|  | |||||||
| @ -185,10 +185,10 @@ ISO639 = { | |||||||
|     "yo": "yor", |     "yo": "yor", | ||||||
|     "za": "zha", |     "za": "zha", | ||||||
| 
 | 
 | ||||||
|     # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I have |     # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I | ||||||
|     # no idea which one is better, so I just picked the bigger file. |     # have no idea which one is better, so I just picked the bigger file. | ||||||
|     "zh": "chi_tra", |     "zh": "chi_tra", | ||||||
| 
 | 
 | ||||||
|     "zu": "zul" |     "zu": "zul" | ||||||
| 
 | 
 | ||||||
| } | } | ||||||
|  | |||||||
							
								
								
									
										30
									
								
								src/documents/loggers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								src/documents/loggers.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,30 @@ | |||||||
|  | import logging | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class PaperlessLogger(logging.StreamHandler): | ||||||
|  |     """ | ||||||
|  |     A logger smart enough to know to log some kinds of messages to the database | ||||||
|  |     for later retrieval in a pretty interface. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     def emit(self, record): | ||||||
|  | 
 | ||||||
|  |         logging.StreamHandler.emit(self, record) | ||||||
|  | 
 | ||||||
|  |         if not hasattr(record, "component"): | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         # We have to do the import here or Django will barf when it tries to | ||||||
|  |         # load this because the apps aren't loaded at that point | ||||||
|  |         from .models import Log | ||||||
|  | 
 | ||||||
|  |         kwargs = { | ||||||
|  |             "message": record.msg, | ||||||
|  |             "component": record.component, | ||||||
|  |             "level": record.levelno, | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         if hasattr(record, "group"): | ||||||
|  |             kwargs["group"] = record.group | ||||||
|  | 
 | ||||||
|  |         Log.objects.create(**kwargs) | ||||||
| @ -1,8 +1,10 @@ | |||||||
| import datetime | import datetime | ||||||
| import imaplib | import imaplib | ||||||
|  | import logging | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import time | import time | ||||||
|  | import uuid | ||||||
| 
 | 
 | ||||||
| from base64 import b64decode | from base64 import b64decode | ||||||
| from email import policy | from email import policy | ||||||
| @ -11,10 +13,8 @@ from dateutil import parser | |||||||
| 
 | 
 | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| 
 | 
 | ||||||
| from logger.models import Log |  | ||||||
| 
 |  | ||||||
| from .consumer import Consumer | from .consumer import Consumer | ||||||
| from .models import Sender | from .models import Correspondent, Log | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MailFetcherError(Exception): | class MailFetcherError(Exception): | ||||||
| @ -25,21 +25,34 @@ class InvalidMessageError(Exception): | |||||||
|     pass |     pass | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Message(object): | class Loggable(object): | ||||||
|  | 
 | ||||||
|  |     def __init__(self, group=None): | ||||||
|  |         self.logger = logging.getLogger(__name__) | ||||||
|  |         self.logging_group = group or uuid.uuid4() | ||||||
|  | 
 | ||||||
|  |     def log(self, level, message): | ||||||
|  |         getattr(self.logger, level)(message, extra={ | ||||||
|  |             "group": self.logging_group, | ||||||
|  |             "component": Log.COMPONENT_MAIL | ||||||
|  |         }) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Message(Loggable): | ||||||
|     """ |     """ | ||||||
|     A crude, but simple email message class.  We assume that there's a subject |     A crude, but simple email message class.  We assume that there's a subject | ||||||
|     and n attachments, and that we don't care about the message body. |     and n attachments, and that we don't care about the message body. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     SECRET = settings.UPLOAD_SHARED_SECRET |     SECRET = settings.SHARED_SECRET | ||||||
| 
 | 
 | ||||||
|     def __init__(self, data, verbosity=1): |     def __init__(self, data, group=None): | ||||||
|         """ |         """ | ||||||
|         Cribbed heavily from |         Cribbed heavily from | ||||||
|         https://www.ianlewis.org/en/parsing-email-attachments-python |         https://www.ianlewis.org/en/parsing-email-attachments-python | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         self.verbosity = verbosity |         Loggable.__init__(self, group=group) | ||||||
| 
 | 
 | ||||||
|         self.subject = None |         self.subject = None | ||||||
|         self.time = None |         self.time = None | ||||||
| @ -54,8 +67,7 @@ class Message(object): | |||||||
| 
 | 
 | ||||||
|         self._set_time(message) |         self._set_time(message) | ||||||
| 
 | 
 | ||||||
|         Log.info( |         self.log("info", 'Importing email: "{}"'.format(self.subject)) | ||||||
|             'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL) |  | ||||||
| 
 | 
 | ||||||
|         attachments = [] |         attachments = [] | ||||||
|         for part in message.walk(): |         for part in message.walk(): | ||||||
| @ -91,7 +103,7 @@ class Message(object): | |||||||
|     def check_subject(self): |     def check_subject(self): | ||||||
|         if self.subject is None: |         if self.subject is None: | ||||||
|             raise InvalidMessageError("Message does not have a subject") |             raise InvalidMessageError("Message does not have a subject") | ||||||
|         if not Sender.SAFE_REGEX.match(self.subject): |         if not Correspondent.SAFE_REGEX.match(self.subject): | ||||||
|             raise InvalidMessageError("Message subject is unsafe: {}".format( |             raise InvalidMessageError("Message subject is unsafe: {}".format( | ||||||
|                 self.subject)) |                 self.subject)) | ||||||
| 
 | 
 | ||||||
| @ -134,9 +146,11 @@ class Attachment(object): | |||||||
|         return self.data |         return self.data | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MailFetcher(object): | class MailFetcher(Loggable): | ||||||
| 
 | 
 | ||||||
|     def __init__(self, verbosity=1): |     def __init__(self): | ||||||
|  | 
 | ||||||
|  |         Loggable.__init__(self) | ||||||
| 
 | 
 | ||||||
|         self._connection = None |         self._connection = None | ||||||
|         self._host = settings.MAIL_CONSUMPTION["HOST"] |         self._host = settings.MAIL_CONSUMPTION["HOST"] | ||||||
| @ -148,7 +162,6 @@ class MailFetcher(object): | |||||||
|         self._enabled = bool(self._host) |         self._enabled = bool(self._host) | ||||||
| 
 | 
 | ||||||
|         self.last_checked = datetime.datetime.now() |         self.last_checked = datetime.datetime.now() | ||||||
|         self.verbosity = verbosity |  | ||||||
| 
 | 
 | ||||||
|     def pull(self): |     def pull(self): | ||||||
|         """ |         """ | ||||||
| @ -159,14 +172,14 @@ class MailFetcher(object): | |||||||
| 
 | 
 | ||||||
|         if self._enabled: |         if self._enabled: | ||||||
| 
 | 
 | ||||||
|             Log.info("Checking mail", Log.COMPONENT_MAIL) |             # Reset the grouping id for each fetch | ||||||
|  |             self.logging_group = uuid.uuid4() | ||||||
|  | 
 | ||||||
|  |             self.log("debug", "Checking mail") | ||||||
| 
 | 
 | ||||||
|             for message in self._get_messages(): |             for message in self._get_messages(): | ||||||
| 
 | 
 | ||||||
|                 Log.debug( |                 self.log("info", 'Storing email: "{}"'.format(message.subject)) | ||||||
|                     'Storing email: "{}"'.format(message.subject), |  | ||||||
|                     Log.COMPONENT_MAIL |  | ||||||
|                 ) |  | ||||||
| 
 | 
 | ||||||
|                 t = int(time.mktime(message.time.timetuple())) |                 t = int(time.mktime(message.time.timetuple())) | ||||||
|                 file_name = os.path.join(Consumer.CONSUME, message.file_name) |                 file_name = os.path.join(Consumer.CONSUME, message.file_name) | ||||||
| @ -193,7 +206,7 @@ class MailFetcher(object): | |||||||
|             self._connection.logout() |             self._connection.logout() | ||||||
| 
 | 
 | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             Log.error(e, Log.COMPONENT_MAIL) |             self.log("error", str(e)) | ||||||
| 
 | 
 | ||||||
|         return r |         return r | ||||||
| 
 | 
 | ||||||
| @ -218,9 +231,9 @@ class MailFetcher(object): | |||||||
| 
 | 
 | ||||||
|             message = None |             message = None | ||||||
|             try: |             try: | ||||||
|                 message = Message(data[0][1], self.verbosity) |                 message = Message(data[0][1], self.logging_group) | ||||||
|             except InvalidMessageError as e: |             except InvalidMessageError as e: | ||||||
|                 Log.error(e, Log.COMPONENT_MAIL) |                 self.log("error", str(e)) | ||||||
|             else: |             else: | ||||||
|                 self._connection.store(num, "+FLAGS", "\\Deleted") |                 self._connection.store(num, "+FLAGS", "\\Deleted") | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,10 +1,12 @@ | |||||||
| import datetime | import datetime | ||||||
|  | import logging | ||||||
| import os | import os | ||||||
| import time | import time | ||||||
| 
 | 
 | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.core.management.base import BaseCommand, CommandError | from django.core.management.base import BaseCommand, CommandError | ||||||
| 
 | 
 | ||||||
|  | from ...models import Log | ||||||
| from ...consumer import Consumer, ConsumerError | from ...consumer import Consumer, ConsumerError | ||||||
| from ...mail import MailFetcher, MailFetcherError | from ...mail import MailFetcher, MailFetcherError | ||||||
| 
 | 
 | ||||||
| @ -34,7 +36,7 @@ class Command(BaseCommand): | |||||||
|         self.verbosity = options["verbosity"] |         self.verbosity = options["verbosity"] | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             self.file_consumer = Consumer(verbosity=self.verbosity) |             self.file_consumer = Consumer() | ||||||
|             self.mail_fetcher = MailFetcher() |             self.mail_fetcher = MailFetcher() | ||||||
|         except (ConsumerError, MailFetcherError) as e: |         except (ConsumerError, MailFetcherError) as e: | ||||||
|             raise CommandError(e) |             raise CommandError(e) | ||||||
| @ -44,6 +46,13 @@ class Command(BaseCommand): | |||||||
|         except FileExistsError: |         except FileExistsError: | ||||||
|             pass |             pass | ||||||
| 
 | 
 | ||||||
|  |         logging.getLogger(__name__).info( | ||||||
|  |             "Starting document consumer at {}".format( | ||||||
|  |                 settings.CONSUMPTION_DIR | ||||||
|  |             ), | ||||||
|  |             extra={"component": Log.COMPONENT_CONSUMER} | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|         try: |         try: | ||||||
|             while True: |             while True: | ||||||
|                 self.loop() |                 self.loop() | ||||||
|  | |||||||
| @ -1,10 +1,12 @@ | |||||||
|  | import json | ||||||
| import os | import os | ||||||
| import time | import time | ||||||
| 
 | 
 | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.core.management.base import BaseCommand, CommandError | from django.core.management.base import BaseCommand, CommandError | ||||||
|  | from django.core import serializers | ||||||
| 
 | 
 | ||||||
| from documents.models import Document | from documents.models import Document, Correspondent, Tag | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| 
 | 
 | ||||||
| from ...mixins import Renderable | from ...mixins import Renderable | ||||||
| @ -14,21 +16,26 @@ class Command(Renderable, BaseCommand): | |||||||
| 
 | 
 | ||||||
|     help = """ |     help = """ | ||||||
|         Decrypt and rename all files in our collection into a given target |         Decrypt and rename all files in our collection into a given target | ||||||
|         directory.  Note that we don't export any of the parsed data since |         directory.  And include a manifest file containing document data for | ||||||
|         that can always be re-collected via the consumer. |         easy import. | ||||||
|     """.replace("    ", "") |     """.replace("    ", "") | ||||||
| 
 | 
 | ||||||
|     def add_arguments(self, parser): |     def add_arguments(self, parser): | ||||||
|         parser.add_argument("target") |         parser.add_argument("target") | ||||||
|  |         parser.add_argument( | ||||||
|  |             "--legacy", | ||||||
|  |             action="store_true", | ||||||
|  |             help="Don't try to export all of the document data, just dump the " | ||||||
|  |                  "original document files out in a format that makes " | ||||||
|  |                  "re-consuming them easy." | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, *args, **kwargs): |     def __init__(self, *args, **kwargs): | ||||||
|         self.verbosity = 0 |  | ||||||
|         self.target = None |  | ||||||
|         BaseCommand.__init__(self, *args, **kwargs) |         BaseCommand.__init__(self, *args, **kwargs) | ||||||
|  |         self.target = None | ||||||
| 
 | 
 | ||||||
|     def handle(self, *args, **options): |     def handle(self, *args, **options): | ||||||
| 
 | 
 | ||||||
|         self.verbosity = options["verbosity"] |  | ||||||
|         self.target = options["target"] |         self.target = options["target"] | ||||||
| 
 | 
 | ||||||
|         if not os.path.exists(self.target): |         if not os.path.exists(self.target): | ||||||
| @ -40,9 +47,22 @@ class Command(Renderable, BaseCommand): | |||||||
|         if not settings.PASSPHRASE: |         if not settings.PASSPHRASE: | ||||||
|             settings.PASSPHRASE = input("Please enter the passphrase: ") |             settings.PASSPHRASE = input("Please enter the passphrase: ") | ||||||
| 
 | 
 | ||||||
|         for document in Document.objects.all(): |         if options["legacy"]: | ||||||
|  |             self.dump_legacy() | ||||||
|  |         else: | ||||||
|  |             self.dump() | ||||||
|  | 
 | ||||||
|  |     def dump(self): | ||||||
|  | 
 | ||||||
|  |         documents = Document.objects.all() | ||||||
|  |         document_map = {d.pk: d for d in documents} | ||||||
|  |         manifest = json.loads(serializers.serialize("json", documents)) | ||||||
|  |         for document_dict in manifest: | ||||||
|  | 
 | ||||||
|  |             document = document_map[document_dict["pk"]] | ||||||
| 
 | 
 | ||||||
|             target = os.path.join(self.target, document.file_name) |             target = os.path.join(self.target, document.file_name) | ||||||
|  |             document_dict["__exported_file_name__"] = target | ||||||
| 
 | 
 | ||||||
|             print("Exporting: {}".format(target)) |             print("Exporting: {}".format(target)) | ||||||
| 
 | 
 | ||||||
| @ -50,3 +70,37 @@ class Command(Renderable, BaseCommand): | |||||||
|                 f.write(GnuPG.decrypted(document.source_file)) |                 f.write(GnuPG.decrypted(document.source_file)) | ||||||
|                 t = int(time.mktime(document.created.timetuple())) |                 t = int(time.mktime(document.created.timetuple())) | ||||||
|                 os.utime(target, times=(t, t)) |                 os.utime(target, times=(t, t)) | ||||||
|  | 
 | ||||||
|  |         manifest += json.loads( | ||||||
|  |             serializers.serialize("json", Correspondent.objects.all())) | ||||||
|  | 
 | ||||||
|  |         manifest += json.loads(serializers.serialize( | ||||||
|  |             "json", Tag.objects.all())) | ||||||
|  | 
 | ||||||
|  |         with open(os.path.join(self.target, "manifest.json"), "w") as f: | ||||||
|  |             json.dump(manifest, f, indent=2) | ||||||
|  | 
 | ||||||
|  |     def dump_legacy(self): | ||||||
|  | 
 | ||||||
|  |         for document in Document.objects.all(): | ||||||
|  | 
 | ||||||
|  |             target = os.path.join( | ||||||
|  |                 self.target, self._get_legacy_file_name(document)) | ||||||
|  | 
 | ||||||
|  |             print("Exporting: {}".format(target)) | ||||||
|  | 
 | ||||||
|  |             with open(target, "wb") as f: | ||||||
|  |                 f.write(GnuPG.decrypted(document.source_file)) | ||||||
|  |                 t = int(time.mktime(document.created.timetuple())) | ||||||
|  |                 os.utime(target, times=(t, t)) | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _get_legacy_file_name(doc): | ||||||
|  |         if doc.correspondent and doc.title: | ||||||
|  |             tags = ",".join([t.slug for t in doc.tags.all()]) | ||||||
|  |             if tags: | ||||||
|  |                 return "{} - {} - {}.{}".format( | ||||||
|  |                     doc.correspondent, doc.title, tags, doc.file_type) | ||||||
|  |             return "{} - {}.{}".format( | ||||||
|  |                 doc.correspondent, doc.title, doc.file_type) | ||||||
|  |         return os.path.basename(doc.source_path) | ||||||
|  | |||||||
							
								
								
									
										99
									
								
								src/documents/management/commands/document_importer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										99
									
								
								src/documents/management/commands/document_importer.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,99 @@ | |||||||
|  | import json | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | from django.conf import settings | ||||||
|  | from django.core.management.base import BaseCommand, CommandError | ||||||
|  | from django.core.management import call_command | ||||||
|  | 
 | ||||||
|  | from documents.models import Document | ||||||
|  | from paperless.db import GnuPG | ||||||
|  | 
 | ||||||
|  | from ...mixins import Renderable | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Command(Renderable, BaseCommand): | ||||||
|  | 
 | ||||||
|  |     help = """ | ||||||
|  |         Using a manifest.json file, load the data from there, and import the | ||||||
|  |         documents it refers to. | ||||||
|  |     """.replace("    ", "") | ||||||
|  | 
 | ||||||
|  |     def add_arguments(self, parser): | ||||||
|  |         parser.add_argument("source") | ||||||
|  | 
 | ||||||
|  |     def __init__(self, *args, **kwargs): | ||||||
|  |         BaseCommand.__init__(self, *args, **kwargs) | ||||||
|  |         self.source = None | ||||||
|  |         self.manifest = None | ||||||
|  | 
 | ||||||
|  |     def handle(self, *args, **options): | ||||||
|  | 
 | ||||||
|  |         self.source = options["source"] | ||||||
|  | 
 | ||||||
|  |         if not os.path.exists(self.source): | ||||||
|  |             raise CommandError("That path doesn't exist") | ||||||
|  | 
 | ||||||
|  |         if not os.access(self.source, os.R_OK): | ||||||
|  |             raise CommandError("That path doesn't appear to be readable") | ||||||
|  | 
 | ||||||
|  |         manifest_path = os.path.join(self.source, "manifest.json") | ||||||
|  |         self._check_manifest_exists(manifest_path) | ||||||
|  | 
 | ||||||
|  |         with open(manifest_path) as f: | ||||||
|  |             self.manifest = json.load(f) | ||||||
|  | 
 | ||||||
|  |         self._check_manifest() | ||||||
|  | 
 | ||||||
|  |         if not settings.PASSPHRASE: | ||||||
|  |             raise CommandError( | ||||||
|  |                 "You need to define a passphrase before continuing.  Please " | ||||||
|  |                 "consult the documentation for setting up Paperless." | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         # Fill up the database with whatever is in the manifest | ||||||
|  |         call_command("loaddata", manifest_path) | ||||||
|  | 
 | ||||||
|  |         self._import_files_from_manifest() | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _check_manifest_exists(path): | ||||||
|  |         if not os.path.exists(path): | ||||||
|  |             raise CommandError( | ||||||
|  |                 "That directory doesn't appear to contain a manifest.json " | ||||||
|  |                 "file." | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def _check_manifest(self): | ||||||
|  | 
 | ||||||
|  |         for record in self.manifest: | ||||||
|  | 
 | ||||||
|  |             if not record["model"] == "documents.document": | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             if "__exported_file_name__" not in record: | ||||||
|  |                 raise CommandError( | ||||||
|  |                     'The manifest file contains a record which does not ' | ||||||
|  |                     'refer to an actual document file.' | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |             doc_file = record["__exported_file_name__"] | ||||||
|  |             if not os.path.exists(os.path.join(self.source, doc_file)): | ||||||
|  |                 raise CommandError( | ||||||
|  |                     'The manifest file refers to "{}" which does not ' | ||||||
|  |                     'appear to be in the source directory.'.format(doc_file) | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |     def _import_files_from_manifest(self): | ||||||
|  | 
 | ||||||
|  |         for record in self.manifest: | ||||||
|  | 
 | ||||||
|  |             if not record["model"] == "documents.document": | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             doc_file = record["__exported_file_name__"] | ||||||
|  |             document = Document.objects.get(pk=record["pk"]) | ||||||
|  |             with open(doc_file, "rb") as unencrypted: | ||||||
|  |                 with open(document.source_path, "wb") as encrypted: | ||||||
|  |                     print("Encrypting {} and saving it to {}".format( | ||||||
|  |                         doc_file, document.source_path)) | ||||||
|  |                     encrypted.write(GnuPG.encrypted(unencrypted)) | ||||||
| @ -10,8 +10,8 @@ class Command(Renderable, BaseCommand): | |||||||
|     help = """ |     help = """ | ||||||
|         Using the current set of tagging rules, apply said rules to all |         Using the current set of tagging rules, apply said rules to all | ||||||
|         documents in the database, effectively allowing you to back-tag all |         documents in the database, effectively allowing you to back-tag all | ||||||
|         previously indexed documents with tags created (or modified) after their |         previously indexed documents with tags created (or modified) after | ||||||
|         initial import. |         their initial import. | ||||||
|     """.replace("    ", "") |     """.replace("    ", "") | ||||||
| 
 | 
 | ||||||
|     def __init__(self, *args, **kwargs): |     def __init__(self, *args, **kwargs): | ||||||
| @ -23,9 +23,10 @@ class Command(Renderable, BaseCommand): | |||||||
|         self.verbosity = options["verbosity"] |         self.verbosity = options["verbosity"] | ||||||
| 
 | 
 | ||||||
|         for document in Document.objects.all(): |         for document in Document.objects.all(): | ||||||
|  | 
 | ||||||
|             tags = Tag.objects.exclude( |             tags = Tag.objects.exclude( | ||||||
|                 pk__in=document.tags.values_list("pk", flat=True)) |                 pk__in=document.tags.values_list("pk", flat=True)) | ||||||
|             for tag in tags: | 
 | ||||||
|                 if tag.matches(document.content): |             for tag in Tag.match_all(document.content, tags): | ||||||
|                     print('Tagging {} with "{}"'.format(document, tag)) |                 print('Tagging {} with "{}"'.format(document, tag)) | ||||||
|                     document.tags.add(tag) |                 document.tags.add(tag) | ||||||
|  | |||||||
							
								
								
									
										20
									
								
								src/documents/management/commands/loaddata_stdin.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								src/documents/management/commands/loaddata_stdin.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,20 @@ | |||||||
|  | import sys | ||||||
|  | 
 | ||||||
|  | from django.core.management.commands.loaddata import Command as LoadDataCommand | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class Command(LoadDataCommand):
    """
    Allow the loading of data from standard in.  Sourced originally from:
    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)

    Passing ``-`` as the fixture label reads a JSON fixture from
    ``sys.stdin``; every other label is delegated to the stock ``loaddata``
    command.
    """

    def parse_name(self, fixture_name):
        """
        Return the ``(name, serialisation format, compression format)`` triple
        for ``fixture_name``.

        ``-`` maps to the pseudo compression format ``stdin`` registered
        below; anything else is parsed by the parent class.
        """
        # The "opener" registered for the pseudo format ignores both of its
        # arguments and simply hands back stdin.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'
        # This used to fall off the end and return None, which breaks any
        # caller that unpacks the result for an ordinary fixture name.
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        """
        Return the fixture entries for ``fixture_label``, short-circuiting the
        filesystem search when the label is ``-`` (standard in).
        """
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)
							
								
								
									
										70
									
								
								src/documents/managers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								src/documents/managers.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,70 @@ | |||||||
|  | from django.conf import settings | ||||||
|  | 
 | ||||||
|  | from django.db import models | ||||||
|  | from django.db.models.aggregates import Max | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class GroupConcat(models.Aggregate):
    """
    Aggregate that joins the values of an expression into a single string.

    Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've
    only ever tested it in Sqlite.
    """

    ENGINE_SQLITE = 1
    ENGINE_POSTGRESQL = 2
    ENGINE_MYSQL = 3
    ENGINES = {
        "django.db.backends.sqlite3": ENGINE_SQLITE,
        "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL,
        "django.db.backends.postgresql": ENGINE_POSTGRESQL,
        "django.db.backends.mysql": ENGINE_MYSQL
    }

    def __init__(self, expression, separator="\n", **extra):
        """
        :param expression: the expression whose values are concatenated.
        :param separator: string placed between the concatenated values.
        :param extra: passed through to ``models.Aggregate``.
        :raises NotImplementedError: if the default database engine is not
            one of ``ENGINES``.
        """

        # These must be set before the parent initialiser runs, as they
        # depend on the configured engine rather than on class attributes.
        self.engine = self._get_engine()
        self.function = self._get_function()
        self.template = self._get_template(separator)

        models.Aggregate.__init__(
            self,
            expression,
            output_field=models.CharField(),
            **extra
        )

    def _get_engine(self):
        """
        Map the default database's ``ENGINE`` setting onto one of the
        ``ENGINE_*`` constants.
        """
        engine = settings.DATABASES["default"]["ENGINE"]
        try:
            return self.ENGINES[engine]
        except KeyError:
            raise NotImplementedError(
                "There's currently no support for {} when it comes to group "
                "concatenation in Paperless".format(engine)
            )

    def _get_function(self):
        """
        Return the SQL function name: ``STRING_AGG`` for PostgreSQL,
        ``GROUP_CONCAT`` for everything else.
        """
        if self.engine == self.ENGINE_POSTGRESQL:
            return "STRING_AGG"
        return "GROUP_CONCAT"

    def _get_template(self, separator):
        """
        Build the SQL template for the aggregate with ``separator`` embedded
        as a string literal.
        """
        # The separator ends up inside a single-quoted SQL literal, so any
        # embedded quote has to be doubled to keep the statement well-formed.
        quoted = separator.replace("'", "''")
        if self.engine == self.ENGINE_MYSQL:
            # MySQL's syntax is GROUP_CONCAT(expr SEPARATOR 'x'): there is no
            # comma between the expression and the SEPARATOR keyword.
            return "%(function)s(%(expressions)s SEPARATOR '{}')".format(
                quoted)
        return "%(function)s(%(expressions)s, '{}')".format(quoted)
|  | 
 | ||||||
|  | 
 | ||||||
class LogQuerySet(models.query.QuerySet):
    """
    Query set for log entries that can collapse them into one row per group.
    """

    def by_group(self):
        """
        Return one row per ``group`` carrying the latest ``modified`` value as
        ``time`` and the concatenated messages as ``messages``, newest first.
        """
        annotations = {
            "time": Max("modified"),
            "messages": GroupConcat("message"),
        }
        grouped = self.values("group").annotate(**annotations)
        return grouped.order_by("-time")
|  | 
 | ||||||
|  | 
 | ||||||
class LogManager(models.Manager):
    """
    Manager that hands out ``LogQuerySet`` instances.
    """

    def get_queryset(self):
        """
        Return a ``LogQuerySet`` bound to this manager's model and database.
        """
        queryset_class = LogQuerySet
        return queryset_class(self.model, using=self._db)
| @ -1,5 +1,5 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
| # Generated by Django 1.9 on 2016-02-14 16:08 | # Generated by Django 1.9 on 2016-02-27 17:54 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from django.db import migrations, models | from django.db import migrations, models | ||||||
| @ -7,9 +7,8 @@ from django.db import migrations, models | |||||||
| 
 | 
 | ||||||
| class Migration(migrations.Migration): | class Migration(migrations.Migration): | ||||||
| 
 | 
 | ||||||
|     initial = True |  | ||||||
| 
 |  | ||||||
|     dependencies = [ |     dependencies = [ | ||||||
|  |         ('documents', '0009_auto_20160214_0040'), | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     operations = [ |     operations = [ | ||||||
| @ -17,14 +16,15 @@ class Migration(migrations.Migration): | |||||||
|             name='Log', |             name='Log', | ||||||
|             fields=[ |             fields=[ | ||||||
|                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), |                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), | ||||||
|                 ('time', models.DateTimeField(auto_now_add=True)), |                 ('group', models.UUIDField(blank=True)), | ||||||
|                 ('message', models.TextField()), |                 ('message', models.TextField()), | ||||||
|                 ('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)), |                 ('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)), | ||||||
|                 ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])), |                 ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])), | ||||||
|  |                 ('created', models.DateTimeField(auto_now_add=True)), | ||||||
|  |                 ('modified', models.DateTimeField(auto_now=True)), | ||||||
|             ], |             ], | ||||||
|         ), |             options={ | ||||||
|         migrations.AlterModelOptions( |                 'ordering': ('-modified',), | ||||||
|             name='log', |             }, | ||||||
|             options={'ordering': ('-time',)}, |  | ||||||
|         ), |         ), | ||||||
|     ] |     ] | ||||||
							
								
								
									
										28
									
								
								src/documents/migrations/0011_auto_20160303_1929.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								src/documents/migrations/0011_auto_20160303_1929.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,28 @@ | |||||||
|  | # -*- coding: utf-8 -*- | ||||||
|  | # Generated by Django 1.9.2 on 2016-03-03 19:29 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from django.db import migrations | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class Migration(migrations.Migration):
    """
    Rename the ``Sender`` model to ``Correspondent`` and follow that rename
    through the ``Document`` ordering and foreign key field.
    """

    dependencies = [("documents", "0010_log")]

    # The order of the operations is significant and is kept as generated.
    operations = [
        migrations.RenameModel(old_name="Sender", new_name="Correspondent"),
        migrations.AlterModelOptions(
            name="document",
            options={"ordering": ("correspondent", "title")},
        ),
        migrations.RenameField(
            model_name="document",
            old_name="sender",
            new_name="correspondent",
        ),
    ]
							
								
								
									
										119
									
								
								src/documents/migrations/0012_auto_20160305_0040.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										119
									
								
								src/documents/migrations/0012_auto_20160305_0040.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,119 @@ | |||||||
|  | # -*- coding: utf-8 -*- | ||||||
|  | # Generated by Django 1.9.2 on 2016-03-05 00:40 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import gnupg | ||||||
|  | import os | ||||||
|  | import re | ||||||
|  | import shutil | ||||||
|  | import subprocess | ||||||
|  | import tempfile | ||||||
|  | 
 | ||||||
|  | from django.conf import settings | ||||||
|  | from django.db import migrations | ||||||
|  | from django.utils.termcolors import colorize as colourise  # Spelling hurts me | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class GnuPG(object):
    """
    Class-level helpers for handling encrypted files, all sharing a single
    ``gnupg.GPG`` instance.
    """

    # Created once, when the class body is executed.
    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        """
        Decrypt the contents of ``file_handle`` with the configured
        passphrase and return the resulting data.
        """
        result = cls.gpg.decrypt_file(
            file_handle, passphrase=settings.PASSPHRASE)
        return result.data

    @classmethod
    def encrypted(cls, file_handle):
        """
        Symmetrically encrypt the contents of ``file_handle`` with the
        configured passphrase and return the resulting data.
        """
        options = {
            "recipients": None,
            "passphrase": settings.PASSPHRASE,
            "symmetric": True,
        }
        result = cls.gpg.encrypt_file(file_handle, **options)
        return result.data
|  | 
 | ||||||
|  | 
 | ||||||
def move_documents_and_create_thumbnails(apps, schema_editor):
    """
    Move every encrypted document in ``MEDIA_ROOT/documents`` into
    ``documents/originals`` and write an encrypted PNG thumbnail for it into
    ``documents/thumbnails``.

    :param apps: unused; supplied by ``migrations.RunPython``.
    :param schema_editor: unused; supplied by ``migrations.RunPython``.
    """

    documents_dir = os.path.join(settings.MEDIA_ROOT, "documents")
    originals_dir = os.path.join(documents_dir, "originals")
    thumbnails_dir = os.path.join(documents_dir, "thumbnails")

    # Both target directories are written to below, so make sure they exist
    # up front rather than failing on the first document.
    os.makedirs(originals_dir, exist_ok=True)
    os.makedirs(thumbnails_dir, exist_ok=True)

    documents = os.listdir(documents_dir)

    # Nothing but the two target directories: there is nothing to migrate.
    if set(documents) == {"originals", "thumbnails"}:
        return

    print(colourise(
        "\n\n"
        "  This is a one-time only migration to generate thumbnails for all of your\n"
        "  documents so that future UIs will have something to work with.  If you have\n"
        "  a lot of documents though, this may take a while, so a coffee break may be\n"
        "  in order."
        "\n", opts=("bold",)
    ))

    os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

    for f in sorted(documents):

        if not f.endswith("gpg"):
            continue

        print("    {} {} {}".format(
            colourise("*", fg="green"),
            colourise("Generating a thumbnail for", fg="white"),
            colourise(f, fg="cyan")
        ))

        thumb_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)
        orig_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)

        orig_source = os.path.join(documents_dir, f)
        orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))

        try:
            # Decrypt the original into scratch space so that it can be
            # handed to the convert binary.
            with open(orig_source, "rb") as encrypted:
                with open(orig_target, "wb") as unencrypted:
                    unencrypted.write(GnuPG.decrypted(encrypted))

            subprocess.Popen((
                settings.CONVERT_BINARY,
                "-scale", "500x5000",
                "-alpha", "remove",
                orig_target,
                os.path.join(thumb_temp, "convert-%04d.png")
            )).wait()

            # Only the first generated image is kept as the thumbnail.
            thumb_source = os.path.join(thumb_temp, "convert-0000.png")
            thumb_target = os.path.join(
                thumbnails_dir,
                re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
            )
            with open(thumb_source, "rb") as unencrypted:
                with open(thumb_target, "wb") as encrypted:
                    encrypted.write(GnuPG.encrypted(unencrypted))
        finally:
            # Never leave decrypted material behind in the scratch
            # directory, even if one of the steps above failed.
            shutil.rmtree(thumb_temp, ignore_errors=True)
            shutil.rmtree(orig_temp, ignore_errors=True)

        shutil.move(orig_source, os.path.join(originals_dir, f))
|  | 
 | ||||||
|  | 
 | ||||||
class Migration(migrations.Migration):
    """
    Data migration that runs ``move_documents_and_create_thumbnails``.
    """

    dependencies = [("documents", "0011_auto_20160303_1929")]

    operations = [migrations.RunPython(move_documents_and_create_thumbnails)]
| @ -1,7 +1,7 @@ | |||||||
| class Renderable(object): | class Renderable(object): | ||||||
|     """ |     """ | ||||||
|     A handy mixin to make it easier/cleaner to print output based on a verbosity |     A handy mixin to make it easier/cleaner to print output based on a | ||||||
|     value. |     verbosity value. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     def _render(self, text, verbosity): |     def _render(self, text, verbosity): | ||||||
|  | |||||||
| @ -1,5 +1,7 @@ | |||||||
|  | import logging | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
|  | import uuid | ||||||
| 
 | 
 | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.core.urlresolvers import reverse | from django.core.urlresolvers import reverse | ||||||
| @ -7,6 +9,8 @@ from django.db import models | |||||||
| from django.template.defaultfilters import slugify | from django.template.defaultfilters import slugify | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| 
 | 
 | ||||||
|  | from .managers import LogManager | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class SluggedModel(models.Model): | class SluggedModel(models.Model): | ||||||
| 
 | 
 | ||||||
| @ -25,7 +29,7 @@ class SluggedModel(models.Model): | |||||||
|         return self.name |         return self.name | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Sender(SluggedModel): | class Correspondent(SluggedModel): | ||||||
| 
 | 
 | ||||||
|     # This regex is probably more restrictive than it needs to be, but it's |     # This regex is probably more restrictive than it needs to be, but it's | ||||||
|     # better safe than sorry. |     # better safe than sorry. | ||||||
| @ -36,7 +40,7 @@ class Sender(SluggedModel): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Tag(SluggedModel): | class Tag(SluggedModel): | ||||||
|      | 
 | ||||||
|     COLOURS = ( |     COLOURS = ( | ||||||
|         (1, "#a6cee3"), |         (1, "#a6cee3"), | ||||||
|         (2, "#1f78b4"), |         (2, "#1f78b4"), | ||||||
| @ -71,9 +75,9 @@ class Tag(SluggedModel): | |||||||
|         default=MATCH_ANY, |         default=MATCH_ANY, | ||||||
|         help_text=( |         help_text=( | ||||||
|             "Which algorithm you want to use when matching text to the OCR'd " |             "Which algorithm you want to use when matching text to the OCR'd " | ||||||
|             "PDF.  Here, \"any\" looks for any occurrence of any word provided " |             "PDF.  Here, \"any\" looks for any occurrence of any word " | ||||||
|             "in the PDF, while \"all\" requires that every word provided " |             "provided in the PDF, while \"all\" requires that every word " | ||||||
|             "appear in the PDF, albeit not in the order provided.  A " |             "provided appear in the PDF, albeit not in the order provided.  A " | ||||||
|             "\"literal\" match means that the text you enter must appear in " |             "\"literal\" match means that the text you enter must appear in " | ||||||
|             "the PDF exactly as you've entered it, and \"regular expression\" " |             "the PDF exactly as you've entered it, and \"regular expression\" " | ||||||
|             "uses a regex to match the PDF.  If you don't know what a regex " |             "uses a regex to match the PDF.  If you don't know what a regex " | ||||||
| @ -86,28 +90,40 @@ class Tag(SluggedModel): | |||||||
|         return "{}: \"{}\" ({})".format( |         return "{}: \"{}\" ({})".format( | ||||||
|             self.name, self.match, self.get_matching_algorithm_display()) |             self.name, self.match, self.get_matching_algorithm_display()) | ||||||
| 
 | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def match_all(cls, text, tags=None): | ||||||
|  | 
 | ||||||
|  |         if tags is None: | ||||||
|  |             tags = cls.objects.all() | ||||||
|  | 
 | ||||||
|  |         text = text.lower() | ||||||
|  |         for tag in tags: | ||||||
|  |             if tag.matches(text): | ||||||
|  |                 yield tag | ||||||
|  | 
 | ||||||
|     def matches(self, text): |     def matches(self, text): | ||||||
|  | 
 | ||||||
|         # Check that match is not empty |         # Check that match is not empty | ||||||
|         if self.match.strip() == "": |         if self.match.strip() == "": | ||||||
|             return False |             return False | ||||||
| 
 | 
 | ||||||
|         if self.matching_algorithm == self.MATCH_ALL: |         if self.matching_algorithm == self.MATCH_ALL: | ||||||
|             for word in self.match.split(" "): |             for word in self.match.split(" "): | ||||||
|                 if word not in text: |                 if not re.search(r"\b{}\b".format(word), text): | ||||||
|                     return False |                     return False | ||||||
|             return True |             return True | ||||||
| 
 | 
 | ||||||
|         if self.matching_algorithm == self.MATCH_ANY: |         if self.matching_algorithm == self.MATCH_ANY: | ||||||
|             for word in self.match.split(" "): |             for word in self.match.split(" "): | ||||||
|                 if word in text: |                 if re.search(r"\b{}\b".format(word), text): | ||||||
|                     return True |                     return True | ||||||
|             return False |             return False | ||||||
| 
 | 
 | ||||||
|         if self.matching_algorithm == self.MATCH_LITERAL: |         if self.matching_algorithm == self.MATCH_LITERAL: | ||||||
|             return self.match in text |             return bool(re.search(r"\b{}\b".format(self.match), text)) | ||||||
| 
 | 
 | ||||||
|         if self.matching_algorithm == self.MATCH_REGEX: |         if self.matching_algorithm == self.MATCH_REGEX: | ||||||
|             return re.search(re.compile(self.match), text) |             return bool(re.search(re.compile(self.match), text)) | ||||||
| 
 | 
 | ||||||
|         raise NotImplementedError("Unsupported matching algorithm") |         raise NotImplementedError("Unsupported matching algorithm") | ||||||
| 
 | 
 | ||||||
| @ -125,8 +141,8 @@ class Document(models.Model): | |||||||
|     TYPE_TIF = "tiff" |     TYPE_TIF = "tiff" | ||||||
|     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) |     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) | ||||||
| 
 | 
 | ||||||
|     sender = models.ForeignKey( |     correspondent = models.ForeignKey( | ||||||
|         Sender, blank=True, null=True, related_name="documents") |         Correspondent, blank=True, null=True, related_name="documents") | ||||||
|     title = models.CharField(max_length=128, blank=True, db_index=True) |     title = models.CharField(max_length=128, blank=True, db_index=True) | ||||||
|     content = models.TextField(db_index=True) |     content = models.TextField(db_index=True) | ||||||
|     file_type = models.CharField( |     file_type = models.CharField( | ||||||
| @ -140,14 +156,15 @@ class Document(models.Model): | |||||||
|     modified = models.DateTimeField(auto_now=True, editable=False) |     modified = models.DateTimeField(auto_now=True, editable=False) | ||||||
| 
 | 
 | ||||||
|     class Meta(object): |     class Meta(object): | ||||||
|         ordering = ("sender", "title") |         ordering = ("correspondent", "title") | ||||||
| 
 | 
 | ||||||
|     def __str__(self): |     def __str__(self): | ||||||
|         created = self.created.strftime("%Y-%m-%d") |         created = self.created.strftime("%Y%m%d%H%M%S") | ||||||
|         if self.sender and self.title: |         if self.correspondent and self.title: | ||||||
|             return "{}: {}, {}".format(created, self.sender, self.title) |             return "{}: {} - {}".format( | ||||||
|         if self.sender or self.title: |                 created, self.correspondent, self.title) | ||||||
|             return "{}: {}".format(created, self.sender or self.title) |         if self.correspondent or self.title: | ||||||
|  |             return "{}: {}".format(created, self.correspondent or self.title) | ||||||
|         return str(created) |         return str(created) | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
| @ -155,6 +172,7 @@ class Document(models.Model): | |||||||
|         return os.path.join( |         return os.path.join( | ||||||
|             settings.MEDIA_ROOT, |             settings.MEDIA_ROOT, | ||||||
|             "documents", |             "documents", | ||||||
|  |             "originals", | ||||||
|             "{:07}.{}.gpg".format(self.pk, self.file_type) |             "{:07}.{}.gpg".format(self.pk, self.file_type) | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
| @ -164,14 +182,71 @@ class Document(models.Model): | |||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def file_name(self): |     def file_name(self): | ||||||
|         if self.sender and self.title: |         return slugify(str(self)) + "." + self.file_type | ||||||
|             tags = ",".join([t.slug for t in self.tags.all()]) |  | ||||||
|             if tags: |  | ||||||
|                 return "{} - {} - {}.{}".format( |  | ||||||
|                     self.sender, self.title, tags, self.file_type) |  | ||||||
|             return "{} - {}.{}".format(self.sender, self.title, self.file_type) |  | ||||||
|         return os.path.basename(self.source_path) |  | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def download_url(self): |     def download_url(self): | ||||||
|         return reverse("fetch", kwargs={"pk": self.pk}) |         return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk}) | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def thumbnail_path(self): | ||||||
|  |         return os.path.join( | ||||||
|  |             settings.MEDIA_ROOT, | ||||||
|  |             "documents", | ||||||
|  |             "thumbnails", | ||||||
|  |             "{:07}.png.gpg".format(self.pk) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def thumbnail_file(self): | ||||||
|  |         return open(self.thumbnail_path, "rb") | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def thumbnail_url(self): | ||||||
|  |         return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk}) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class Log(models.Model):
    """
    A single log message, tied to related messages through a shared ``group``
    value.
    """

    # Severities reuse the numeric levels of the standard logging module.
    LEVELS = (
        (logging.DEBUG, "Debugging"),
        (logging.INFO, "Informational"),
        (logging.WARNING, "Warning"),
        (logging.ERROR, "Error"),
        (logging.CRITICAL, "Critical"),
    )

    COMPONENT_CONSUMER = 1
    COMPONENT_MAIL = 2
    COMPONENTS = (
        (COMPONENT_CONSUMER, "Consumer"),
        (COMPONENT_MAIL, "Mail Fetcher"),
    )

    group = models.UUIDField(blank=True)
    message = models.TextField()
    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
    component = models.PositiveIntegerField(choices=COMPONENTS)
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    objects = LogManager()

    class Meta(object):
        ordering = ("-modified",)

    def __str__(self):
        return self.message

    def save(self, *args, **kwargs):
        """
        Save the entry, first giving it a random group if it has none.

        Callers that don't care about grouping shouldn't have to invent a
        one-off group value, but leaving the group empty would stop the
        manager from telling the un-grouped messages apart.
        """

        if not self.group:
            self.group = uuid.uuid4()

        super(Log, self).save(*args, **kwargs)
|  | |||||||
							
								
								
									
										55
									
								
								src/documents/serialisers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								src/documents/serialisers.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,55 @@ | |||||||
|  | from rest_framework import serializers | ||||||
|  | 
 | ||||||
|  | from .models import Correspondent, Tag, Document, Log | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
    """
    Serialises the id, slug and name of a ``Correspondent``.
    """

    class Meta(object):
        model = Correspondent
        fields = (
            "id",
            "slug",
            "name",
        )
|  | 
 | ||||||
|  | 
 | ||||||
class TagSerializer(serializers.HyperlinkedModelSerializer):
    """
    Serialises a ``Tag`` including its colour and matching configuration.
    """

    class Meta(object):
        model = Tag
        fields = (
            "id",
            "slug",
            "name",
            "colour",
            "match",
            "matching_algorithm",
        )
|  | 
 | ||||||
|  | 
 | ||||||
class DocumentSerializer(serializers.ModelSerializer):
    """
    Serialises a ``Document``; its correspondent and tags are exposed as
    read-only hyperlinks.
    """

    correspondent = serializers.HyperlinkedRelatedField(
        view_name="drf:correspondent-detail",
        read_only=True,
        allow_null=True,
    )
    tags = serializers.HyperlinkedRelatedField(
        view_name="drf:tag-detail",
        read_only=True,
        many=True,
    )

    class Meta(object):
        model = Document
        fields = (
            "id", "correspondent", "title", "content", "file_type", "tags",
            "created", "modified", "file_name", "download_url",
            "thumbnail_url",
        )
|  | 
 | ||||||
|  | 
 | ||||||
class LogSerializer(serializers.ModelSerializer):
    """
    Serialises log rows that carry a ``time`` and a ``messages`` value.
    """

    time = serializers.DateTimeField()
    messages = serializers.CharField()

    class Meta(object):
        model = Log
        fields = ("time", "messages")
							
								
								
									
										10
									
								
								src/documents/templates/documents/index.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								src/documents/templates/documents/index.html
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,10 @@ | |||||||
|  | <!DOCTYPE html> | ||||||
|  | 
 | ||||||
|  | <html lang="en-gb"> | ||||||
|  |   <head> | ||||||
|  |     <title>Paperless</title> | ||||||
|  |     <meta charset="utf-8"> | ||||||
|  |   </head> | ||||||
|  |   <body> | ||||||
|  |   </body> | ||||||
|  | </html> | ||||||
| @ -4,18 +4,26 @@ from ..consumer import Consumer | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TestAttachment(TestCase): | class TestAttachment(TestCase): | ||||||
|      | 
 | ||||||
|     TAGS = ("tag1", "tag2", "tag3") |     TAGS = ("tag1", "tag2", "tag3") | ||||||
|     CONSUMER = Consumer() |     CONSUMER = Consumer() | ||||||
|      |     SUFFIXES = ( | ||||||
|  |         "pdf", "png", "jpg", "jpeg", "gif", | ||||||
|  |         "PDF", "PNG", "JPG", "JPEG", "GIF", | ||||||
|  |         "PdF", "PnG", "JpG", "JPeG", "GiF", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|     def _test_guess_attributes_from_name(self, path, sender, title, tags): |     def _test_guess_attributes_from_name(self, path, sender, title, tags): | ||||||
|         for suffix in ("pdf", "png", "jpg", "jpeg", "gif"): |         for suffix in self.SUFFIXES: | ||||||
|             f = path.format(suffix) |             f = path.format(suffix) | ||||||
|             results = self.CONSUMER._guess_attributes_from_name(f) |             results = self.CONSUMER._guess_attributes_from_name(f) | ||||||
|             self.assertEqual(results[0].name, sender, f) |             self.assertEqual(results[0].name, sender, f) | ||||||
|             self.assertEqual(results[1], title, f) |             self.assertEqual(results[1], title, f) | ||||||
|             self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) |             self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) | ||||||
|             self.assertEqual(results[3], suffix, f) |             if suffix.lower() == "jpeg": | ||||||
|  |                 self.assertEqual(results[3], "jpg", f) | ||||||
|  |             else: | ||||||
|  |                 self.assertEqual(results[3], suffix.lower(), f) | ||||||
| 
 | 
 | ||||||
|     def test_guess_attributes_from_name0(self): |     def test_guess_attributes_from_name0(self): | ||||||
|         self._test_guess_attributes_from_name( |         self._test_guess_attributes_from_name( | ||||||
|  | |||||||
							
								
								
									
										36
									
								
								src/documents/tests/test_importer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								src/documents/tests/test_importer.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,36 @@ | |||||||
|  | from django.core.management.base import CommandError | ||||||
|  | from django.test import TestCase | ||||||
|  | 
 | ||||||
|  | from ..management.commands.document_importer import Command | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestImporter(TestCase): | ||||||
|  | 
 | ||||||
|  |     def __init__(self, *args, **kwargs): | ||||||
|  |         TestCase.__init__(self, *args, **kwargs) | ||||||
|  | 
 | ||||||
|  |     def test_check_manifest_exists(self): | ||||||
|  |         cmd = Command() | ||||||
|  |         self.assertRaises( | ||||||
|  |             CommandError, cmd._check_manifest_exists, "/tmp/manifest.json") | ||||||
|  | 
 | ||||||
|  |     def test_check_manifest(self): | ||||||
|  | 
 | ||||||
|  |         cmd = Command() | ||||||
|  |         cmd.source = "/tmp" | ||||||
|  | 
 | ||||||
|  |         cmd.manifest = [{"model": "documents.document"}] | ||||||
|  |         with self.assertRaises(CommandError) as cm: | ||||||
|  |             cmd._check_manifest() | ||||||
|  |         self.assertTrue( | ||||||
|  |             'The manifest file contains a record' in str(cm.exception)) | ||||||
|  | 
 | ||||||
|  |         cmd.manifest = [{ | ||||||
|  |             "model": "documents.document", | ||||||
|  |             "__exported_file_name__": "noexist.pdf" | ||||||
|  |         }] | ||||||
|  |         # self.assertRaises(CommandError, cmd._check_manifest) | ||||||
|  |         with self.assertRaises(CommandError) as cm: | ||||||
|  |             cmd._check_manifest() | ||||||
|  |         self.assertTrue( | ||||||
|  |             'The manifest file refers to "noexist.pdf"' in str(cm.exception)) | ||||||
							
								
								
									
										142
									
								
								src/documents/tests/test_logger.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										142
									
								
								src/documents/tests/test_logger.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,142 @@ | |||||||
|  | import logging | ||||||
|  | import uuid | ||||||
|  | 
 | ||||||
|  | from unittest import mock | ||||||
|  | 
 | ||||||
|  | from django.test import TestCase | ||||||
|  | 
 | ||||||
|  | from ..models import Log | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestPaperlessLog(TestCase): | ||||||
|  | 
 | ||||||
|  |     def __init__(self, *args, **kwargs): | ||||||
|  |         TestCase.__init__(self, *args, **kwargs) | ||||||
|  |         self.logger = logging.getLogger( | ||||||
|  |             "documents.management.commands.document_consumer") | ||||||
|  | 
 | ||||||
|  |     def test_ignored(self): | ||||||
|  |         with mock.patch("logging.StreamHandler.emit") as __: | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  |             self.logger.info("This is an informational message") | ||||||
|  |             self.logger.warning("This is an informational message") | ||||||
|  |             self.logger.error("This is an informational message") | ||||||
|  |             self.logger.critical("This is an informational message") | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |     def test_that_it_saves_at_all(self): | ||||||
|  | 
 | ||||||
|  |         kw = { | ||||||
|  |             "group": uuid.uuid4(), | ||||||
|  |             "component": Log.COMPONENT_MAIL | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |         with mock.patch("logging.StreamHandler.emit") as __: | ||||||
|  | 
 | ||||||
|  |             # Debug messages are ignored by default | ||||||
|  |             self.logger.debug("This is a debugging message", extra=kw) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |             self.logger.info("This is an informational message", extra=kw) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 1) | ||||||
|  | 
 | ||||||
|  |             self.logger.warning("This is an warning message", extra=kw) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 2) | ||||||
|  | 
 | ||||||
|  |             self.logger.error("This is an error message", extra=kw) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 3) | ||||||
|  | 
 | ||||||
|  |             self.logger.critical("This is a critical message", extra=kw) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 4) | ||||||
|  | 
 | ||||||
|  |     def test_groups(self): | ||||||
|  | 
 | ||||||
|  |         kw1 = { | ||||||
|  |             "group": uuid.uuid4(), | ||||||
|  |             "component": Log.COMPONENT_MAIL | ||||||
|  |         } | ||||||
|  |         kw2 = { | ||||||
|  |             "group": uuid.uuid4(), | ||||||
|  |             "component": Log.COMPONENT_MAIL | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |         with mock.patch("logging.StreamHandler.emit") as __: | ||||||
|  | 
 | ||||||
|  |             # Debug messages are ignored by default | ||||||
|  |             self.logger.debug("This is a debugging message", extra=kw1) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |             self.logger.info("This is an informational message", extra=kw2) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 1) | ||||||
|  |             self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1) | ||||||
|  | 
 | ||||||
|  |             self.logger.warning("This is an warning message", extra=kw1) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 2) | ||||||
|  |             self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1) | ||||||
|  | 
 | ||||||
|  |             self.logger.error("This is an error message", extra=kw2) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 3) | ||||||
|  |             self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2) | ||||||
|  | 
 | ||||||
|  |             self.logger.critical("This is a critical message", extra=kw1) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 4) | ||||||
|  |             self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2) | ||||||
|  | 
 | ||||||
|  |     def test_components(self): | ||||||
|  | 
 | ||||||
|  |         c1 = Log.COMPONENT_CONSUMER | ||||||
|  |         c2 = Log.COMPONENT_MAIL | ||||||
|  |         kw1 = { | ||||||
|  |             "group": uuid.uuid4(), | ||||||
|  |             "component": c1 | ||||||
|  |         } | ||||||
|  |         kw2 = { | ||||||
|  |             "group": kw1["group"], | ||||||
|  |             "component": c2 | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |         with mock.patch("logging.StreamHandler.emit") as __: | ||||||
|  | 
 | ||||||
|  |             # Debug messages are ignored by default | ||||||
|  |             self.logger.debug("This is a debugging message", extra=kw1) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 0) | ||||||
|  | 
 | ||||||
|  |             self.logger.info("This is an informational message", extra=kw2) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 1) | ||||||
|  |             self.assertEqual(Log.objects.filter(component=c2).count(), 1) | ||||||
|  | 
 | ||||||
|  |             self.logger.warning("This is an warning message", extra=kw1) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 2) | ||||||
|  |             self.assertEqual(Log.objects.filter(component=c1).count(), 1) | ||||||
|  | 
 | ||||||
|  |             self.logger.error("This is an error message", extra=kw2) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 3) | ||||||
|  |             self.assertEqual(Log.objects.filter(component=c2).count(), 2) | ||||||
|  | 
 | ||||||
|  |             self.logger.critical("This is a critical message", extra=kw1) | ||||||
|  |             self.assertEqual(Log.objects.all().count(), 4) | ||||||
|  |             self.assertEqual(Log.objects.filter(component=c1).count(), 2) | ||||||
|  | 
 | ||||||
|  |     def test_groupped_query(self): | ||||||
|  | 
 | ||||||
|  |         kw = { | ||||||
|  |             "group": uuid.uuid4(), | ||||||
|  |             "component": Log.COMPONENT_MAIL | ||||||
|  |         } | ||||||
|  |         with mock.patch("logging.StreamHandler.emit") as __: | ||||||
|  |             self.logger.info("Message 0", extra=kw) | ||||||
|  |             self.logger.info("Message 1", extra=kw) | ||||||
|  |             self.logger.info("Message 2", extra=kw) | ||||||
|  |             self.logger.info("Message 3", extra=kw) | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(Log.objects.all().by_group().count(), 1) | ||||||
|  |         self.assertEqual( | ||||||
|  |             Log.objects.all().by_group()[0]["messages"], | ||||||
|  |             "Message 0\nMessage 1\nMessage 2\nMessage 3" | ||||||
|  |         ) | ||||||
| @ -3,6 +3,7 @@ import os | |||||||
| import magic | import magic | ||||||
| 
 | 
 | ||||||
| from hashlib import md5 | from hashlib import md5 | ||||||
|  | from unittest import mock | ||||||
| 
 | 
 | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| @ -27,7 +28,8 @@ class TestMessage(TestCase): | |||||||
| 
 | 
 | ||||||
|         with open(self.sample, "rb") as f: |         with open(self.sample, "rb") as f: | ||||||
| 
 | 
 | ||||||
|             message = Message(f.read(), verbosity=0) |             with mock.patch("logging.StreamHandler.emit") as __: | ||||||
|  |                 message = Message(f.read()) | ||||||
| 
 | 
 | ||||||
|             self.assertTrue(message) |             self.assertTrue(message) | ||||||
|             self.assertEqual(message.subject, "Test 0") |             self.assertEqual(message.subject, "Test 0") | ||||||
|  | |||||||
							
								
								
									
										119
									
								
								src/documents/tests/test_tags.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										119
									
								
								src/documents/tests/test_tags.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,119 @@ | |||||||
|  | from django.test import TestCase | ||||||
|  | 
 | ||||||
|  | from ..models import Tag | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestTagMatching(TestCase): | ||||||
|  | 
 | ||||||
|  |     def test_match_all(self): | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 0", | ||||||
|  |             match="alpha charlie gamma", | ||||||
|  |             matching_algorithm=Tag.MATCH_ALL | ||||||
|  |         ) | ||||||
|  |         self.assertFalse(t.matches("I have alpha in me")) | ||||||
|  |         self.assertFalse(t.matches("I have charlie in me")) | ||||||
|  |         self.assertFalse(t.matches("I have gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha and charlie in me")) | ||||||
|  |         self.assertTrue(t.matches("I have alpha, charlie, and gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas in me")) | ||||||
|  |         self.assertFalse(t.matches("I have bravo in me")) | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 1", | ||||||
|  |             match="12 34 56", | ||||||
|  |             matching_algorithm=Tag.MATCH_ALL | ||||||
|  |         ) | ||||||
|  |         self.assertFalse(t.matches("I have 12 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 34 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 56 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 12 and 34 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 12 34, and 56 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 120, 34, and 56 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 123456 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 01234567 in me")) | ||||||
|  | 
 | ||||||
|  |     def test_match_any(self): | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 0", | ||||||
|  |             match="alpha charlie gamma", | ||||||
|  |             matching_algorithm=Tag.MATCH_ANY | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         self.assertTrue(t.matches("I have alpha in me")) | ||||||
|  |         self.assertTrue(t.matches("I have charlie in me")) | ||||||
|  |         self.assertTrue(t.matches("I have gamma in me")) | ||||||
|  |         self.assertTrue(t.matches("I have alpha and charlie in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas in me")) | ||||||
|  |         self.assertFalse(t.matches("I have bravo in me")) | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 1", | ||||||
|  |             match="12 34 56", | ||||||
|  |             matching_algorithm=Tag.MATCH_ANY | ||||||
|  |         ) | ||||||
|  |         self.assertTrue(t.matches("I have 12 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 34 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 56 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 12 and 34 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 12 34, and 56 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 120, 34, and 560 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 120, 340, and 560 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 123456 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 01234567 in me")) | ||||||
|  | 
 | ||||||
|  |     def test_match_literal(self): | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 0", | ||||||
|  |             match="alpha charlie gamma", | ||||||
|  |             matching_algorithm=Tag.MATCH_LITERAL | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         self.assertFalse(t.matches("I have alpha in me")) | ||||||
|  |         self.assertFalse(t.matches("I have charlie in me")) | ||||||
|  |         self.assertFalse(t.matches("I have gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha and charlie in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 'alpha charlie gamma' in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas in me")) | ||||||
|  |         self.assertFalse(t.matches("I have bravo in me")) | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 1", | ||||||
|  |             match="12 34 56", | ||||||
|  |             matching_algorithm=Tag.MATCH_LITERAL | ||||||
|  |         ) | ||||||
|  |         self.assertFalse(t.matches("I have 12 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 34 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 56 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 12 and 34 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 12 34, and 56 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 120, 34, and 560 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 120, 340, and 560 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 123456 in me")) | ||||||
|  |         self.assertFalse(t.matches("I have 01234567 in me")) | ||||||
|  |         self.assertTrue(t.matches("I have 12 34 56 in me")) | ||||||
|  | 
 | ||||||
|  |     def test_match_regex(self): | ||||||
|  | 
 | ||||||
|  |         t = Tag.objects.create( | ||||||
|  |             name="Test 0", | ||||||
|  |             match="alpha\w+gamma", | ||||||
|  |             matching_algorithm=Tag.MATCH_REGEX | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         self.assertFalse(t.matches("I have alpha in me")) | ||||||
|  |         self.assertFalse(t.matches("I have gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha and charlie in me")) | ||||||
|  |         self.assertTrue(t.matches("I have alpha_and_gamma in me")) | ||||||
|  |         self.assertTrue(t.matches("I have alphas_and_gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha,and,gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha and gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) | ||||||
|  |         self.assertFalse(t.matches("I have alphas in me")) | ||||||
| @ -1,21 +1,41 @@ | |||||||
|  | from django.contrib.auth.mixins import LoginRequiredMixin | ||||||
| from django.http import HttpResponse | from django.http import HttpResponse | ||||||
| from django.template.defaultfilters import slugify |  | ||||||
| from django.views.decorators.csrf import csrf_exempt | from django.views.decorators.csrf import csrf_exempt | ||||||
| from django.views.generic import FormView, DetailView | from django.views.generic import FormView, DetailView, TemplateView | ||||||
|  | 
 | ||||||
|  | from rest_framework.mixins import ( | ||||||
|  |     RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin) | ||||||
|  | from rest_framework.pagination import PageNumberPagination | ||||||
|  | from rest_framework.permissions import IsAuthenticated | ||||||
|  | from rest_framework.viewsets import ( | ||||||
|  |     ModelViewSet, ReadOnlyModelViewSet, GenericViewSet) | ||||||
| 
 | 
 | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| 
 | 
 | ||||||
| from .models import Document |  | ||||||
| from .forms import UploadForm | from .forms import UploadForm | ||||||
|  | from .models import Correspondent, Tag, Document, Log | ||||||
|  | from .serialisers import ( | ||||||
|  |     CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class PdfView(DetailView): | class IndexView(TemplateView): | ||||||
|  | 
 | ||||||
|  |     template_name = "documents/index.html" | ||||||
|  | 
 | ||||||
|  |     def get_context_data(self, **kwargs): | ||||||
|  |         print(kwargs) | ||||||
|  |         print(self.request.GET) | ||||||
|  |         print(self.request.POST) | ||||||
|  |         return TemplateView.get_context_data(self, **kwargs) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class FetchView(DetailView): | ||||||
| 
 | 
 | ||||||
|     model = Document |     model = Document | ||||||
| 
 | 
 | ||||||
|     def render_to_response(self, context, **response_kwargs): |     def render_to_response(self, context, **response_kwargs): | ||||||
|         """ |         """ | ||||||
|         Override the default to return the unencrypted PDF as raw data. |         Override the default to return the unencrypted image/PDF as raw data. | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         content_types = { |         content_types = { | ||||||
| @ -26,19 +46,25 @@ class PdfView(DetailView): | |||||||
|             Document.TYPE_TIF: "image/tiff", |             Document.TYPE_TIF: "image/tiff", | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|  |         if self.kwargs["kind"] == "thumb": | ||||||
|  |             return HttpResponse( | ||||||
|  |                 GnuPG.decrypted(self.object.thumbnail_file), | ||||||
|  |                 content_type=content_types[Document.TYPE_PNG] | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|         response = HttpResponse( |         response = HttpResponse( | ||||||
|             GnuPG.decrypted(self.object.source_file), |             GnuPG.decrypted(self.object.source_file), | ||||||
|             content_type=content_types[self.object.file_type] |             content_type=content_types[self.object.file_type] | ||||||
|         ) |         ) | ||||||
|         response["Content-Disposition"] = 'attachment; filename="{}"'.format( |         response["Content-Disposition"] = 'attachment; filename="{}"'.format( | ||||||
|             slugify(str(self.object)) + "." + self.object.file_type) |             self.object.file_name) | ||||||
| 
 | 
 | ||||||
|         return response |         return response | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class PushView(FormView): | class PushView(LoginRequiredMixin, FormView): | ||||||
|     """ |     """ | ||||||
|     A crude REST API for creating documents. |     A crude REST-ish API for creating documents. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     form_class = UploadForm |     form_class = UploadForm | ||||||
| @ -52,3 +78,45 @@ class PushView(FormView): | |||||||
| 
 | 
 | ||||||
|     def form_invalid(self, form): |     def form_invalid(self, form): | ||||||
|         return HttpResponse("0") |         return HttpResponse("0") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class StandardPagination(PageNumberPagination): | ||||||
|  |     page_size = 25 | ||||||
|  |     page_size_query_param = "page-size" | ||||||
|  |     max_page_size = 100000 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class CorrespondentViewSet(ModelViewSet): | ||||||
|  |     model = Correspondent | ||||||
|  |     queryset = Correspondent.objects.all() | ||||||
|  |     serializer_class = CorrespondentSerializer | ||||||
|  |     pagination_class = StandardPagination | ||||||
|  |     permission_classes = (IsAuthenticated,) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TagViewSet(ModelViewSet): | ||||||
|  |     model = Tag | ||||||
|  |     queryset = Tag.objects.all() | ||||||
|  |     serializer_class = TagSerializer | ||||||
|  |     pagination_class = StandardPagination | ||||||
|  |     permission_classes = (IsAuthenticated,) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class DocumentViewSet(RetrieveModelMixin, | ||||||
|  |                       UpdateModelMixin, | ||||||
|  |                       DestroyModelMixin, | ||||||
|  |                       ListModelMixin, | ||||||
|  |                       GenericViewSet): | ||||||
|  |     model = Document | ||||||
|  |     queryset = Document.objects.all() | ||||||
|  |     serializer_class = DocumentSerializer | ||||||
|  |     pagination_class = StandardPagination | ||||||
|  |     permission_classes = (IsAuthenticated,) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class LogViewSet(ReadOnlyModelViewSet): | ||||||
|  |     model = Log | ||||||
|  |     queryset = Log.objects.all().by_group() | ||||||
|  |     serializer_class = LogSerializer | ||||||
|  |     pagination_class = StandardPagination | ||||||
|  |     permission_classes = (IsAuthenticated,) | ||||||
|  | |||||||
| @ -1,12 +0,0 @@ | |||||||
| from django.contrib import admin |  | ||||||
| 
 |  | ||||||
| from .models import Log |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class LogAdmin(admin.ModelAdmin): |  | ||||||
| 
 |  | ||||||
|     list_display = ("message", "level", "component") |  | ||||||
|     list_filter = ("level", "component",) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| admin.site.register(Log, LogAdmin) |  | ||||||
| @ -1,5 +0,0 @@ | |||||||
| from django.apps import AppConfig |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class LoggerConfig(AppConfig): |  | ||||||
|     name = 'logger' |  | ||||||
| @ -1,50 +0,0 @@ | |||||||
| from django.db import models |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class Log(models.Model): |  | ||||||
| 
 |  | ||||||
|     LEVEL_ERROR = 1 |  | ||||||
|     LEVEL_WARNING = 2 |  | ||||||
|     LEVEL_INFO = 3 |  | ||||||
|     LEVEL_DEBUG = 4 |  | ||||||
|     LEVELS = ( |  | ||||||
|         (LEVEL_ERROR, "Error"), |  | ||||||
|         (LEVEL_WARNING, "Warning"), |  | ||||||
|         (LEVEL_INFO, "Informational"), |  | ||||||
|         (LEVEL_DEBUG, "Debugging"), |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
|     COMPONENT_CONSUMER = 1 |  | ||||||
|     COMPONENT_MAIL = 2 |  | ||||||
|     COMPONENTS = ( |  | ||||||
|         (COMPONENT_CONSUMER, "Consumer"), |  | ||||||
|         (COMPONENT_MAIL, "Mail Fetcher") |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
|     time = models.DateTimeField(auto_now_add=True) |  | ||||||
|     message = models.TextField() |  | ||||||
|     level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO) |  | ||||||
|     component = models.PositiveIntegerField(choices=COMPONENTS) |  | ||||||
| 
 |  | ||||||
|     class Meta(object): |  | ||||||
|         ordering = ("-time",) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def error(cls, message, component): |  | ||||||
|         cls.objects.create( |  | ||||||
|             message=message, level=cls.LEVEL_ERROR, component=component) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def warning(cls, message, component): |  | ||||||
|         cls.objects.create( |  | ||||||
|             message=message, level=cls.LEVEL_WARNING, component=component) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def info(cls, message, component): |  | ||||||
|         cls.objects.create( |  | ||||||
|             message=message, level=cls.LEVEL_INFO, component=component) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def debug(cls, message, component): |  | ||||||
|         cls.objects.create( |  | ||||||
|             message=message, level=cls.LEVEL_DEBUG, component=component) |  | ||||||
| @ -1,3 +0,0 @@ | |||||||
| from django.test import TestCase |  | ||||||
| 
 |  | ||||||
| # Create your tests here. |  | ||||||
| @ -1,3 +0,0 @@ | |||||||
| from django.shortcuts import render |  | ||||||
| 
 |  | ||||||
| # Create your views here. |  | ||||||
| @ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/ | |||||||
| 
 | 
 | ||||||
| import os | import os | ||||||
| 
 | 
 | ||||||
|  | from dotenv import load_dotenv | ||||||
|  | 
 | ||||||
| # Build paths inside the project like this: os.path.join(BASE_DIR, ...) | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) | ||||||
| BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||||||
| 
 | 
 | ||||||
| @ -42,7 +44,8 @@ INSTALLED_APPS = [ | |||||||
|     "django_extensions", |     "django_extensions", | ||||||
| 
 | 
 | ||||||
|     "documents", |     "documents", | ||||||
|     "logger", | 
 | ||||||
|  |     "rest_framework", | ||||||
| 
 | 
 | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| @ -87,12 +90,12 @@ DATABASES = { | |||||||
|         "NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"), |         "NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"), | ||||||
|     } |     } | ||||||
| } | } | ||||||
| if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"): | if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"): | ||||||
|     DATABASES["default"] = { |     DATABASES["default"] = { | ||||||
|         "ENGINE": "django.db.backends.postgresql_psycopg2", |         "ENGINE": "django.db.backends.postgresql_psycopg2", | ||||||
|         "NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"), |         "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"), | ||||||
|         "USER": os.environ.get("PAPERLESS_DBUSER"), |         "USER": os.getenv("PAPERLESS_DBUSER"), | ||||||
|         "PASSWORD": os.environ.get("PAPERLESS_DBPASS") |         "PASSWORD": os.getenv("PAPERLESS_DBPASS") | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -139,55 +142,119 @@ STATIC_URL = '/static/' | |||||||
| MEDIA_URL = "/media/" | MEDIA_URL = "/media/" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Paperless-specific stuffs | # Paperless-specific stuff | ||||||
| # Change these paths if yours are different | # You shouldn't have to edit any of these values.  Rather, you can set these | ||||||
|  | # values in /etc/paperless.conf instead. | ||||||
| # ---------------------------------------------------------------------------- | # ---------------------------------------------------------------------------- | ||||||
| 
 | 
 | ||||||
|  | # Tap paperless.conf if it's available | ||||||
|  | if os.path.exists("/etc/paperless.conf"): | ||||||
|  |     load_dotenv("/etc/paperless.conf") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Logging | ||||||
|  | 
 | ||||||
|  | LOGGING = { | ||||||
|  |     "version": 1, | ||||||
|  |     "disable_existing_loggers": False, | ||||||
|  |     "handlers": { | ||||||
|  |         "consumer": { | ||||||
|  |             "class": "documents.loggers.PaperlessLogger", | ||||||
|  |         } | ||||||
|  |     }, | ||||||
|  |     "loggers": { | ||||||
|  |         "documents": { | ||||||
|  |             "handlers": ["consumer"], | ||||||
|  |             "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"), | ||||||
|  |         }, | ||||||
|  |     }, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| # The default language that tesseract will attempt to use when parsing | # The default language that tesseract will attempt to use when parsing | ||||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||||
| OCR_LANGUAGE = "eng" | OCR_LANGUAGE = "eng" | ||||||
| 
 | 
 | ||||||
| # The amount of threads to use for OCR | # The amount of threads to use for OCR | ||||||
| OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS") | OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") | ||||||
| 
 | 
 | ||||||
| # If this is true, any failed attempts to OCR a PDF will result in the PDF being | # If this is true, any failed attempts to OCR a PDF will result in the PDF | ||||||
| # indexed anyway, with whatever we could get.  If it's False, the file will | # being indexed anyway, with whatever we could get.  If it's False, the file | ||||||
| # simply be left in the CONSUMPTION_DIR. | # will simply be left in the CONSUMPTION_DIR. | ||||||
| FORGIVING_OCR = True | FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) | ||||||
| 
 | 
 | ||||||
| # GNUPG needs a home directory for some reason | # GNUPG needs a home directory for some reason | ||||||
| GNUPG_HOME = os.environ.get("HOME", "/dev/null") | GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||||
| 
 | 
 | ||||||
| # Convert is part of the Imagemagick package | # Convert is part of the ImageMagick package | ||||||
| CONVERT_BINARY = "/usr/bin/convert" | CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") | ||||||
|  | 
 | ||||||
|  | # Unpaper | ||||||
|  | UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||||
| 
 | 
 | ||||||
| # This will be created if it doesn't exist | # This will be created if it doesn't exist | ||||||
| SCRATCH_DIR = "/tmp/paperless" | SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") | ||||||
| 
 | 
 | ||||||
| # This is where Paperless will look for PDFs to index | # This is where Paperless will look for PDFs to index | ||||||
| CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME") | CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") | ||||||
| 
 | 
 | ||||||
| # If you want to use IMAP mail consumption, populate this with useful values. | # If you want to use IMAP mail consumption, populate this with useful values. | ||||||
| # If you leave HOST set to None, we assume you're not going to use this feature. | # If you leave HOST set to None, we assume you're not going to use this | ||||||
|  | # feature. | ||||||
| MAIL_CONSUMPTION = { | MAIL_CONSUMPTION = { | ||||||
|     "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"), |     "HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"), | ||||||
|     "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"), |     "PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"), | ||||||
|     "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"), |     "USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"), | ||||||
|     "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"), |     "PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"), | ||||||
|     "USE_SSL": True,  # If True, use SSL/TLS to connect |     "USE_SSL": True,  # If True, use SSL/TLS to connect | ||||||
|     "INBOX": "INBOX"  # The name of the inbox on the server |     "INBOX": "INBOX"  # The name of the inbox on the server | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # This is used to encrypt the original documents and decrypt them later when you | # This is used to encrypt the original documents and decrypt them later when | ||||||
| # want to download them.  Set it and change the permissions on this file to | # you want to download them.  Set it and change the permissions on this file to | ||||||
| # 0600, or set it to `None` and you'll be prompted for the passphrase at | # 0600, or set it to `None` and you'll be prompted for the passphrase at | ||||||
| # runtime.  The default looks for an environment variable. | # runtime.  The default looks for an environment variable. | ||||||
| # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things | # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things | ||||||
| # with GPG, including an interesting case where it may "encrypt" zero-byte | # with GPG, including an interesting case where it may "encrypt" zero-byte | ||||||
| # files. | # files. | ||||||
| PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") | PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") | ||||||
| 
 | 
 | ||||||
| # If you intend to use the "API" to push files into the consumer, you'll need to | # If you intend to use the "API" to push files into the consumer, you'll need | ||||||
| # provide a shared secret here.  Leaving this as the default will disable the | # to provide a shared secret here.  Leaving this as the default will disable | ||||||
| # API. | # the API. | ||||||
| UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "") | SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") | ||||||
|  | 
 | ||||||
|  | # | ||||||
|  | # TODO: Remove after 1.2 | ||||||
|  | # | ||||||
|  | # This logic is here to address issue #44, wherein we were using inconsistent | ||||||
|  | # constant names vs. environment variables.  If you're using Paperless for the | ||||||
|  | # first time, you can safely ignore everything from here on, so long as you're | ||||||
|  | # correctly defining the variables as per the documentation. | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def deprecated(before, after): | ||||||
|  |     print( | ||||||
|  |         "\n\n" | ||||||
|  |         "WARNING: {before} has been renamed to {after}.\n" | ||||||
|  |         "WARNING: Use of {before} will not work as of version 1.2." | ||||||
|  |         "\n\n".format( | ||||||
|  |             before=before, | ||||||
|  |             after=after | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | if not CONVERT_BINARY: | ||||||
|  |     CONVERT_BINARY = "convert" | ||||||
|  |     if os.getenv("PAPERLESS_CONVERT"): | ||||||
|  |         deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY") | ||||||
|  |         CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY) | ||||||
|  | 
 | ||||||
|  | if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"): | ||||||
|  |     deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR") | ||||||
|  |     CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME") | ||||||
|  | 
 | ||||||
|  | if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"): | ||||||
|  |     deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET") | ||||||
|  |     SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "") | ||||||
|  | |||||||
| @ -15,15 +15,46 @@ Including another URLconf | |||||||
|     3. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls)) |     3. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls)) | ||||||
| """ | """ | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.conf.urls import url, static | from django.conf.urls import url, static, include | ||||||
| from django.contrib import admin | from django.contrib import admin | ||||||
| 
 | 
 | ||||||
| from documents.views import PdfView, PushView | from rest_framework.routers import DefaultRouter | ||||||
|  | 
 | ||||||
|  | from documents.views import ( | ||||||
|  |     IndexView, FetchView, PushView, | ||||||
|  |     CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | router = DefaultRouter() | ||||||
|  | router.register(r'correspondents', CorrespondentViewSet) | ||||||
|  | router.register(r'tags', TagViewSet) | ||||||
|  | router.register(r'documents', DocumentViewSet) | ||||||
|  | router.register(r'logs', LogViewSet) | ||||||
| 
 | 
 | ||||||
| urlpatterns = [ | urlpatterns = [ | ||||||
|     url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"), | 
 | ||||||
|     url(r'', admin.site.urls), |     # API | ||||||
|  |     url( | ||||||
|  |         r"^api/auth/", | ||||||
|  |         include('rest_framework.urls', namespace="rest_framework") | ||||||
|  |     ), | ||||||
|  |     url(r"^api/", include(router.urls, namespace="drf")), | ||||||
|  | 
 | ||||||
|  |     # Normal pages (coming soon) | ||||||
|  |     # url(r"^$", IndexView.as_view(), name="index"), | ||||||
|  | 
 | ||||||
|  |     # File downloads | ||||||
|  |     url( | ||||||
|  |         r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$", | ||||||
|  |         FetchView.as_view(), | ||||||
|  |         name="fetch" | ||||||
|  |     ), | ||||||
|  | 
 | ||||||
|  |     # The Django admin | ||||||
|  |     url(r"admin/", admin.site.urls), | ||||||
|  |     url(r"", admin.site.urls),  # This is going away | ||||||
|  | 
 | ||||||
| ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) | ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) | ||||||
| 
 | 
 | ||||||
| if settings.UPLOAD_SHARED_SECRET: | if settings.SHARED_SECRET: | ||||||
|     urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push")) |     urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push")) | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = (0, 0, 6) | __version__ = (0, 1, 1) | ||||||
|  | |||||||
							
								
								
									
										23
									
								
								src/tox.ini
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/tox.ini
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,23 @@ | |||||||
|  | # Tox (http://tox.testrun.org/) is a tool for running tests | ||||||
|  | # in multiple virtualenvs. This configuration file will run the | ||||||
|  | # test suite on all supported python versions. To use it, "pip install tox" | ||||||
|  | # and then run "tox" from this directory. | ||||||
|  | 
 | ||||||
|  | [tox] | ||||||
|  | skipsdist = True | ||||||
|  | envlist = py34, py35, pep8 | ||||||
|  | 
 | ||||||
|  | [testenv] | ||||||
|  | commands = {envpython} manage.py test | ||||||
|  | deps = -r{toxinidir}/../requirements.txt | ||||||
|  | setenv = | ||||||
|  |     PAPERLESS_CONSUME=/tmp | ||||||
|  |     PAPERLESS_PASSPHRASE=THISISNOTASECRET | ||||||
|  |     PAPERLESS_SECRET=paperless | ||||||
|  | 
 | ||||||
|  | [testenv:pep8] | ||||||
|  | commands=pep8 | ||||||
|  | deps=pep8 | ||||||
|  | 
 | ||||||
|  | [pep8] | ||||||
|  | exclude=.tox,migrations,paperless/settings.py | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user