mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-25 15:52:35 -04:00 
			
		
		
		
	Merge branch 'master' of github.com:danielquinn/paperless
This commit is contained in:
		
						commit
						5c59120c57
					
				| @ -1,5 +1,9 @@ | ||||
| language: python | ||||
| 
 | ||||
| before_install: | ||||
| - sudo apt-get update -qq | ||||
| - sudo apt-get install -qq libpoppler-cpp-dev | ||||
| 
 | ||||
| sudo: false | ||||
| 
 | ||||
| matrix: | ||||
|  | ||||
							
								
								
									
										75
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										75
									
								
								Dockerfile
									
									
									
									
									
								
							| @ -1,50 +1,47 @@ | ||||
| FROM python:3.5 | ||||
| MAINTAINER Pit Kleyersburg <pitkley@googlemail.com> | ||||
| FROM alpine:3.7 | ||||
| 
 | ||||
| # Install dependencies | ||||
| RUN apt-get update \ | ||||
|     && apt-get install -y --no-install-recommends \ | ||||
|         sudo \ | ||||
|         tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \ | ||||
|     && rm -rf /var/lib/apt/lists/* | ||||
| 
 | ||||
| # Install python dependencies | ||||
| RUN mkdir -p /usr/src/paperless | ||||
| WORKDIR /usr/src/paperless | ||||
| COPY requirements.txt /usr/src/paperless/ | ||||
| RUN pip install --no-cache-dir -r requirements.txt | ||||
| LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \ | ||||
|       contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \ | ||||
|         Sven Fischer <git-dev@linux4tw.de>" | ||||
| 
 | ||||
| # Copy application | ||||
| RUN mkdir -p /usr/src/paperless/src | ||||
| RUN mkdir -p /usr/src/paperless/data | ||||
| RUN mkdir -p /usr/src/paperless/media | ||||
| COPY requirements.txt /usr/src/paperless/ | ||||
| COPY src/ /usr/src/paperless/src/ | ||||
| COPY data/ /usr/src/paperless/data/ | ||||
| COPY media/ /usr/src/paperless/media/ | ||||
| 
 | ||||
| # Set consumption directory | ||||
| ENV PAPERLESS_CONSUMPTION_DIR /consume | ||||
| RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR | ||||
| 
 | ||||
| # Migrate database | ||||
| WORKDIR /usr/src/paperless/src | ||||
| RUN ./manage.py migrate | ||||
| 
 | ||||
| # Create user | ||||
| RUN groupadd -g 1000 paperless \ | ||||
|     && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ | ||||
|     && chown -Rh paperless:paperless /usr/src/paperless | ||||
| 
 | ||||
| # Set export directory | ||||
| ENV PAPERLESS_EXPORT_DIR /export | ||||
| RUN mkdir -p $PAPERLESS_EXPORT_DIR | ||||
| 
 | ||||
| # Setup entrypoint | ||||
| COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh | ||||
| RUN chmod 755 /sbin/docker-entrypoint.sh | ||||
| 
 | ||||
| # Mount volumes | ||||
| # Set export and consumption directories | ||||
| ENV PAPERLESS_EXPORT_DIR=/export \ | ||||
|     PAPERLESS_CONSUMPTION_DIR=/consume | ||||
| 
 | ||||
| # Install dependencies | ||||
| RUN apk --no-cache --update add \ | ||||
|         python3 gnupg libmagic bash \ | ||||
|         sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \ | ||||
|     apk --no-cache add --virtual .build-dependencies \ | ||||
|         python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \ | ||||
| # Install python dependencies | ||||
|     python3 -m ensurepip && \ | ||||
|     rm -r /usr/lib/python*/ensurepip && \ | ||||
|     cd /usr/src/paperless && \ | ||||
|     pip3 install --no-cache-dir -r requirements.txt && \ | ||||
| # Remove build dependencies | ||||
|     apk del .build-dependencies && \ | ||||
| # Create the consumption directory | ||||
|     mkdir -p $PAPERLESS_CONSUMPTION_DIR && \ | ||||
| # Migrate database | ||||
|     ./src/manage.py migrate && \ | ||||
| # Create user | ||||
|     addgroup -g 1000 paperless && \ | ||||
|     adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \ | ||||
|     chown -Rh paperless:paperless /usr/src/paperless && \ | ||||
|     mkdir -p $PAPERLESS_EXPORT_DIR && \ | ||||
| # Setup entrypoint | ||||
|     chmod 755 /sbin/docker-entrypoint.sh | ||||
| 
 | ||||
| WORKDIR /usr/src/paperless/src | ||||
| # Mount volumes and set Entrypoint | ||||
| VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] | ||||
| 
 | ||||
| ENTRYPOINT ["/sbin/docker-entrypoint.sh"] | ||||
| CMD ["--help"] | ||||
|  | ||||
							
								
								
									
										12
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								README.rst
									
									
									
									
									
								
							| @ -4,7 +4,6 @@ Paperless | ||||
| |Documentation| | ||||
| |Chat| | ||||
| |Travis| | ||||
| |Dependencies| | ||||
| 
 | ||||
| Index and archive all of your scanned paper documents | ||||
| 
 | ||||
| @ -28,12 +27,11 @@ scanner produces | ||||
| 
 | ||||
| 1. Buy a document scanner that can write to a place on your network.  If you | ||||
|    need some inspiration, have a look at the `scanner recommendations`_ page. | ||||
|    recommended by another user. | ||||
| 2. Set it up to "scan to FTP" or something similar. It should be able to push | ||||
|    scanned images to a server without you having to do anything.  If your | ||||
|    scanner doesn't know how to automatically upload the file somewhere, you can | ||||
|    always do that manually.  Paperless doesn't care how the documents get into | ||||
|    its local consumption directory. | ||||
|    scanned images to a server without you having to do anything.  Of course if | ||||
|    your scanner doesn't know how to automatically upload the file somewhere, | ||||
|    you can always do that manually.  Paperless doesn't care how the documents | ||||
|    get into its local consumption directory. | ||||
| 3. Have the target server run the Paperless consumption script to OCR the file | ||||
|    and index it into a local database. | ||||
| 4. Use the web frontend to sift through the database and find what you want. | ||||
| @ -140,5 +138,3 @@ work and they need the money a lot more than I do. | ||||
|    :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge | ||||
| .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master | ||||
|    :target: https://travis-ci.org/danielquinn/paperless | ||||
| .. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg | ||||
|    :target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500 | ||||
|  | ||||
| @ -2,7 +2,7 @@ version: '2' | ||||
| 
 | ||||
| services: | ||||
|     webserver: | ||||
|         image: pitkley/paperless | ||||
|         build: ./ | ||||
|         ports: | ||||
|             # You can adapt the port you want Paperless to listen on by | ||||
|             # modifying the part before the `:`. | ||||
| @ -20,7 +20,7 @@ services: | ||||
|         command: ["runserver", "--insecure", "0.0.0.0:8000"] | ||||
| 
 | ||||
|     consumer: | ||||
|         image: pitkley/paperless | ||||
|         build: ./ | ||||
|         volumes: | ||||
|             - data:/usr/src/paperless/data | ||||
|             - media:/usr/src/paperless/media | ||||
|  | ||||
| @ -1,243 +1,296 @@ | ||||
| Changelog | ||||
| ######### | ||||
| 
 | ||||
| * 1.1.0 | ||||
|   * Fix for `#283`_, a redirect bug which broke interactions with | ||||
| 1.2.0 | ||||
| ===== | ||||
| 
 | ||||
| * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ | ||||
|   and `Pit`_. | ||||
| * `BastianPoe`_ has added the long-awaited feature to automatically skip the | ||||
|   OCR step when the PDF already contains text. This can be overridden by | ||||
|   setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or | ||||
|   in the environment.  Note that this also means that Paperless now requires | ||||
|   ``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run | ||||
|   ``pip install -r requirements.txt`` after the usual ``git pull`` to | ||||
|   properly update. | ||||
| 
 | ||||
| 1.1.0 | ||||
| ===== | ||||
| 
 | ||||
| * Fix for `#283`_, a redirect bug which broke interactions with | ||||
|   paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it. | ||||
|   * Addition of an optional new financial year filter, courtesy of | ||||
| * Addition of an optional new financial year filter, courtesy of | ||||
|   `David Martin`_ `#256`_ | ||||
|   * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of | ||||
| * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of | ||||
|   `Dan Panzarella`_ | ||||
| 
 | ||||
| * 1.0.0 | ||||
|   * Upgrade to Django 1.11.  **You'll need to run | ||||
| 1.0.0 | ||||
| ===== | ||||
| 
 | ||||
| * Upgrade to Django 1.11.  **You'll need to run | ||||
|   ``pip install -r requirements.txt`` after the usual ``git pull`` to | ||||
|   properly update**. | ||||
|   * Replace the templatetag-based hack we had for document listing in favour of | ||||
| * Replace the templatetag-based hack we had for document listing in favour of | ||||
|   a slightly less ugly solution in the form of another template tag with less | ||||
|   copypasta. | ||||
|   * Support for multi-word-matches for auto-tagging thanks to an excellent | ||||
| * Support for multi-word-matches for auto-tagging thanks to an excellent | ||||
|   patch from `ishirav`_ `#277`_. | ||||
|   * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of | ||||
| * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of | ||||
|   the text and checkboxes under some resolutions `#272`_. | ||||
|   * Patched the Docker config to force the serving of static files.  Credit for | ||||
| * Patched the Docker config to force the serving of static files.  Credit for | ||||
|   this one goes to `dev-rke`_ via `#248`_. | ||||
|   * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_. | ||||
|   * Date fields in the admin are now expressed as HTML5 date fields thanks to | ||||
| * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_. | ||||
| * Date fields in the admin are now expressed as HTML5 date fields thanks to | ||||
|   `Lukas Winkler`_'s issue `#278`_ | ||||
| 
 | ||||
| * 0.8.0 | ||||
|   * Paperless can now run in a subdirectory on a host (``/paperless``), rather | ||||
| 0.8.0 | ||||
| ===== | ||||
| 
 | ||||
| * Paperless can now run in a subdirectory on a host (``/paperless``), rather | ||||
|   than always running in the root (``/``) thanks to `maphy-psd`_'s work on | ||||
|   `#255`_. | ||||
| 
 | ||||
| * 0.7.0 | ||||
|   * **Potentially breaking change**: As per `#235`_, Paperless will no longer | ||||
| 0.7.0 | ||||
| ===== | ||||
| 
 | ||||
| * **Potentially breaking change**: As per `#235`_, Paperless will no longer | ||||
|   automatically delete documents attached to correspondents when those | ||||
|   correspondents are themselves deleted.  This was Django's default | ||||
|   behaviour, but didn't make much sense in Paperless' case.  Thanks to | ||||
|   `Thomas Brueggemann`_ and `David Martin`_ for their input on this one. | ||||
|   * Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files | ||||
| * Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files | ||||
|   properly.  Thanks to `ayounggun`_ for reporting this one and to | ||||
|   `Kusti Skytén`_ for posting the correct solution in the Github issue. | ||||
| 
 | ||||
| * 0.6.0 | ||||
|   * Abandon the shared-secret trick we were using for the POST API in favour | ||||
| 0.6.0 | ||||
| ===== | ||||
| 
 | ||||
| * Abandon the shared-secret trick we were using for the POST API in favour | ||||
|   of BasicAuth or Django session. | ||||
|   * Fix the POST API so it actually works.  `#236`_ | ||||
|   * **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET`` | ||||
| * Fix the POST API so it actually works.  `#236`_ | ||||
| * **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET`` | ||||
|   as it was being used both for the API (now replaced with a normal auth) | ||||
|   and for email polling.  Now that we're only using it for email, this | ||||
|   variable has been renamed to ``PAPERLESS_EMAIL_SECRET``.  The old value | ||||
|   will still work for a while, but you should change your config if you've | ||||
|   been using the email polling feature.  Thanks to `Joshua Gilman`_ for all | ||||
|   the help with this feature. | ||||
| * 0.5.0 | ||||
|   * Support for fuzzy matching in the auto-tagger & auto-correspondent systems | ||||
| 
 | ||||
| 0.5.0 | ||||
| ===== | ||||
| 
 | ||||
| * Support for fuzzy matching in the auto-tagger & auto-correspondent systems | ||||
|   thanks to `Jake Gysland`_'s patch `#220`_. | ||||
|   * Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks | ||||
| * Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks | ||||
|   to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on | ||||
|   this one. | ||||
|   * Updated the import/export scripts to include support for thumbnails.  Big | ||||
| * Updated the import/export scripts to include support for thumbnails.  Big | ||||
|   thanks to `CkuT`_ for finding this shortcoming and doing the work to get | ||||
|   it fixed in `#224`_. | ||||
|   * All of the following changes are thanks to `David Martin`_: | ||||
| * All of the following changes are thanks to `David Martin`_: | ||||
|   * Bumped the dependency on pyocr to 0.4.7 so new users can make use of | ||||
|   Tesseract 4 if they so prefer (`#226`_). | ||||
|   * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_) | ||||
|   * Amended the documentation for better handling of systemd service files (`#229`_) | ||||
|   * Amended the Django Admin configuration to have nice headers (`#230`_) | ||||
| 
 | ||||
| * 0.4.1 | ||||
|   * Fix for `#206`_ wherein the pluggable parser didn't recognise files with | ||||
| 0.4.1 | ||||
| ===== | ||||
| 
 | ||||
| * Fix for `#206`_ wherein the pluggable parser didn't recognise files with | ||||
|   all-caps suffixes like ``.PDF`` | ||||
| 
 | ||||
| * 0.4.0 | ||||
|   * Introducing reminders.  See `#199`_ for more information, but the short | ||||
| 0.4.0 | ||||
| ===== | ||||
| 
 | ||||
| * Introducing reminders.  See `#199`_ for more information, but the short | ||||
|   explanation is that you can now attach simple notes & times to documents | ||||
|   which are made available via the API.  Currently, the default API | ||||
|   (basically just the Django admin) doesn't really make use of this, but | ||||
|   `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would | ||||
|   like to make use of this feature in his project. | ||||
| 
 | ||||
| * 0.3.6 | ||||
|   * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the | ||||
| 0.3.6 | ||||
| ===== | ||||
| 
 | ||||
| * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the | ||||
|   correspondent or the tags for a document. | ||||
|   * The ``content`` field is now optional, to allow for the edge case of a | ||||
| * The ``content`` field is now optional, to allow for the edge case of a | ||||
|   purely graphical document. | ||||
|   * You can no longer add documents via the admin.  This never worked in the | ||||
| * You can no longer add documents via the admin.  This never worked in the | ||||
|   first place, so all I've done here is remove the link to the broken form. | ||||
|   * The consumer code has been heavily refactored to support a pluggable | ||||
| * The consumer code has been heavily refactored to support a pluggable | ||||
|   interface.  Install a paperless consumer via pip and tell paperless about | ||||
|   it with an environment variable, and you're good to go.  Proper | ||||
|   documentation is on its way. | ||||
| 
 | ||||
| * 0.3.5 | ||||
|   * A serious facelift for the documents listing page wherein we drop the | ||||
| 0.3.5 | ||||
| ===== | ||||
| 
 | ||||
| * A serious facelift for the documents listing page wherein we drop the | ||||
|   tabular layout in favour of a tiled interface. | ||||
|   * Users can now configure the number of items per page. | ||||
|   * Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value. | ||||
|   * Moved the dotenv loading to the top of settings.py | ||||
|   * Fix for `#112`_: Added checks for binaries required for document | ||||
| * Users can now configure the number of items per page. | ||||
| * Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value. | ||||
| * Moved the dotenv loading to the top of settings.py | ||||
| * Fix for `#112`_: Added checks for binaries required for document | ||||
|   consumption. | ||||
| 
 | ||||
| * 0.3.4 | ||||
|   * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3. | ||||
| 0.3.4 | ||||
| ===== | ||||
| 
 | ||||
| * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3. | ||||
|   Note that you *can* use Django Suit with Paperless, but only in a | ||||
|   non-profit situation as their free license prohibits for-profit use.  As a | ||||
|   result, I can't bundle Suit with Paperless without conflicting with the | ||||
|   GPL.  Further development will be done against the stock Django admin. | ||||
|   * I shrunk the thumbnails a little 'cause they were too big for me, even on | ||||
| * I shrunk the thumbnails a little 'cause they were too big for me, even on | ||||
|   my high-DPI monitor. | ||||
|   * BasicAuth support for document and thumbnail downloads, as well as the Push | ||||
| * BasicAuth support for document and thumbnail downloads, as well as the Push | ||||
|   API thanks to @thomasbrueggemann.  See `#179`_. | ||||
| 
 | ||||
| * 0.3.3 | ||||
|   * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw! | ||||
|   * Timezone, items per page, and default language are now all configurable, | ||||
| 0.3.3 | ||||
| ===== | ||||
| 
 | ||||
| * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw! | ||||
| * Timezone, items per page, and default language are now all configurable, | ||||
|   also thanks to @ekw. | ||||
| 
 | ||||
| * 0.3.2 | ||||
|   * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the | ||||
| 0.3.2 | ||||
| ===== | ||||
| 
 | ||||
| * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the | ||||
|   user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need | ||||
|   arise. | ||||
| 
 | ||||
| * 0.3.1 | ||||
|   * Added a default value for ``CONVERT_BINARY`` | ||||
| 0.3.1 | ||||
| ===== | ||||
| 
 | ||||
| * 0.3.0 | ||||
|   * Updated to using django-filter 1.x | ||||
|   * Added some system checks so new users aren't confused by misconfigurations. | ||||
|   * Consumer loop time is now configurable for systems with slow writes.  Just | ||||
| * Added a default value for ``CONVERT_BINARY`` | ||||
| 
 | ||||
| 0.3.0 | ||||
| ===== | ||||
| 
 | ||||
| * Updated to using django-filter 1.x | ||||
| * Added some system checks so new users aren't confused by misconfigurations. | ||||
| * Consumer loop time is now configurable for systems with slow writes.  Just | ||||
|   set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds.  The default | ||||
|   is 10. | ||||
|   * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``, | ||||
| * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``, | ||||
|   ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``.  Please use | ||||
|   ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and | ||||
|   ``PAPERLESS_SHARED_SECRET`` respectively instead. | ||||
| 
 | ||||
| * 0.2.0 | ||||
| 0.2.0 | ||||
| ===== | ||||
| 
 | ||||
|   * `#150`_: The media root is now a variable you can set in | ||||
| * `#150`_: The media root is now a variable you can set in | ||||
|   ``paperless.conf``. | ||||
|   * `#148`_: The database location (sqlite) is now a variable you can set in | ||||
| * `#148`_: The database location (sqlite) is now a variable you can set in | ||||
|   ``paperless.conf``. | ||||
|   * `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch`` | ||||
| * `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch`` | ||||
|   URL. | ||||
|   * `#131`_: Document files are now automatically removed from disk when | ||||
| * `#131`_: Document files are now automatically removed from disk when | ||||
|   they're deleted in Paperless. | ||||
|   * `#121`_: Fixed a bug where Paperless wasn't setting document creation time | ||||
| * `#121`_: Fixed a bug where Paperless wasn't setting document creation time | ||||
|   based on the file naming scheme. | ||||
|   * `#81`_: Added a hook to run an arbitrary script after every document is | ||||
| * `#81`_: Added a hook to run an arbitrary script after every document is | ||||
|   consumed. | ||||
|   * `#98`_: Added optional environment variables for ImageMagick so that it | ||||
| * `#98`_: Added optional environment variables for ImageMagick so that it | ||||
|   doesn't explode when handling Very Large Documents or when it's just | ||||
|   running on a low-memory system.  Thanks to `Florian Harr`_ for his help on | ||||
|   this one. | ||||
|   * `#89`_ Ported the auto-tagging code to correspondents as well.  Thanks to | ||||
| * `#89`_ Ported the auto-tagging code to correspondents as well.  Thanks to | ||||
|   `Justin Snyman`_ for the pointers in the issue queue. | ||||
|   * Added support for guessing the date from the file name along with the | ||||
| * Added support for guessing the date from the file name along with the | ||||
|   correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull | ||||
|   request that I took forever to merge and to `Pit`_ for his efforts on the | ||||
|   regex front. | ||||
|   * `#94`_: Restored support for changing the created date in the UI.  Thanks | ||||
| * `#94`_: Restored support for changing the created date in the UI.  Thanks | ||||
|   to `Martin Honermeyer`_ and `Tim White`_ for working with me on this. | ||||
| 
 | ||||
| * 0.1.1 | ||||
| 0.1.1 | ||||
| ===== | ||||
| 
 | ||||
|   * Potentially **Breaking Change**: All references to "sender" in the code | ||||
| * Potentially **Breaking Change**: All references to "sender" in the code | ||||
|   have been renamed to "correspondent" to better reflect the nature of the | ||||
|   property (one could quite reasonably scan a document before sending it to | ||||
|   someone.) | ||||
|   * `#67`_: Rewrote the document exporter and added a new importer that allows | ||||
| * `#67`_: Rewrote the document exporter and added a new importer that allows | ||||
|   for full metadata retention without depending on the file name and | ||||
|   modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_, | ||||
|   `Florian Jung`_, and `Christopher Luu`_ for their code snippets and | ||||
|   contributing conversation that led to this change. | ||||
|   * `#20`_: Added *unpaper* support to help in cleaning up the scanned image | ||||
| * `#20`_: Added *unpaper* support to help in cleaning up the scanned image | ||||
|   before it's OCR'd.  Thanks to `Pit`_ for this one. | ||||
|   * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI. | ||||
|   * `#68`_: Added support for using a proper config file at | ||||
| * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI. | ||||
| * `#68`_: Added support for using a proper config file at | ||||
|   ``/etc/paperless.conf`` and modified the systemd unit files to use it. | ||||
|   * Refactored the Vagrant installation process to use environment variables | ||||
| * Refactored the Vagrant installation process to use environment variables | ||||
|   rather than asking the user to modify ``settings.py``. | ||||
|   * `#44`_: Harmonise environment variable names with constant names. | ||||
|   * `#60`_: Setup logging to actually use the Python native logging framework. | ||||
|   * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images | ||||
| * `#44`_: Harmonise environment variable names with constant names. | ||||
| * `#60`_: Setup logging to actually use the Python native logging framework. | ||||
| * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images | ||||
|   to be imported but made unavailable. | ||||
| 
 | ||||
| * 0.1.0 | ||||
| 0.1.0 | ||||
| ===== | ||||
| 
 | ||||
|   * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and | ||||
| * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and | ||||
|   `Tikitu de Jager`_ for this one, and especially to `Pit`_ | ||||
|   who spearheaded this effort. | ||||
|   * A simple REST API is in place, but it should be considered unstable. | ||||
|   * Cleaned up the consumer to use temporary directories instead of a single | ||||
| * A simple REST API is in place, but it should be considered unstable. | ||||
| * Cleaned up the consumer to use temporary directories instead of a single | ||||
|   scratch space.  (Thanks `Pit`_) | ||||
|   * Improved the efficiency of the consumer by parsing pages more intelligently | ||||
| * Improved the efficiency of the consumer by parsing pages more intelligently | ||||
|   and introducing a threaded OCR process (thanks again `Pit`_). | ||||
|   * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_. | ||||
|   * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by | ||||
| * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_. | ||||
| * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by | ||||
|   `Pit`_. | ||||
|   * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) | ||||
|   * `#54`_: Documented the re-tagger (`zedster`_) | ||||
|   * `#57`_: Make sure file is preserved on import failure (`darkmatter`_) | ||||
|   * Added tox with pep8 checking | ||||
| * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) | ||||
| * `#54`_: Documented the re-tagger (`zedster`_) | ||||
| * `#57`_: Make sure file is preserved on import failure (`darkmatter`_) | ||||
| * Added tox with pep8 checking | ||||
| 
 | ||||
| * 0.0.6 | ||||
| 0.0.6 | ||||
| ===== | ||||
| 
 | ||||
|   * Added support for parallel OCR (significant work from `Pit`_) | ||||
|   * Sped up the language detection (significant work from `Pit`_) | ||||
|   * Added simple logging | ||||
| * Added support for parallel OCR (significant work from `Pit`_) | ||||
| * Sped up the language detection (significant work from `Pit`_) | ||||
| * Added simple logging | ||||
| 
 | ||||
| * 0.0.5 | ||||
| 0.0.5 | ||||
| ===== | ||||
| 
 | ||||
|   * Added support for image files as documents (png, jpg, gif, tiff) | ||||
|   * Added a crude means of HTTP POST for document imports | ||||
|   * Added IMAP mail support | ||||
|   * Added a re-tagging utility | ||||
|   * Documentation for the above as well as data migration | ||||
| * Added support for image files as documents (png, jpg, gif, tiff) | ||||
| * Added a crude means of HTTP POST for document imports | ||||
| * Added IMAP mail support | ||||
| * Added a re-tagging utility | ||||
| * Documentation for the above as well as data migration | ||||
| 
 | ||||
| * 0.0.4 | ||||
| 0.0.4 | ||||
| ===== | ||||
| 
 | ||||
|   * Added automated tagging based on keyword matching | ||||
|   * Cleaned up the document listing page | ||||
|   * Removed ``User`` and ``Group`` from the admin | ||||
|   * Added ``pytz`` to the list of requirements | ||||
| * Added automated tagging based on keyword matching | ||||
| * Cleaned up the document listing page | ||||
| * Removed ``User`` and ``Group`` from the admin | ||||
| * Added ``pytz`` to the list of requirements | ||||
| 
 | ||||
| * 0.0.3 | ||||
| 0.0.3 | ||||
| ===== | ||||
| 
 | ||||
|   * Added basic tagging | ||||
| * Added basic tagging | ||||
| 
 | ||||
| * 0.0.2 | ||||
| 0.0.2 | ||||
| ===== | ||||
| 
 | ||||
|   * Added language detection | ||||
|   * Added datestamps to ``document_exporter``. | ||||
|   * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``. | ||||
| * Added language detection | ||||
| * Added datestamps to ``document_exporter``. | ||||
| * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``. | ||||
| 
 | ||||
| * 0.0.1 | ||||
| 0.0.1 | ||||
| ===== | ||||
| 
 | ||||
|   * Initial release | ||||
| * Initial release | ||||
| 
 | ||||
| .. _Brian Conn: https://github.com/TheConnMan | ||||
| .. _Christopher Luu: https://github.com/nuudles | ||||
| @ -268,6 +321,8 @@ Changelog | ||||
| .. _Lukas Winkler: https://github.com/Findus23 | ||||
| .. _chris-aeviator: https://github.com/chris-aeviator | ||||
| .. _Dan Panzarella: https://github.com/pzl | ||||
| .. _addadi: https://github.com/addadi | ||||
| .. _BastianPoe: https://github.com/BastianPoe | ||||
| 
 | ||||
| .. _#20: https://github.com/danielquinn/paperless/issues/20 | ||||
| .. _#44: https://github.com/danielquinn/paperless/issues/44 | ||||
| @ -317,3 +372,5 @@ Changelog | ||||
| .. _#283: https://github.com/danielquinn/paperless/issues/283 | ||||
| .. _#256: https://github.com/danielquinn/paperless/pull/256 | ||||
| .. _#285: https://github.com/danielquinn/paperless/pull/285 | ||||
| 
 | ||||
| .. _pipenv: https://docs.pipenv.org/ | ||||
|  | ||||
| @ -11,24 +11,27 @@ should work) that has the following software installed: | ||||
| * `Tesseract`_, plus its language files matching your document base. | ||||
| * `Imagemagick`_ version 6.7.5 or higher | ||||
| * `unpaper`_ | ||||
| * `libpoppler-cpp-dev`_ PDF rendering library | ||||
| 
 | ||||
| .. _Python3: https://python.org/ | ||||
| .. _GNU Privacy Guard: https://gnupg.org | ||||
| .. _Tesseract: https://github.com/tesseract-ocr | ||||
| .. _Imagemagick: http://imagemagick.org/ | ||||
| .. _unpaper: https://www.flameeyes.eu/projects/unpaper | ||||
| .. _libpoppler-cpp-dev: https://poppler.freedesktop.org/ | ||||
| 
 | ||||
| Notably, you should confirm how you access your Python3 installation.  Many | ||||
| Linux distributions will install Python3 in parallel to Python2, using the names | ||||
| ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and | ||||
| ``pip``.  Running Paperless with Python2 will likely break things, so make sure that  | ||||
| you're using the right version. | ||||
| Linux distributions will install Python3 in parallel to Python2, using the | ||||
| names ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and | ||||
| ``pip``.  Running Paperless with Python2 will likely break things, so make sure | ||||
| that you're using the right version. | ||||
| 
 | ||||
| For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to | ||||
| refer to their Python3 versions. | ||||
| 
 | ||||
| In addition to the above, there are a number of Python requirements, all of | ||||
| which are listed in a file called ``requirements.txt`` in the project root directory. | ||||
| which are listed in a file called ``requirements.txt`` in the project root | ||||
| directory. | ||||
| 
 | ||||
| If you're not working on a virtual environment (like Vagrant or Docker), you | ||||
| should probably be using a virtualenv, but that's your call.  The reasons why | ||||
| @ -39,12 +42,13 @@ probably figure that out before continuing. | ||||
| 
 | ||||
| .. _requirements-apple: | ||||
| 
 | ||||
| Apple-tastic Complications | ||||
| -------------------------- | ||||
| Problems with Imagemagick & PDFs | ||||
| -------------------------------- | ||||
| 
 | ||||
| Some users have `run into problems`_ with installing ImageMagick on Apple | ||||
| systems using HomeBrew.  The solution appears to be to install ghostscript as | ||||
| well as ImageMagick: | ||||
| Some users have `run into problems`_ with getting ImageMagick to do its thing | ||||
| with PDFs.  Often this is the case with Apple systems using HomeBrew, but some | ||||
| Linux distributions have had the same problem.  The solution appears to be to | ||||
| install ghostscript as well as ImageMagick: | ||||
| 
 | ||||
| .. _run into problems: https://github.com/danielquinn/paperless/issues/25 | ||||
| 
 | ||||
|  | ||||
| @ -175,7 +175,8 @@ Docker Method | ||||
|    modified versions of the configuration files. | ||||
| 4. Modify ``docker-compose.yml`` to your preferences, following the | ||||
|    instructions in comments in the file. The only change that is a hard | ||||
|    requirement is to specify where the consumption directory should mount. | ||||
|    requirement is to specify where the consumption directory should | ||||
|    mount. [#dockercomposeyml]_ | ||||
| 5. Modify ``docker-compose.env`` and adapt the following environment variables: | ||||
| 
 | ||||
|    ``PAPERLESS_PASSPHRASE`` | ||||
| @ -192,7 +193,7 @@ Docker Method | ||||
|      default English, set this parameter to a space separated list of | ||||
|      three-letter language-codes after `ISO 639-2/T`_. For a list of available | ||||
|      languages -- including their three letter codes -- see the | ||||
|      `Debian packagelist`_. | ||||
|      `Alpine packagelist`_. | ||||
| 
 | ||||
|    ``USERMAP_UID`` and ``USERMAP_GID`` | ||||
|      If you want to mount the consumption volume (directory ``/consume`` within | ||||
| @ -282,12 +283,17 @@ Docker Method | ||||
| .. _Docker: https://www.docker.com/ | ||||
| .. _docker-compose: https://docs.docker.com/compose/install/ | ||||
| .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes | ||||
| .. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- | ||||
| .. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64 | ||||
| 
 | ||||
| .. [#compose] You of course don't have to use docker-compose, but it | ||||
|    simplifies deployment immensely. If you know your way around Docker, feel | ||||
|    free to tinker around without using compose! | ||||
| 
 | ||||
| .. [#dockercomposeyml] If you're upgrading your docker-compose images from | ||||
|    version 1.1.0 or earlier, you might need to edit your | ||||
|    ``docker-compose.yml`` file and change the ``image: pitkley/paperless`` | ||||
|    directive in both the ``webserver`` and ``consumer`` sections to | ||||
|    ``build: ./``, as per the newer ``docker-compose.yml.example`` file. | ||||
| 
 | ||||
| .. _setup-permanent: | ||||
| 
 | ||||
|  | ||||
| @ -14,6 +14,7 @@ python-dotenv>=0.6.2 | ||||
| python-gnupg>=0.3.9 | ||||
| pytz>=2016.10 | ||||
| gunicorn==19.7.1 | ||||
| pdftotext>=2.0.1 | ||||
| 
 | ||||
| # For the tests | ||||
| factory-boy | ||||
|  | ||||
| @ -9,7 +9,7 @@ map_uidgid() { | ||||
|     USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} | ||||
|     if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then | ||||
|         echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" | ||||
|         groupmod -g "${USERMAP_GID}" paperless | ||||
|         addgroup -g "${USERMAP_GID}" paperless | ||||
|         sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd | ||||
|     fi | ||||
| } | ||||
| @ -56,25 +56,24 @@ install_languages() { | ||||
|         return | ||||
|     fi | ||||
| 
 | ||||
|     # Update apt-lists | ||||
|     apt-get update | ||||
| 
 | ||||
|     # Loop over languages to be installed | ||||
|     for lang in "${langs[@]}"; do | ||||
|         pkg="tesseract-ocr-$lang" | ||||
|         if dpkg -s "$pkg" > /dev/null 2>&1; then | ||||
|         pkg="tesseract-ocr-data-$lang" | ||||
| 
 | ||||
|         # English is installed by default | ||||
|         if [ "$lang" ==  "eng" ]; then | ||||
|             continue | ||||
|         fi | ||||
|          | ||||
|         if ! apt-cache show "$pkg" > /dev/null 2>&1; then | ||||
|         if apk info -e "$pkg" > /dev/null 2>&1; then | ||||
|             continue | ||||
|         fi | ||||
|         if ! apk info "$pkg" > /dev/null 2>&1; then | ||||
|             continue | ||||
|         fi | ||||
| 
 | ||||
|         apt-get install "$pkg" | ||||
|         apk --no-cache --update add "$pkg" | ||||
|     done | ||||
| 
 | ||||
|     # Remove apt lists | ||||
|     rm -rf /var/lib/apt/lists/* | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
| # The amount of threads to use for OCR | ||||
| OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") | ||||
| 
 | ||||
| # OCR all documents? | ||||
| OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) | ||||
| 
 | ||||
| # If this is true, any failed attempts to OCR a PDF will result in the PDF | ||||
| # being indexed anyway, with whatever we could get.  If it's False, the file | ||||
| # will simply be left in the CONSUMPTION_DIR. | ||||
|  | ||||
| @ -3,6 +3,7 @@ import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
| import pdftotext | ||||
| 
 | ||||
| import langdetect | ||||
| import pyocr | ||||
| @ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|     OCR_ALWAYS = settings.OCR_ALWAYS | ||||
| 
 | ||||
|     def get_thumbnail(self): | ||||
|         """ | ||||
| @ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser): | ||||
| 
 | ||||
|         return os.path.join(self.tempdir, "convert-0000.png") | ||||
| 
 | ||||
|     def _is_ocred(self): | ||||
|         # Extract text from PDF using pdftotext | ||||
|         text = get_text_from_pdf(self.document_path) | ||||
| 
 | ||||
|         # We assume, that a PDF with at least 50 characters contains text | ||||
|         # (so no OCR required) | ||||
|         if len(text) > 50: | ||||
|             return True | ||||
| 
 | ||||
|         return False | ||||
| 
 | ||||
|     def get_text(self): | ||||
|         if not self.OCR_ALWAYS and self._is_ocred(): | ||||
|             self.log("info", "Skipping OCR, using Text from PDF") | ||||
|             return get_text_from_pdf(self.document_path) | ||||
| 
 | ||||
|         images = self._get_greyscale() | ||||
| 
 | ||||
| @ -212,3 +228,13 @@ def image_to_string(args): | ||||
|             except (TesseractError, OtherTesseractError): | ||||
|                 pass | ||||
|         return ocr.image_to_string(f, lang=lang) | ||||
| 
 | ||||
| 
 | ||||
| def get_text_from_pdf(pdf_file): | ||||
|     with open(pdf_file, "rb") as f: | ||||
|         try: | ||||
|             pdf = pdftotext.PDF(f) | ||||
|         except pdftotext.Error: | ||||
|             return False | ||||
| 
 | ||||
|     return "\n".join(pdf) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user