mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Merge pull request #74 from pitkley/feature/unpaper
Add unpaper support
This commit is contained in:
		
						commit
						500c615e56
					
				@ -5,7 +5,7 @@ MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
 | 
				
			|||||||
RUN apt-get update \
 | 
					RUN apt-get update \
 | 
				
			||||||
    && apt-get install -y --no-install-recommends \
 | 
					    && apt-get install -y --no-install-recommends \
 | 
				
			||||||
        sudo \
 | 
					        sudo \
 | 
				
			||||||
        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
 | 
					        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
 | 
				
			||||||
    && rm -rf /var/lib/apt/lists/*
 | 
					    && rm -rf /var/lib/apt/lists/*
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Install python dependencies
 | 
					# Install python dependencies
 | 
				
			||||||
 | 
				
			|||||||
@ -10,11 +10,13 @@ should work) that has the following software installed on it:
 | 
				
			|||||||
* `GNU Privacy Guard`_
 | 
					* `GNU Privacy Guard`_
 | 
				
			||||||
* `Tesseract`_
 | 
					* `Tesseract`_
 | 
				
			||||||
* `Imagemagick`_
 | 
					* `Imagemagick`_
 | 
				
			||||||
 | 
					* `unpaper`_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.. _Python3: https://python.org/
 | 
					.. _Python3: https://python.org/
 | 
				
			||||||
.. _GNU Privacy Guard: https://gnupg.org
 | 
					.. _GNU Privacy Guard: https://gnupg.org
 | 
				
			||||||
.. _Tesseract: https://github.com/tesseract-ocr
 | 
					.. _Tesseract: https://github.com/tesseract-ocr
 | 
				
			||||||
.. _Imagemagick: http://imagemagick.org/
 | 
					.. _Imagemagick: http://imagemagick.org/
 | 
				
			||||||
 | 
					.. _unpaper: https://www.flameeyes.eu/projects/unpaper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Notably, you should confirm how you access your Python3 installation.  Many
 | 
					Notably, you should confirm how you access your Python3 installation.  Many
 | 
				
			||||||
Linux distributions will install Python3 in parallel to Python2, using the names
 | 
					Linux distributions will install Python3 in parallel to Python2, using the names
 | 
				
			||||||
 | 
				
			|||||||
@ -5,7 +5,7 @@ apt-get update
 | 
				
			|||||||
apt-get build-dep -y python-imaging
 | 
					apt-get build-dep -y python-imaging
 | 
				
			||||||
apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
 | 
					apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
 | 
				
			||||||
apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
 | 
					apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
 | 
				
			||||||
apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
 | 
					apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Python dependencies
 | 
					# Python dependencies
 | 
				
			||||||
pip3 install -r /opt/paperless/requirements.txt
 | 
					pip3 install -r /opt/paperless/requirements.txt
 | 
				
			||||||
 | 
				
			|||||||
@ -39,8 +39,8 @@ class ConsumerError(Exception):
 | 
				
			|||||||
class Consumer(object):
 | 
					class Consumer(object):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Loop over every file found in CONSUMPTION_DIR and:
 | 
					    Loop over every file found in CONSUMPTION_DIR and:
 | 
				
			||||||
      1. Convert it to a greyscale png
 | 
					      1. Convert it to a greyscale pnm
 | 
				
			||||||
      2. Use tesseract on the png
 | 
					      2. Use tesseract on the pnm
 | 
				
			||||||
      3. Encrypt and store the document in the MEDIA_ROOT
 | 
					      3. Encrypt and store the document in the MEDIA_ROOT
 | 
				
			||||||
      4. Store the OCR'd text in the database
 | 
					      4. Store the OCR'd text in the database
 | 
				
			||||||
      5. Delete the document and image(s)
 | 
					      5. Delete the document and image(s)
 | 
				
			||||||
@ -48,6 +48,7 @@ class Consumer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    SCRATCH = settings.SCRATCH_DIR
 | 
					    SCRATCH = settings.SCRATCH_DIR
 | 
				
			||||||
    CONVERT = settings.CONVERT_BINARY
 | 
					    CONVERT = settings.CONVERT_BINARY
 | 
				
			||||||
 | 
					    UNPAPER = settings.UNPAPER_BINARY
 | 
				
			||||||
    CONSUME = settings.CONSUMPTION_DIR
 | 
					    CONSUME = settings.CONSUMPTION_DIR
 | 
				
			||||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
					    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -118,11 +119,11 @@ class Consumer(object):
 | 
				
			|||||||
            self.log("info", "Consuming {}".format(doc))
 | 
					            self.log("info", "Consuming {}".format(doc))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
 | 
					            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
 | 
				
			||||||
            pngs = self._get_greyscale(tempdir, doc)
 | 
					            imgs = self._get_greyscale(tempdir, doc)
 | 
				
			||||||
            thumbnail = self._get_thumbnail(tempdir, doc)
 | 
					            thumbnail = self._get_thumbnail(tempdir, doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                text = self._get_ocr(pngs)
 | 
					                text = self._get_ocr(imgs)
 | 
				
			||||||
                self._store(text, doc, thumbnail)
 | 
					                self._store(text, doc, thumbnail)
 | 
				
			||||||
            except OCRError as e:
 | 
					            except OCRError as e:
 | 
				
			||||||
                self._ignore.append(doc)
 | 
					                self._ignore.append(doc)
 | 
				
			||||||
@ -140,19 +141,30 @@ class Consumer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self.log("info", "Generating greyscale image from {}".format(doc))
 | 
					        self.log("info", "Generating greyscale image from {}".format(doc))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        png = os.path.join(tempdir, "convert-%04d.jpg")
 | 
					        # Convert PDF to multiple PNMs
 | 
				
			||||||
 | 
					        pnm = os.path.join(tempdir, "convert-%04d.pnm")
 | 
				
			||||||
        subprocess.Popen((
 | 
					        subprocess.Popen((
 | 
				
			||||||
            self.CONVERT, "-density", "300", "-depth", "8",
 | 
					            self.CONVERT, "-density", "300", "-depth", "8",
 | 
				
			||||||
            "-type", "grayscale", doc, png
 | 
					            "-type", "grayscale", doc, pnm
 | 
				
			||||||
        )).wait()
 | 
					        )).wait()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        pngs = []
 | 
					        # Get a list of converted images
 | 
				
			||||||
 | 
					        pnms = []
 | 
				
			||||||
        for f in os.listdir(tempdir):
 | 
					        for f in os.listdir(tempdir):
 | 
				
			||||||
            if f.startswith("convert"):
 | 
					            if f.endswith(".pnm"):
 | 
				
			||||||
                pngs.append(os.path.join(tempdir, f))
 | 
					                pnms.append(os.path.join(tempdir, f))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return sorted(filter(lambda __: os.path.isfile(__), pngs))
 | 
					        # Run unpaper in parallel on converted images
 | 
				
			||||||
 | 
					        with Pool(processes=self.THREADS) as pool:
 | 
				
			||||||
 | 
					            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Return list of converted images, processed with unpaper
 | 
				
			||||||
 | 
					        pnms = []
 | 
				
			||||||
 | 
					        for f in os.listdir(tempdir):
 | 
				
			||||||
 | 
					            if f.endswith(".unpaper.pnm"):
 | 
				
			||||||
 | 
					                pnms.append(os.path.join(tempdir, f))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return sorted(filter(lambda __: os.path.isfile(__), pnms))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_thumbnail(self, tempdir, doc):
 | 
					    def _get_thumbnail(self, tempdir, doc):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
@ -179,21 +191,21 @@ class Consumer(object):
 | 
				
			|||||||
        except Exception as e:
 | 
					        except Exception as e:
 | 
				
			||||||
            self.log("warning", "Language detection error: {}".format(e))
 | 
					            self.log("warning", "Language detection error: {}".format(e))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_ocr(self, pngs):
 | 
					    def _get_ocr(self, imgs):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Attempts to do the best job possible OCR'ing the document based on
 | 
					        Attempts to do the best job possible OCR'ing the document based on
 | 
				
			||||||
        simple language detection trial & error.
 | 
					        simple language detection trial & error.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not pngs:
 | 
					        if not imgs:
 | 
				
			||||||
            raise OCRError("No images found")
 | 
					            raise OCRError("No images found")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.log("info", "OCRing the document")
 | 
					        self.log("info", "OCRing the document")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Since the division gets rounded down by int, this calculation works
 | 
					        # Since the division gets rounded down by int, this calculation works
 | 
				
			||||||
        # for every edge-case, i.e. 1
 | 
					        # for every edge-case, i.e. 1
 | 
				
			||||||
        middle = int(len(pngs) / 2)
 | 
					        middle = int(len(imgs) / 2)
 | 
				
			||||||
        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
 | 
					        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        guessed_language = self._guess_language(raw_text)
 | 
					        guessed_language = self._guess_language(raw_text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -205,16 +217,16 @@ class Consumer(object):
 | 
				
			|||||||
                    "As FORGIVING_OCR is enabled, we're going to make the "
 | 
					                    "As FORGIVING_OCR is enabled, we're going to make the "
 | 
				
			||||||
                    "best with what we have."
 | 
					                    "best with what we have."
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
 | 
					                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
 | 
				
			||||||
                return raw_text
 | 
					                return raw_text
 | 
				
			||||||
            raise OCRError("Language detection failed")
 | 
					            raise OCRError("Language detection failed")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
 | 
					        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
 | 
				
			||||||
            raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
 | 
					            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
 | 
				
			||||||
            return raw_text
 | 
					            return raw_text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            return self._ocr(pngs, ISO639[guessed_language])
 | 
					            return self._ocr(imgs, ISO639[guessed_language])
 | 
				
			||||||
        except pyocr.pyocr.tesseract.TesseractError:
 | 
					        except pyocr.pyocr.tesseract.TesseractError:
 | 
				
			||||||
            if settings.FORGIVING_OCR:
 | 
					            if settings.FORGIVING_OCR:
 | 
				
			||||||
                self.log(
 | 
					                self.log(
 | 
				
			||||||
@ -224,34 +236,34 @@ class Consumer(object):
 | 
				
			|||||||
                        guessed_language
 | 
					                        guessed_language
 | 
				
			||||||
                    )
 | 
					                    )
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
 | 
					                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
 | 
				
			||||||
                return raw_text
 | 
					                return raw_text
 | 
				
			||||||
            raise OCRError(
 | 
					            raise OCRError(
 | 
				
			||||||
                "The guessed language is not available in this instance of "
 | 
					                "The guessed language is not available in this instance of "
 | 
				
			||||||
                "Tesseract."
 | 
					                "Tesseract."
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _assemble_ocr_sections(self, pngs, middle, text):
 | 
					    def _assemble_ocr_sections(self, imgs, middle, text):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Given a `middle` value and the text that middle page represents, we OCR
 | 
					        Given a `middle` value and the text that middle page represents, we OCR
 | 
				
			||||||
        the remainder of the document and return the whole thing.
 | 
					        the remainder of the document and return the whole thing.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
 | 
					        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
 | 
				
			||||||
        text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
 | 
					        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
 | 
				
			||||||
        return text
 | 
					        return text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _ocr(self, pngs, lang):
 | 
					    def _ocr(self, imgs, lang):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Performs a single OCR attempt.
 | 
					        Performs a single OCR attempt.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not pngs:
 | 
					        if not imgs:
 | 
				
			||||||
            return ""
 | 
					            return ""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.log("info", "Parsing for {}".format(lang))
 | 
					        self.log("info", "Parsing for {}".format(lang))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with Pool(processes=self.THREADS) as pool:
 | 
					        with Pool(processes=self.THREADS) as pool:
 | 
				
			||||||
            r = pool.map(image_to_string, itertools.product(pngs, [lang]))
 | 
					            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
 | 
				
			||||||
            r = " ".join(r)
 | 
					            r = " ".join(r)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Strip out excess white space to allow matching to go smoother
 | 
					        # Strip out excess white space to allow matching to go smoother
 | 
				
			||||||
@ -374,16 +386,9 @@ class Consumer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def image_to_string(args):
 | 
					def image_to_string(args):
 | 
				
			||||||
    """
 | 
					    img, lang = args
 | 
				
			||||||
    I have no idea why, but if this function were a method of Consumer, it
 | 
					 | 
				
			||||||
    would explode with:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      `TypeError: cannot serialize '_io.TextIOWrapper' object`.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    png, lang = args
 | 
					 | 
				
			||||||
    ocr = pyocr.get_available_tools()[0]
 | 
					    ocr = pyocr.get_available_tools()[0]
 | 
				
			||||||
    with Image.open(os.path.join(Consumer.SCRATCH, png)) as f:
 | 
					    with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
 | 
				
			||||||
        if ocr.can_detect_orientation():
 | 
					        if ocr.can_detect_orientation():
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                orientation = ocr.detect_orientation(f, lang=lang)
 | 
					                orientation = ocr.detect_orientation(f, lang=lang)
 | 
				
			||||||
@ -391,3 +396,10 @@ def image_to_string(args):
 | 
				
			|||||||
            except TesseractError:
 | 
					            except TesseractError:
 | 
				
			||||||
                pass
 | 
					                pass
 | 
				
			||||||
        return ocr.image_to_string(f, lang=lang)
 | 
					        return ocr.image_to_string(f, lang=lang)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def run_unpaper(args):
 | 
				
			||||||
 | 
					    unpaper, pnm = args
 | 
				
			||||||
 | 
					    subprocess.Popen((
 | 
				
			||||||
 | 
					        unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
 | 
				
			||||||
 | 
					    )).wait()
 | 
				
			||||||
 | 
				
			|||||||
@ -189,6 +189,9 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
 | 
				
			|||||||
# Convert is part of the ImageMagick package
 | 
					# Convert is part of the ImageMagick package
 | 
				
			||||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
 | 
					CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Unpaper
 | 
				
			||||||
 | 
					UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# This will be created if it doesn't exist
 | 
					# This will be created if it doesn't exist
 | 
				
			||||||
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
 | 
					SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user