mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Fixed merge conflict and did some pep8
This commit is contained in:
		
						commit
						a0f4f6c5f2
					
				@ -58,6 +58,11 @@ Standard (Bare Metal)
 | 
			
		||||
      passphrase from the environment, so if you don't set it to a static value
 | 
			
		||||
      here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the
 | 
			
		||||
      command line whenever invoking the consumer or webserver.
 | 
			
		||||
    * ``OCR_THREADS``: this is the number of threads the OCR process will spawn
 | 
			
		||||
      to process document pages in parallel. The default value gets sourced from
 | 
			
		||||
      the environment-variable ``PAPERLESS_OCR_THREADS`` and expects it to be an
 | 
			
		||||
      integer. If the variable is not set, Python determines the core-count of
 | 
			
		||||
      your CPU and uses that value.
 | 
			
		||||
4. Initialise the database with ``./manage.py migrate``.
 | 
			
		||||
5. Create a user for your Paperless instance with
 | 
			
		||||
   ``./manage.py createsuperuser``. Follow the prompts to create your user.
 | 
			
		||||
 | 
			
		||||
@ -51,7 +51,7 @@ class Consumer(object):
 | 
			
		||||
    SCRATCH = settings.SCRATCH_DIR
 | 
			
		||||
    CONVERT = settings.CONVERT_BINARY
 | 
			
		||||
    CONSUME = settings.CONSUMPTION_DIR
 | 
			
		||||
    THREADS = settings.OCR_THREADS
 | 
			
		||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
			
		||||
 | 
			
		||||
    OCR = pyocr.get_available_tools()[0]
 | 
			
		||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
			
		||||
@ -140,7 +140,8 @@ class Consumer(object):
 | 
			
		||||
 | 
			
		||||
        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
 | 
			
		||||
 | 
			
		||||
    def _guess_language(self, text):
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def _guess_language(text):
 | 
			
		||||
        try:
 | 
			
		||||
            guess = langdetect.detect(text)
 | 
			
		||||
            Log.debug(
 | 
			
		||||
@ -148,8 +149,9 @@ class Consumer(object):
 | 
			
		||||
                Log.COMPONENT_CONSUMER
 | 
			
		||||
            )
 | 
			
		||||
            return guess
 | 
			
		||||
        except Exception:
 | 
			
		||||
            return None
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            Log.warning(
 | 
			
		||||
                "Language detection error: {}".format(e), Log.COMPONENT_MAIL)
 | 
			
		||||
 | 
			
		||||
    def _get_ocr(self, pngs):
 | 
			
		||||
        """
 | 
			
		||||
@ -157,9 +159,15 @@ class Consumer(object):
 | 
			
		||||
        simple language detection trial & error.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if not pngs:
 | 
			
		||||
            raise OCRError
 | 
			
		||||
 | 
			
		||||
        Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
 | 
			
		||||
 | 
			
		||||
        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
        # Since the division gets rounded down by int, this calculation works
 | 
			
		||||
        # for every edge-case, i.e. 1
 | 
			
		||||
        middle = int(len(pngs) / 2)
 | 
			
		||||
        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
 | 
			
		||||
        guessed_language = self._guess_language(raw_text)
 | 
			
		||||
 | 
			
		||||
@ -171,10 +179,12 @@ class Consumer(object):
 | 
			
		||||
                    "with what we have.",
 | 
			
		||||
                    Log.COMPONENT_CONSUMER
 | 
			
		||||
                )
 | 
			
		||||
                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
 | 
			
		||||
                return raw_text
 | 
			
		||||
            raise OCRError
 | 
			
		||||
 | 
			
		||||
        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
 | 
			
		||||
            raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
 | 
			
		||||
            return raw_text
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
@ -188,14 +198,27 @@ class Consumer(object):
 | 
			
		||||
                    ),
 | 
			
		||||
                    Log.COMPONENT_CONSUMER
 | 
			
		||||
                )
 | 
			
		||||
                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
 | 
			
		||||
                return raw_text
 | 
			
		||||
            raise OCRError
 | 
			
		||||
 | 
			
		||||
    def _assemble_ocr_sections(self, pngs, middle, text):
 | 
			
		||||
        """
 | 
			
		||||
        Given a `middle` value and the text that middle page represents, we OCR
 | 
			
		||||
        the remainder of the document and return the whole thing.
 | 
			
		||||
        """
 | 
			
		||||
        text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
 | 
			
		||||
        text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
        return text
 | 
			
		||||
 | 
			
		||||
    def _ocr(self, pngs, lang):
 | 
			
		||||
        """
 | 
			
		||||
        Performs a single OCR attempt.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if not pngs:
 | 
			
		||||
            return ""
 | 
			
		||||
 | 
			
		||||
        Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
 | 
			
		||||
 | 
			
		||||
        with Pool(processes=self.THREADS) as pool:
 | 
			
		||||
 | 
			
		||||
@ -26,6 +26,9 @@ class Log(models.Model):
 | 
			
		||||
    level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO)
 | 
			
		||||
    component = models.PositiveIntegerField(choices=COMPONENTS)
 | 
			
		||||
 | 
			
		||||
    class Meta(object):
 | 
			
		||||
        ordering = ("time",)
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def error(cls, message, component):
 | 
			
		||||
        cls.objects.create(
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user