mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	made unpaper and convert a little bit nicer to interact with
This commit is contained in:
		
							parent
							
								
									c28b636ffa
								
							
						
					
					
						commit
						3a08a2d206
					
				@ -29,6 +29,46 @@ DATE_REGEX = re.compile(
 | 
				
			|||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					logger = logging.getLogger(__name__)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
					    """Run the configured ``convert`` binary on ``input``, writing ``output``.

					    Every option left at a falsy value is omitted from the command line.

					    Args:
					        input: Source path; may carry a page selector such as
					            ``"document.pdf[0]"``.
					        output: Destination path.
					        density: Value passed after ``-density``.
					        scale: Value passed after ``-scale``.
					        alpha: Value passed after ``-alpha``.
					        strip: When true, add ``-strip``.
					        trim: When true, add ``-trim``.
					        type: Value passed after ``-type``.
					        depth: Value passed after ``-depth``.
					        extra: Accepted for interface compatibility.
					            NOTE(review): this value is currently not forwarded to the
					            command line - confirm the intended format before wiring it.
					        logging_group: Attached to the debug log record as ``group``.

					    Raises:
					        ParseError: If the process exits with a non-zero status.
					    """
					    environment = os.environ.copy()
					    if settings.CONVERT_MEMORY_LIMIT:
					        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
					    if settings.CONVERT_TMPDIR:
					        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

					    args = [settings.CONVERT_BINARY]
					    if density:
					        args += ['-density', str(density)]
					    if scale:
					        args += ['-scale', str(scale)]
					    if alpha:
					        args += ['-alpha', str(alpha)]
					    if strip:
					        args.append('-strip')
					    if trim:
					        args.append('-trim')
					    if type:
					        args += ['-type', str(type)]
					    if depth:
					        args += ['-depth', str(depth)]
					    # Coerce to str so that path-like arguments do not break the
					    # " ".join() used for the debug message below.
					    args += [str(input), str(output)]

					    logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})

					    if subprocess.Popen(args, env=environment).wait() != 0:
					        raise ParseError("Convert failed at {}".format(args))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def run_unpaper(pnm, logging_group=None):
					    """Run the configured ``unpaper`` binary on a PNM file.

					    Args:
					        pnm: Path of the PNM file to process.
					        logging_group: Attached to the debug log record as ``group``.

					    Returns:
					        The path of the file written by unpaper: ``pnm`` with its trailing
					        ``.pnm`` replaced by ``.unpaper.pnm``.

					    Raises:
					        ParseError: If the process exits with a non-zero status.
					    """
					    suffix = ".pnm"
					    if pnm.endswith(suffix):
					        # Rewrite only the trailing extension. A plain str.replace would
					        # also alter ".pnm" inside directory names and point the output
					        # at a different, possibly non-existent, directory.
					        pnm_out = pnm[:-len(suffix)] + ".unpaper.pnm"
					    else:
					        # Keep the previous behaviour for paths without the suffix.
					        pnm_out = pnm.replace(suffix, ".unpaper.pnm")

					    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
					                    pnm_out)

					    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})

					    if subprocess.Popen(command_args).wait() != 0:
					        raise ParseError("Unpaper failed at {}".format(command_args))

					    return pnm_out
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ParseError(Exception):
    """Raised when a step of document parsing fails.

    In this module it is raised when an external tool exits with a non-zero
    status.
    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -56,6 +96,9 @@ class DocumentParser:
 | 
				
			|||||||
        out_path = os.path.join(self.tempdir, "optipng.png")
 | 
					        out_path = os.path.join(self.tempdir, "optipng.png")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
 | 
					        args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.log('debug', 'Execute: ' + " ".join(args))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not subprocess.Popen(args).wait() == 0:
 | 
					        if not subprocess.Popen(args).wait() == 0:
 | 
				
			||||||
            raise ParseError("Optipng failed at {}".format(args))
 | 
					            raise ParseError("Optipng failed at {}".format(args))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -11,7 +11,8 @@ from PIL import Image
 | 
				
			|||||||
from pyocr import PyocrException
 | 
					from pyocr import PyocrException
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pdftotext
 | 
					import pdftotext
 | 
				
			||||||
from documents.parsers import DocumentParser, ParseError
 | 
					from documents.parsers import DocumentParser, ParseError, run_unpaper, \
 | 
				
			||||||
 | 
					    run_convert
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .languages import ISO639
 | 
					from .languages import ISO639
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -39,15 +40,14 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        # Run convert to get a decent thumbnail
 | 
					        # Run convert to get a decent thumbnail
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            run_convert(
 | 
					            run_convert(density=300,
 | 
				
			||||||
                settings.CONVERT_BINARY,
 | 
					                        scale="500x5000>",
 | 
				
			||||||
                "-density", "300",
 | 
					                        alpha="remove",
 | 
				
			||||||
                "-scale", "500x5000>",
 | 
					                        strip=True,
 | 
				
			||||||
                "-alpha", "remove",
 | 
					                        trim=True,
 | 
				
			||||||
                "-strip", "-trim",
 | 
					                        input="{}[0]".format(self.document_path),
 | 
				
			||||||
                "{}[0]".format(self.document_path),
 | 
					                        output=out_path,
 | 
				
			||||||
                out_path
 | 
					                        logging_group=self.logging_group)
 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        except ParseError:
 | 
					        except ParseError:
 | 
				
			||||||
            # if convert fails, fall back to extracting
 | 
					            # if convert fails, fall back to extracting
 | 
				
			||||||
            # the first PDF page as a PNG using Ghostscript
 | 
					            # the first PDF page as a PNG using Ghostscript
 | 
				
			||||||
@ -61,15 +61,14 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            if not subprocess.Popen(cmd).wait() == 0:
 | 
					            if not subprocess.Popen(cmd).wait() == 0:
 | 
				
			||||||
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
 | 
					                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
 | 
				
			||||||
            # then run convert on the output from gs
 | 
					            # then run convert on the output from gs
 | 
				
			||||||
            run_convert(
 | 
					            run_convert(density=300,
 | 
				
			||||||
                settings.CONVERT_BINARY,
 | 
					                        scale="500x5000>",
 | 
				
			||||||
                "-density", "300",
 | 
					                        alpha="remove",
 | 
				
			||||||
                "-scale", "500x5000>",
 | 
					                        strip=True,
 | 
				
			||||||
                "-alpha", "remove",
 | 
					                        trim=True,
 | 
				
			||||||
                "-strip", "-trim",
 | 
					                        input=gs_out_path,
 | 
				
			||||||
                gs_out_path,
 | 
					                        output=out_path,
 | 
				
			||||||
                out_path
 | 
					                        logging_group=self.logging_group)
 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return out_path
 | 
					        return out_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -107,14 +106,17 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            if not guessed_language or guessed_language not in ISO639:
 | 
					            if not guessed_language or guessed_language not in ISO639:
 | 
				
			||||||
                self.log("warning", "Language detection failed.")
 | 
					                self.log("warning", "Language detection failed.")
 | 
				
			||||||
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
 | 
					                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
 | 
					            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
 | 
				
			||||||
                self.log("info", "Detected language: {} (default language)".format(guessed_language))
 | 
					                self.log("info", "Detected language: {} (default language)".format(guessed_language))
 | 
				
			||||||
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
 | 
					                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
 | 
					            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
 | 
				
			||||||
                self.log("warning","Detected language {} is not available on this system.".format(guessed_language))
 | 
					                self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
 | 
				
			||||||
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
 | 
					                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                self.log("info","Detected language: {}".format(guessed_language))
 | 
					                self.log("info", "Detected language: {}".format(guessed_language))
 | 
				
			||||||
                ocr_pages = self._ocr(images, ISO639[guessed_language])
 | 
					                ocr_pages = self._ocr(images, ISO639[guessed_language])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.log("info", "OCR completed.")
 | 
					            self.log("info", "OCR completed.")
 | 
				
			||||||
@ -133,13 +135,13 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        # Convert PDF to multiple PNMs
 | 
					        # Convert PDF to multiple PNMs
 | 
				
			||||||
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
 | 
					        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
 | 
				
			||||||
        run_convert(
 | 
					
 | 
				
			||||||
            settings.CONVERT_BINARY,
 | 
					        run_convert(density=settings.CONVERT_DENSITY,
 | 
				
			||||||
            "-density", str(settings.CONVERT_DENSITY),
 | 
					                    depth="8",
 | 
				
			||||||
            "-depth", "8",
 | 
					                    type="grayscale",
 | 
				
			||||||
            "-type", "grayscale",
 | 
					                    input=self.document_path,
 | 
				
			||||||
            self.document_path, pnm,
 | 
					                    output=pnm,
 | 
				
			||||||
        )
 | 
					                    logging_group=self.logging_group)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Get a list of converted images
 | 
					        # Get a list of converted images
 | 
				
			||||||
        pnms = []
 | 
					        pnms = []
 | 
				
			||||||
@ -187,27 +189,6 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            return [sample_page]
 | 
					            return [sample_page]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def run_convert(*args):
    """Execute the given command line with the convert-specific environment.

    ``args`` is the complete command, binary included. The child process
    inherits the current environment, plus ``MAGICK_MEMORY_LIMIT`` and
    ``MAGICK_TMPDIR`` whenever the matching setting is truthy.

    Raises:
        ParseError: If the process exits with a non-zero status.
    """
    overrides = {
        "MAGICK_MEMORY_LIMIT": settings.CONVERT_MEMORY_LIMIT,
        "MAGICK_TMPDIR": settings.CONVERT_TMPDIR,
    }
    child_env = os.environ.copy()
    # Only export the limits that are actually configured.
    child_env.update({name: value for name, value in overrides.items() if value})

    exit_status = subprocess.Popen(args, env=child_env).wait()
    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def run_unpaper(pnm):
    """Run the configured ``unpaper`` binary on a PNM file.

    Args:
        pnm: Path of the PNM file to process.

    Returns:
        The path of the file written by unpaper: ``pnm`` with its trailing
        ``.pnm`` replaced by ``.unpaper.pnm``.

    Raises:
        ParseError: If the process exits with a non-zero status.
    """
    suffix = ".pnm"
    if pnm.endswith(suffix):
        # Rewrite only the trailing extension. A plain str.replace would also
        # alter ".pnm" inside directory names and point the output at a
        # different, possibly non-existent, directory.
        pnm_out = pnm[:-len(suffix)] + ".unpaper.pnm"
    else:
        # Keep the previous behaviour for paths without the suffix.
        pnm_out = pnm.replace(suffix, ".unpaper.pnm")

    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)
    if subprocess.Popen(command_args).wait() != 0:
        raise ParseError("Unpaper failed at {}".format(command_args))

    return pnm_out
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
def strip_excess_whitespace(text):
 | 
					def strip_excess_whitespace(text):
 | 
				
			||||||
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
 | 
					    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user