mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Allows configuration of the NLTK processing language
This commit is contained in:
		
							parent
							
								
									6523cf0c4b
								
							
						
					
					
						commit
						d10d2f5a54
					
				| @ -320,10 +320,9 @@ class DocumentClassifier: | |||||||
|         # Get only the letters (remove punctuation too) |         # Get only the letters (remove punctuation too) | ||||||
|         content = re.sub(r"[^\w\s]", " ", content) |         content = re.sub(r"[^\w\s]", " ", content) | ||||||
|         # Tokenize |         # Tokenize | ||||||
|         # TODO configurable language |         words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE) | ||||||
|         words: List[str] = word_tokenize(content, language="english") |  | ||||||
|         # Remove stop words |         # Remove stop words | ||||||
|         stops = set(stopwords.words("english")) |         stops = set(stopwords.words(settings.NLTK_LANGUAGE)) | ||||||
|         meaningful_words = [w for w in words if w not in stops] |         meaningful_words = [w for w in words if w not in stops] | ||||||
|         # Stem words |         # Stem words | ||||||
|         meaningful_words = [self.stemmer.stem(w) for w in meaningful_words] |         meaningful_words = [self.stemmer.stem(w) for w in meaningful_words] | ||||||
|  | |||||||
| @ -708,3 +708,5 @@ if os.getenv("PAPERLESS_IGNORE_DATES") is not None: | |||||||
| ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") | ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") | ||||||
| if ENABLE_UPDATE_CHECK != "default": | if ENABLE_UPDATE_CHECK != "default": | ||||||
|     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK") |     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK") | ||||||
|  |  | ||||||
|  | NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower() | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user