mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-31 10:37:00 -04:00 
			
		
		
		
	Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)
The earlier change forgot to update the db cache's check for extractable formats; it now uses is_fmt_extractable() instead of is_fmt_ok().
This commit is contained in:
		
							parent
							
								
									00ddf587c7
								
							
						
					
					
						commit
						a7375ad7d4
					
				| @ -528,7 +528,7 @@ class Cache: | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def dispatch_fts_jobs(queue, stop_dispatch, dbref): | ||||
|         from .fts.text import is_fmt_ok | ||||
|         from .fts.text import is_fmt_extractable | ||||
| 
 | ||||
|         def do_one(): | ||||
|             self = dbref() | ||||
| @ -542,7 +542,7 @@ class Cache: | ||||
|                 if book_id is None: | ||||
|                     return False | ||||
|                 path = self._format_abspath(book_id, fmt) | ||||
|             if not path or not is_fmt_ok(fmt): | ||||
|             if not path or not is_fmt_extractable(fmt): | ||||
|                 with self.write_lock: | ||||
|                     self.backend.remove_dirty_fts(book_id, fmt) | ||||
|                     self._update_fts_indexing_numbers() | ||||
|  | ||||
| @ -79,21 +79,37 @@ def pdftotext(path): | ||||
|     return clean_ascii_chars(raw).decode('utf-8', 'replace') | ||||
| 
 | ||||
| 
 | ||||
| def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]: | ||||
|     if not pathtoebook: | ||||
|         return pathtoebook, input_fmt | ||||
|     if is_fmt_ok(input_fmt): | ||||
|         return pathtoebook, input_fmt | ||||
|     if input_fmt.lower() in ARCHIVE_FMTS: | ||||
|         try: | ||||
|             tdir = exit_stack.enter_context(TemporaryDirectory()) | ||||
|             pathtoebook, input_fmt = unarchive(pathtoebook, tdir) | ||||
|             input_fmt = input_fmt.upper() | ||||
|         except Exception: | ||||
|             return '', input_fmt | ||||
|         else: | ||||
|             return pathtoebook, input_fmt | ||||
|     return '', input_fmt | ||||
| 
 | ||||
| 
 | ||||
| def is_fmt_extractable(input_fmt: str) -> bool: | ||||
|     if is_fmt_ok(input_fmt): | ||||
|         return True | ||||
|     return input_fmt.lower() in ARCHIVE_FMTS | ||||
| 
 | ||||
| 
 | ||||
| def extract_text(pathtoebook): | ||||
|     input_fmt = pathtoebook.rpartition('.')[-1].upper() | ||||
|     ans = '' | ||||
|     input_plugin = is_fmt_ok(input_fmt) | ||||
|     with contextlib.ExitStack() as exit_stack: | ||||
|         if not input_plugin: | ||||
|             if input_fmt.lower() in ARCHIVE_FMTS: | ||||
|                 try: | ||||
|                     tdir = exit_stack.enter_context(TemporaryDirectory()) | ||||
|                     pathtoebook, input_fmt = unarchive(pathtoebook, tdir) | ||||
|                     input_fmt = input_fmt.upper() | ||||
|                 except Exception: | ||||
|                     return ans | ||||
|             else: | ||||
|                 return ans | ||||
|         pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack) | ||||
|         if not pathtoebook: | ||||
|             return ans | ||||
|         input_plugin = plugin_for_input_format(input_fmt) | ||||
|         if input_fmt == 'PDF': | ||||
|             ans = pdftotext(pathtoebook) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user