mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)
Forgot to modify the check in the db cache for extractable formats.
This commit is contained in:
		
							parent
							
								
									00ddf587c7
								
							
						
					
					
						commit
						a7375ad7d4
					
				@ -528,7 +528,7 @@ class Cache:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def dispatch_fts_jobs(queue, stop_dispatch, dbref):
 | 
					    def dispatch_fts_jobs(queue, stop_dispatch, dbref):
 | 
				
			||||||
        from .fts.text import is_fmt_ok
 | 
					        from .fts.text import is_fmt_extractable
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def do_one():
 | 
					        def do_one():
 | 
				
			||||||
            self = dbref()
 | 
					            self = dbref()
 | 
				
			||||||
@ -542,7 +542,7 @@ class Cache:
 | 
				
			|||||||
                if book_id is None:
 | 
					                if book_id is None:
 | 
				
			||||||
                    return False
 | 
					                    return False
 | 
				
			||||||
                path = self._format_abspath(book_id, fmt)
 | 
					                path = self._format_abspath(book_id, fmt)
 | 
				
			||||||
            if not path or not is_fmt_ok(fmt):
 | 
					            if not path or not is_fmt_extractable(fmt):
 | 
				
			||||||
                with self.write_lock:
 | 
					                with self.write_lock:
 | 
				
			||||||
                    self.backend.remove_dirty_fts(book_id, fmt)
 | 
					                    self.backend.remove_dirty_fts(book_id, fmt)
 | 
				
			||||||
                    self._update_fts_indexing_numbers()
 | 
					                    self._update_fts_indexing_numbers()
 | 
				
			||||||
 | 
				
			|||||||
@ -79,21 +79,37 @@ def pdftotext(path):
 | 
				
			|||||||
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
 | 
					    return clean_ascii_chars(raw).decode('utf-8', 'replace')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]:
 | 
				
			||||||
 | 
					    if not pathtoebook:
 | 
				
			||||||
 | 
					        return pathtoebook, input_fmt
 | 
				
			||||||
 | 
					    if is_fmt_ok(input_fmt):
 | 
				
			||||||
 | 
					        return pathtoebook, input_fmt
 | 
				
			||||||
 | 
					    if input_fmt.lower() in ARCHIVE_FMTS:
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            tdir = exit_stack.enter_context(TemporaryDirectory())
 | 
				
			||||||
 | 
					            pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
 | 
				
			||||||
 | 
					            input_fmt = input_fmt.upper()
 | 
				
			||||||
 | 
					        except Exception:
 | 
				
			||||||
 | 
					            return '', input_fmt
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return pathtoebook, input_fmt
 | 
				
			||||||
 | 
					    return '', input_fmt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_fmt_extractable(input_fmt: str) -> bool:
 | 
				
			||||||
 | 
					    if is_fmt_ok(input_fmt):
 | 
				
			||||||
 | 
					        return True
 | 
				
			||||||
 | 
					    return input_fmt.lower() in ARCHIVE_FMTS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def extract_text(pathtoebook):
 | 
					def extract_text(pathtoebook):
 | 
				
			||||||
    input_fmt = pathtoebook.rpartition('.')[-1].upper()
 | 
					    input_fmt = pathtoebook.rpartition('.')[-1].upper()
 | 
				
			||||||
    ans = ''
 | 
					    ans = ''
 | 
				
			||||||
    input_plugin = is_fmt_ok(input_fmt)
 | 
					    input_plugin = is_fmt_ok(input_fmt)
 | 
				
			||||||
    with contextlib.ExitStack() as exit_stack:
 | 
					    with contextlib.ExitStack() as exit_stack:
 | 
				
			||||||
        if not input_plugin:
 | 
					        pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack)
 | 
				
			||||||
            if input_fmt.lower() in ARCHIVE_FMTS:
 | 
					        if not pathtoebook:
 | 
				
			||||||
                try:
 | 
					            return ans
 | 
				
			||||||
                    tdir = exit_stack.enter_context(TemporaryDirectory())
 | 
					 | 
				
			||||||
                    pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
 | 
					 | 
				
			||||||
                    input_fmt = input_fmt.upper()
 | 
					 | 
				
			||||||
                except Exception:
 | 
					 | 
				
			||||||
                    return ans
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                return ans
 | 
					 | 
				
			||||||
        input_plugin = plugin_for_input_format(input_fmt)
 | 
					        input_plugin = plugin_for_input_format(input_fmt)
 | 
				
			||||||
        if input_fmt == 'PDF':
 | 
					        if input_fmt == 'PDF':
 | 
				
			||||||
            ans = pdftotext(pathtoebook)
 | 
					            ans = pdftotext(pathtoebook)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user