Specialize page counting for text files

This commit is contained in:
Kovid Goyal 2025-12-29 09:36:34 +05:30
parent d992305e8e
commit d1cd49d8a9
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 19 additions and 4 deletions

View File

@ -1742,7 +1742,8 @@ class Cache:
ans: Pages | None = None
with self.safe_read_lock:
for pages, algorithm, format, format_size, timestamp in self.backend.execute(
f'SELECT pages,algorithm,format,format_size,timestamp FROM books_pages_link WHERE book={book_id:d}'):
f'SELECT pages,algorithm,format,format_size,timestamp FROM books_pages_link WHERE book={book_id:d} LIMIT 1'
):
ans = Pages(int(pages), int(algorithm), str(format), int(format_size), parse_iso8601(timestamp, assume_utc=True))
break
if queue_if_unavailable and ans is None:
@ -1757,7 +1758,7 @@ class Cache:
'''
if book_id <= 0:
if len(self.fields['pages'].table.book_col_map) < len(self.fields['uuid'].table.book_col_map):
self.backend.execute('INSERT OR IGNORE INTO books_pages_link(book) SELECT id FROM books')
self.backend.execute('INSERT OR IGNORE INTO books_pages_link(book,needs_scan) SELECT id,1 FROM books')
else:
self.backend.execute(f'UPDATE books_pages_link SET needs_scan=1 WHERE book={int(book_id)}')
self.maintain_page_counts.queue_scan(book_id)
@ -2109,7 +2110,6 @@ class Cache:
needs_close = True
try:
size, fname = self._do_add_format(book_id, fmt, stream, name)
self.queue_pages_scan()
finally:
if needs_close:
stream.close()

View File

@ -9,6 +9,8 @@ from contextlib import closing, suppress
from multiprocessing import Pipe
from operator import itemgetter
from lxml import etree
from calibre import detect_ncpus
from calibre.constants import iswindows
from calibre.ebooks.oeb.base import XHTML
@ -65,8 +67,11 @@ def get_length(root):
return ans
# Number of characters counted as one page when estimating page counts from
# the amount of text in a book.
CHARS_PER_PAGE = 1000


def get_page_count(root):
    '''
    Return the estimated number of pages for the parsed document ``root``.

    The estimate is ``get_length(root)`` floor-divided by ``CHARS_PER_PAGE``,
    so content shorter than one page yields 0.
    '''
    # Use the shared constant rather than a literal 1000 so every page
    # counter that divides by CHARS_PER_PAGE stays in agreement.
    return get_length(root) // CHARS_PER_PAGE
def calculate_number_of_workers(names, in_process_container, max_workers):
@ -114,6 +119,14 @@ def count_pages_oeb(pathtoebook: str, tdir: str, executor: Executor | None = Non
return sum(executor.map(process, paths))
# Code points that lxml refuses to store in element text/tail: the C0 control
# characters other than TAB, LF and CR, plus the non-characters U+FFFE and
# U+FFFF. Mapping them to None makes str.translate() delete them.
_XML_INCOMPATIBLE_CHARS = dict.fromkeys(
    (*range(0x09), 0x0B, 0x0C, *range(0x0E, 0x20), 0xFFFE, 0xFFFF)
)


def count_pages_txt(pathtoebook: str) -> int:
    '''
    Return the estimated page count of the plain text file at ``pathtoebook``.

    The file is read as bytes and decoded as UTF-8, with undecodable bytes
    replaced rather than raising. The text is attached to a throwaway lxml
    element so that ``get_num_of_significant_chars()`` can be reused, and the
    resulting count is floor-divided by ``CHARS_PER_PAGE``.
    '''
    with open(pathtoebook, 'rb') as f:
        text = f.read().decode('utf-8', 'replace')
    # Plain text files frequently contain form feeds, NULs or other control
    # characters. lxml raises ValueError when asked to store those, so drop
    # them before handing the text to the element.
    text = text.translate(_XML_INCOMPATIBLE_CHARS)
    e = etree.Element('r')
    e.tail = text
    return get_num_of_significant_chars(e) // CHARS_PER_PAGE
def count_pages(pathtoebook: str, executor: Executor | None = None) -> int:
ext = pathtoebook.rpartition('.')[-1].lower()
match ext:
@ -125,6 +138,8 @@ def count_pages(pathtoebook: str, executor: Executor | None = None) -> int:
return count_pages_cbr(pathtoebook)
case 'cb7':
return count_pages_cb7(pathtoebook)
case 'txt' | 'text' | 'md' | 'textile' | 'markdown':
return count_pages_txt(pathtoebook)
case _:
with TemporaryDirectory() as tdir:
return count_pages_oeb(pathtoebook, tdir, executor=executor)