Use pdftotext to index PDF files

Much faster and less resource-intensive than pdftohtml.
This commit is contained in:
Kovid Goyal 2022-06-29 16:29:29 +05:30
parent 1d67f1e923
commit cde3ff211c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 93 additions and 6 deletions

View File

@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))
def binary_includes(): def binary_includes():
return [ return [
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [ j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [
j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [ j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
] + list(map( ] + list(map(

View File

@ -484,7 +484,7 @@ class Freeze:
print('\nAdding poppler') print('\nAdding poppler')
for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',): for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
self.install_dylib(join(PREFIX, 'lib', x)) self.install_dylib(join(PREFIX, 'lib', x))
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'): for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
self.install_dylib( self.install_dylib(
join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir) join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)

View File

@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
shutil.copy2(x + '.manifest', env.dll_dir) shutil.copy2(x + '.manifest', env.dll_dir)
bindir = os.path.join(PREFIX, 'bin') bindir = os.path.join(PREFIX, 'bin')
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'): for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
copybin(os.path.join(bindir, x + '.exe')) copybin(os.path.join(bindir, x + '.exe'))
for f in glob.glob(os.path.join(bindir, '*.dll')): for f in glob.glob(os.path.join(bindir, '*.dll')):
if re.search(r'(easylzma|icutest)', f.lower()) is None: if re.search(r'(easylzma|icutest)', f.lower()) is None:

View File

@ -3,6 +3,7 @@
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net> # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import os
import re import re
import unicodedata import unicodedata
@ -61,6 +62,19 @@ def is_fmt_ok(input_fmt):
return input_plugin return input_plugin
def pdftotext(path):
    '''
    Extract the text of the PDF file at ``path`` by running the pdftotext
    executable and capturing what it writes to stdout.

    :param path: Path to the PDF file. A bare filename (no directory part) is
                 resolved against the current working directory.
    :return: The output passed through clean_ascii_chars() and decoded as UTF-8
             with undecodable bytes replaced, or an empty string if pdftotext
             exits with a non-zero status.
    '''
    import subprocess

    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
    from calibre.utils.cleantext import clean_ascii_chars

    # The file is addressed by its basename relative to its own directory;
    # the trailing '-' asks pdftotext to write the text to stdout.
    cmd = [PDFTOTEXT] + '-enc UTF-8 -nodiag -eol unix'.split() + [os.path.basename(path), '-']
    # dirname() is '' for a bare filename and an empty cwd makes process
    # creation fail, so fall back to None (inherit the current directory).
    workdir = os.path.dirname(path) or None
    # NOTE(review): assumes popen() returns a subprocess.Popen-like object -- confirm
    p = popen(cmd, cwd=workdir, stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
    try:
        raw = p.stdout.read()
    finally:
        # Close the pipe explicitly rather than leaking the descriptor until
        # the process object is garbage collected.
        p.stdout.close()
    if p.wait() != 0:
        return ''
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
def extract_text(pathtoebook): def extract_text(pathtoebook):
input_fmt = pathtoebook.rpartition('.')[-1].upper() input_fmt = pathtoebook.rpartition('.')[-1].upper()
ans = '' ans = ''
@ -68,6 +82,8 @@ def extract_text(pathtoebook):
if not input_plugin: if not input_plugin:
return ans return ans
input_plugin = plugin_for_input_format(input_fmt) input_plugin = plugin_for_input_format(input_fmt)
if input_fmt == 'PDF':
return pdftotext(pathtoebook)
with TemporaryDirectory() as tdir: with TemporaryDirectory() as tdir:
texts = [] texts = []
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log) book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)

View File

@ -3,12 +3,14 @@
import builtins import builtins
import os
import sys import sys
import tempfile
from apsw import Connection from apsw import Connection
from calibre.constants import plugins from calibre.constants import plugins
from calibre.db.tests.base import BaseTest
from calibre.db.annotations import unicode_normalize from calibre.db.annotations import unicode_normalize
from calibre.db.tests.base import BaseTest
def print(*args, **kwargs): def print(*args, **kwargs):
@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
def tokenize(text, flags=None, remove_diacritics=True): def tokenize(text, flags=None, remove_diacritics=True):
from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize
if flags is None: if flags is None:
flags = FTS5_TOKENIZE_DOCUMENT flags = FTS5_TOKENIZE_DOCUMENT
return tokenize(unicode_normalize(text), remove_diacritics, flags) return tokenize(unicode_normalize(text), remove_diacritics, flags)
@ -183,6 +185,73 @@ class FTSTest(BaseTest):
# }}} # }}}
def test_pdftotext(self):
# Check that pdftotext() extracts the text from a small hand-written PDF.
# The document below declares a single page (/Count 1) whose content stream
# draws the string "Hello World".
pdf_data = '''\
%PDF-1.1
%¥±ë
1 0 obj
<< /Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<< /Type /Pages
/Kids [3 0 R]
/Count 1
/MediaBox [0 0 300 144]
>>
endobj
3 0 obj
<< /Type /Page
/Parent 2 0 R
/Resources
<< /Font
<< /F1
<< /Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
>>
>>
/Contents 4 0 R
>>
endobj
4 0 obj
<< /Length 55 >>
stream
BT
/F1 18 Tf
0 0 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000018 00000 n
0000000077 00000 n
0000000178 00000 n
0000000457 00000 n
trailer
<< /Root 1 0 R
/Size 5
>>
startxref
565
%%EOF'''
# pdftotext() takes a filesystem path, so the data is written to a file in a
# temporary directory that is removed when the with block exits.
with tempfile.TemporaryDirectory() as tdir:
pdf = os.path.join(tdir, 'test.pdf')
# NOTE(review): text mode with no explicit encoding -- the non-ASCII bytes in
# the PDF header comment are written using the platform default encoding.
with open(pdf, 'w') as f:
f.write(pdf_data)
from calibre.db.fts.text import pdftotext
# Surrounding whitespace in the extracted text is ignored by the comparison.
self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
def find_tests(): def find_tests():
import unittest import unittest

View File

@ -33,6 +33,7 @@ if iswindows and hasattr(sys, 'frozen'):
PDFTOHTML = os.path.join(base, 'pdftohtml.exe') PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
if (islinux or isbsd) and getattr(sys, 'frozen', False): if (islinux or isbsd) and getattr(sys, 'frozen', False):
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml') PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):

View File

@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
@unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds') @unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
def test_executables(self): def test_executables(self):
from calibre.utils.ipc.launch import Worker from calibre.utils.ipc.launch import Worker
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
w = Worker({}) w = Worker({})
self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable) self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable) self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML) self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
if iswindows: if iswindows:
from calibre.devices.usbms.device import eject_exe from calibre.devices.usbms.device import eject_exe
self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe()) self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())