Use pdftotext to index PDF files

pdftotext is much faster and less resource-intensive than pdftohtml.
This commit is contained in:
Kovid Goyal 2022-06-29 16:29:29 +05:30
parent 1d67f1e923
commit cde3ff211c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 93 additions and 6 deletions

View File

@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))
def binary_includes():
return [
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [
j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
] + list(map(

View File

@ -484,7 +484,7 @@ class Freeze:
print('\nAdding poppler')
for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
self.install_dylib(join(PREFIX, 'lib', x))
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
self.install_dylib(
join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)

View File

@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
shutil.copy2(x + '.manifest', env.dll_dir)
bindir = os.path.join(PREFIX, 'bin')
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
copybin(os.path.join(bindir, x + '.exe'))
for f in glob.glob(os.path.join(bindir, '*.dll')):
if re.search(r'(easylzma|icutest)', f.lower()) is None:

View File

@ -3,6 +3,7 @@
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import os
import re
import unicodedata
@ -61,6 +62,19 @@ def is_fmt_ok(input_fmt):
return input_plugin
def pdftotext(path):
    '''
    Extract the text of the PDF file at ``path`` by running the pdftotext
    executable and capturing its standard output.

    The executable is run with the PDF's base name as its input argument and
    ``-`` as its output argument, using the PDF's directory as the working
    directory.

    :param path: Path to the PDF file.
    :return: The captured output, passed through clean_ascii_chars() and
             decoded as UTF-8 with undecodable bytes replaced, or an empty
             string if pdftotext exits with a non-zero status.
    '''
    import subprocess

    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
    from calibre.utils.cleantext import clean_ascii_chars

    # os.path.dirname() is '' for a bare file name, and an empty string is not
    # a usable working directory. Presumably popen() forwards cwd to
    # subprocess.Popen, where None means "inherit the current directory".
    workdir = os.path.dirname(path) or None
    cmd = [PDFTOTEXT] + '-enc UTF-8 -nodiag -eol unix'.split() + [os.path.basename(path), '-']
    p = popen(cmd, cwd=workdir, stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
    try:
        raw = p.stdout.read()
    finally:
        # Always release the pipe and reap the child, even if reading fails.
        p.stdout.close()
        returncode = p.wait()
    if returncode != 0:
        return ''
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
def extract_text(pathtoebook):
input_fmt = pathtoebook.rpartition('.')[-1].upper()
ans = ''
@ -68,6 +82,8 @@ def extract_text(pathtoebook):
if not input_plugin:
return ans
input_plugin = plugin_for_input_format(input_fmt)
if input_fmt == 'PDF':
return pdftotext(pathtoebook)
with TemporaryDirectory() as tdir:
texts = []
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)

View File

@ -3,12 +3,14 @@
import builtins
import os
import sys
import tempfile
from apsw import Connection
from calibre.constants import plugins
from calibre.db.tests.base import BaseTest
from calibre.db.annotations import unicode_normalize
from calibre.db.tests.base import BaseTest
def print(*args, **kwargs):
@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
def tokenize(text, flags=None, remove_diacritics=True):
    '''
    Tokenize ``text`` using the tokenize() function of the sqlite_extension
    module.

    :param text: Text to tokenize; it is passed through unicode_normalize()
                 before being handed to the extension.
    :param flags: Tokenizer flags; FTS5_TOKENIZE_DOCUMENT is used when None.
    :param remove_diacritics: Forwarded unchanged to the extension's tokenize().
    :return: Whatever the extension's tokenize() returns.
    '''
    # Note: this import shadows this function's own name inside the body, so
    # the call below reaches the extension function, not this wrapper.
    from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize
    if flags is None:
        flags = FTS5_TOKENIZE_DOCUMENT
    return tokenize(unicode_normalize(text), remove_diacritics, flags)
@ -183,6 +185,73 @@ class FTSTest(BaseTest):
# }}}
def test_pdftotext(self):
    # A small single-page PDF whose content stream draws the text
    # "Hello World"; pdftotext() is expected to return exactly that text.
    pdf_data = '''\
%PDF-1.1
%¥±ë
1 0 obj
<< /Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<< /Type /Pages
/Kids [3 0 R]
/Count 1
/MediaBox [0 0 300 144]
>>
endobj
3 0 obj
<< /Type /Page
/Parent 2 0 R
/Resources
<< /Font
<< /F1
<< /Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
>>
>>
/Contents 4 0 R
>>
endobj
4 0 obj
<< /Length 55 >>
stream
BT
/F1 18 Tf
0 0 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000018 00000 n
0000000077 00000 n
0000000178 00000 n
0000000457 00000 n
trailer
<< /Root 1 0 R
/Size 5
>>
startxref
565
%%EOF'''
    with tempfile.TemporaryDirectory() as tdir:
        pdf = os.path.join(tdir, 'test.pdf')
        # Write explicit UTF-8 bytes: text mode would use the locale encoding
        # (which may be unable to encode the non-ASCII header comment) and
        # would translate newlines on Windows, so the bytes on disk would
        # vary by platform.
        with open(pdf, 'wb') as f:
            f.write(pdf_data.encode('utf-8'))
        from calibre.db.fts.text import pdftotext
        self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
def find_tests():
import unittest

View File

@ -33,6 +33,7 @@ if iswindows and hasattr(sys, 'frozen'):
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
if (islinux or isbsd) and getattr(sys, 'frozen', False):
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
# pdftotext is looked up in the same directory as pdftohtml, with an .exe
# suffix on Windows.
PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):

View File

@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
@unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
def test_executables(self):
from calibre.utils.ipc.launch import Worker
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
w = Worker({})
self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
if iswindows:
from calibre.devices.usbms.device import eject_exe
self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())