Use pdftotext to index PDF files

Much faster and less resource intensive than pdftohtml
2025-06-23 15:30:45 -04:00 · 2022-06-29 16:29:29 +05:30 · 2022-06-29 16:29:29 +05:30 · cde3ff211c
commit cde3ff211c
parent 1d67f1e923
7 changed files with 93 additions and 6 deletions
--- a/bypy/linux/main.py
+++ b/bypy/linux/main.py
@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))

 def binary_includes():
    return [
-        j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [
+        j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [

        j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
        ] + list(map(
--- a/bypy/macos/main.py
+++ b/bypy/macos/main.py
@ -484,7 +484,7 @@ class Freeze:
        print('\nAdding poppler')
        for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
            self.install_dylib(join(PREFIX, 'lib', x))
-        for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
+        for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
            self.install_dylib(
                join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)

--- a/bypy/windows/main.py
+++ b/bypy/windows/main.py
@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
            shutil.copy2(x + '.manifest', env.dll_dir)

    bindir = os.path.join(PREFIX, 'bin')
-    for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
+    for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
        copybin(os.path.join(bindir, x + '.exe'))
    for f in glob.glob(os.path.join(bindir, '*.dll')):
        if re.search(r'(easylzma|icutest)', f.lower()) is None:
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@ -3,6 +3,7 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>


+import os
 import re
 import unicodedata

@ -61,6 +62,19 @@ def is_fmt_ok(input_fmt):
    return input_plugin


+def pdftotext(path):
+    '''
+    Extract the text of the PDF file at ``path`` by running the pdftotext
+    executable and capturing what it writes to its standard output.
+
+    Returns the captured output, passed through clean_ascii_chars() and
+    decoded as UTF-8 with undecodable bytes replaced. Returns an empty
+    string if pdftotext exits with a non-zero status.
+    '''
+    import subprocess
+
+    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
+    from calibre.utils.cleantext import clean_ascii_chars
+    # The child is run from the directory containing the file and is handed
+    # only the file name. A bare file name has an empty dirname and an empty
+    # string is not a usable working directory, so fall back to None, which
+    # leaves the working directory unchanged.
+    cwd = os.path.dirname(path) or None
+    cmd = [PDFTOTEXT] + '-enc UTF-8 -nodiag -eol unix'.split() + [os.path.basename(path), '-']
+    p = popen(cmd, cwd=cwd, stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
+    # Read all of stdout before waiting, so the exit status is only checked
+    # once the output has been fully consumed
+    raw = p.stdout.read()
+    if p.wait() != 0:
+        return ''
+    return clean_ascii_chars(raw).decode('utf-8', 'replace')
+
+
 def extract_text(pathtoebook):
    input_fmt = pathtoebook.rpartition('.')[-1].upper()
    ans = ''
@ -68,6 +82,8 @@ def extract_text(pathtoebook):
    if not input_plugin:
        return ans
    input_plugin = plugin_for_input_format(input_fmt)
+    if input_fmt == 'PDF':
+        return pdftotext(pathtoebook)
    with TemporaryDirectory() as tdir:
        texts = []
        book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
--- a/src/calibre/db/tests/fts.py
+++ b/src/calibre/db/tests/fts.py
@ -3,12 +3,14 @@


 import builtins
+import os
 import sys
+import tempfile
 from apsw import Connection

 from calibre.constants import plugins
-from calibre.db.tests.base import BaseTest
 from calibre.db.annotations import unicode_normalize
+from calibre.db.tests.base import BaseTest


 def print(*args, **kwargs):
@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);


 def tokenize(text, flags=None, remove_diacritics=True):
-    from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
+    from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize
    if flags is None:
        flags = FTS5_TOKENIZE_DOCUMENT
    return tokenize(unicode_normalize(text), remove_diacritics, flags)
@ -183,6 +185,73 @@ class FTSTest(BaseTest):

    # }}}

+    def test_pdftotext(self):
+        # Run pdftotext() on a minimal hand-written single page PDF and check
+        # that the text drawn on the page is what comes back.
+        pdf_data = '''\
+%PDF-1.1
+%¥±ë
+
+1 0 obj
+  << /Type /Catalog
+     /Pages 2 0 R
+  >>
+endobj
+
+2 0 obj
+  << /Type /Pages
+     /Kids [3 0 R]
+     /Count 1
+     /MediaBox [0 0 300 144]
+  >>
+endobj
+
+3 0 obj
+  <<  /Type /Page
+      /Parent 2 0 R
+      /Resources
+       << /Font
+           << /F1
+               << /Type /Font
+                  /Subtype /Type1
+                  /BaseFont /Times-Roman
+               >>
+           >>
+       >>
+      /Contents 4 0 R
+  >>
+endobj
+
+4 0 obj
+  << /Length 55 >>
+stream
+  BT
+    /F1 18 Tf
+    0 0 Td
+    (Hello World) Tj
+  ET
+endstream
+endobj
+
+xref
+0 5
+0000000000 65535 f
+0000000018 00000 n
+0000000077 00000 n
+0000000178 00000 n
+0000000457 00000 n
+trailer
+  <<  /Root 1 0 R
+      /Size 5
+  >>
+startxref
+565
+%%EOF'''
+        with tempfile.TemporaryDirectory() as tdir:
+            pdf = os.path.join(tdir, 'test.pdf')
+            # The xref table above holds byte offsets that are only correct
+            # when the text is encoded as UTF-8 with LF line endings (the
+            # first object is at offset 18). Write those exact bytes in binary
+            # mode so that neither the locale encoding nor platform newline
+            # translation can shift the offsets or fail on the non-ASCII
+            # header comment.
+            with open(pdf, 'wb') as f:
+                f.write(pdf_data.encode('utf-8'))
+            from calibre.db.fts.text import pdftotext
+            self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
+

 def find_tests():
    import unittest
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -33,6 +33,7 @@ if iswindows and hasattr(sys, 'frozen'):
    PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
 if (islinux or isbsd) and getattr(sys, 'frozen', False):
    PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
+PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))


 def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
    @unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
    def test_executables(self):
        from calibre.utils.ipc.launch import Worker
-        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
+        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
        w = Worker({})
        self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
        self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
        self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
+        self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
        if iswindows:
            from calibre.devices.usbms.device import eject_exe
            self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())