Use pdftotext to index PDF files

Much faster and less resource intensive than pdftohtml
2025-06-23 15:30:45 -04:00 · 2022-06-29 16:29:29 +05:30 · 2022-06-29 16:29:29 +05:30 · cde3ff211c
commit cde3ff211c
parent 1d67f1e923
7 changed files with 93 additions and 6 deletions
--- a/bypy/linux/main.py
+++ b/bypy/linux/main.py
@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))
 def binary_includes():
    return [
-        j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [
+        j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [
        j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
        ] + list(map(
--- a/bypy/macos/main.py
+++ b/bypy/macos/main.py
@ -484,7 +484,7 @@ class Freeze:
        print('\nAdding poppler')
        for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
            self.install_dylib(join(PREFIX, 'lib', x))
-        for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
+        for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
            self.install_dylib(
                join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)
--- a/bypy/windows/main.py
+++ b/bypy/windows/main.py
@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
            shutil.copy2(x + '.manifest', env.dll_dir)
    bindir = os.path.join(PREFIX, 'bin')
-    for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
+    for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
        copybin(os.path.join(bindir, x + '.exe'))
    for f in glob.glob(os.path.join(bindir, '*.dll')):
        if re.search(r'(easylzma|icutest)', f.lower()) is None:
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@ -3,6 +3,7 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
 import os
 import re
 import unicodedata
@ -61,6 +62,19 @@ def is_fmt_ok(input_fmt):
    return input_plugin
 def pdftotext(path):
    '''
    Extract the plain text of the PDF file at ``path`` by running the
    pdftotext executable.

    pdftotext is started in the directory that contains the file and is
    given only the file's base name. Its standard output is captured (the
    trailing ``-`` argument selects stdout as the output file).

    :param path: Path to the PDF file.
    :return: The captured output decoded as UTF-8, with undecodable bytes
        replaced, or an empty string if pdftotext exits with a non-zero
        status.
    '''
    import subprocess
    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
    from calibre.utils.cleantext import clean_ascii_chars
    cmd = [PDFTOTEXT] + '-enc UTF-8 -nodiag -eol unix'.split() + [os.path.basename(path), '-']
    # os.path.dirname() returns '' for a bare file name, and an empty cwd is
    # not a valid directory for a child process. Fall back to None so the
    # child simply inherits the current directory, where the base name
    # resolves correctly. (Assumes popen forwards cwd to subprocess.Popen.)
    workdir = os.path.dirname(path) or None
    p = popen(cmd, cwd=workdir, stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
    try:
        raw = p.stdout.read()
    finally:
        # Close the pipe explicitly rather than leaving the descriptor open
        # until the process object is garbage collected.
        p.stdout.close()
    if p.wait() != 0:
        return ''
    # NOTE(review): clean_ascii_chars is handed bytes here and the result is
    # decoded afterwards -- confirm that the helper supports bytes input.
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
 def extract_text(pathtoebook):
    input_fmt = pathtoebook.rpartition('.')[-1].upper()
    ans = ''
@ -68,6 +82,8 @@ def extract_text(pathtoebook):
    if not input_plugin:
        return ans
    input_plugin = plugin_for_input_format(input_fmt)
    if input_fmt == 'PDF':
        return pdftotext(pathtoebook)
    with TemporaryDirectory() as tdir:
        texts = []
        book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
--- a/src/calibre/db/tests/fts.py
+++ b/src/calibre/db/tests/fts.py
@ -3,12 +3,14 @@
 import builtins
 import os
 import sys
 import tempfile
 from apsw import Connection
 from calibre.constants import plugins
 from calibre.db.tests.base import BaseTest
 from calibre.db.annotations import unicode_normalize
 from calibre.db.tests.base import BaseTest
 def print(*args, **kwargs):
@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
 def tokenize(text, flags=None, remove_diacritics=True):
-    from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
+    from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize
    if flags is None:
        flags = FTS5_TOKENIZE_DOCUMENT
    return tokenize(unicode_normalize(text), remove_diacritics, flags)
@ -183,6 +185,73 @@ class FTSTest(BaseTest):
    # }}}
    def test_pdftotext(self):
        pdf_data = '''\
 %PDF-1.1
 %¥±ë
 1 0 obj
  << /Type /Catalog
     /Pages 2 0 R
  >>
 endobj
 2 0 obj
  << /Type /Pages
     /Kids [3 0 R]
     /Count 1
     /MediaBox [0 0 300 144]
  >>
 endobj
 3 0 obj
  <<  /Type /Page
      /Parent 2 0 R
      /Resources
       << /Font
           << /F1
               << /Type /Font
                  /Subtype /Type1
                  /BaseFont /Times-Roman
               >>
           >>
       >>
      /Contents 4 0 R
  >>
 endobj
 4 0 obj
  << /Length 55 >>
 stream
  BT
    /F1 18 Tf
    0 0 Td
    (Hello World) Tj
  ET
 endstream
 endobj
 xref
 0 5
 0000000000 65535 f
 0000000018 00000 n
 0000000077 00000 n
 0000000178 00000 n
 0000000457 00000 n
 trailer
  <<  /Root 1 0 R
      /Size 5
  >>
 startxref
 565
 %%EOF'''
        with tempfile.TemporaryDirectory() as tdir:
            pdf = os.path.join(tdir, 'test.pdf')
            with open(pdf, 'w') as f:
                f.write(pdf_data)
            from calibre.db.fts.text import pdftotext
            self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
 def find_tests():
    import unittest
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -33,6 +33,7 @@ if iswindows and hasattr(sys, 'frozen'):
    PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
 if (islinux or isbsd) and getattr(sys, 'frozen', False):
    PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
 PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))
 def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
    @unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
    def test_executables(self):
        from calibre.utils.ipc.launch import Worker
-        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
+        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
        w = Worker({})
        self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
        self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
        self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
        self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
        if iswindows:
            from calibre.devices.usbms.device import eject_exe
            self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())