diff --git a/bypy/linux/__main__.py b/bypy/linux/__main__.py
index 93d4d39705..863b31357e 100644
--- a/bypy/linux/__main__.py
+++ b/bypy/linux/__main__.py
@@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))
 
 def binary_includes():
     return [
-        j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [
+        j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [
 
         j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
         ] + list(map(
diff --git a/bypy/macos/__main__.py b/bypy/macos/__main__.py
index ea7037793e..bca00451d2 100644
--- a/bypy/macos/__main__.py
+++ b/bypy/macos/__main__.py
@@ -484,7 +484,7 @@ class Freeze:
         print('\nAdding poppler')
         for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
             self.install_dylib(join(PREFIX, 'lib', x))
-        for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
+        for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
             self.install_dylib(
                 join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)
 
diff --git a/bypy/windows/__main__.py b/bypy/windows/__main__.py
index 20d181fea3..aff1b5a49b 100644
--- a/bypy/windows/__main__.py
+++ b/bypy/windows/__main__.py
@@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
         shutil.copy2(x + '.manifest', env.dll_dir)
 
     bindir = os.path.join(PREFIX, 'bin')
-    for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
+    for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
         copybin(os.path.join(bindir, x + '.exe'))
     for f in glob.glob(os.path.join(bindir, '*.dll')):
         if re.search(r'(easylzma|icutest)', f.lower()) is None:
diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py
index 3da5698b5d..faff0fd618 100644
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@@ -3,6 +3,7 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal
 
 
+import os
 import re
 import unicodedata
 
@@ -61,6 +62,31 @@ def is_fmt_ok(input_fmt):
     return input_plugin
 
 
+def pdftotext(path):
+    '''
+    Extract the text of the PDF file at path using the pdftotext executable.
+
+    Returns the output decoded as UTF-8 (undecodable bytes are replaced)
+    after passing it through clean_ascii_chars(), or the empty string if
+    pdftotext exits with a non-zero status.
+    '''
+    import subprocess
+
+    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
+    from calibre.utils.cleantext import clean_ascii_chars
+    # pdftotext is run from the directory of the file, so make sure the path
+    # has a directory component, an empty cwd is not usable
+    path = os.path.abspath(path)
+    cmd = [PDFTOTEXT] + '-enc UTF-8 -nodiag -eol unix'.split() + [os.path.basename(path), '-']
+    p = popen(cmd, cwd=os.path.dirname(path), stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
+    raw = p.stdout.read()
+    # close the pipe explicitly instead of leaving it to the garbage collector
+    p.stdout.close()
+    if p.wait() != 0:
+        return ''
+    return clean_ascii_chars(raw).decode('utf-8', 'replace')
+
+
 def extract_text(pathtoebook):
     input_fmt = pathtoebook.rpartition('.')[-1].upper()
     ans = ''
@@ -68,6 +94,8 @@ def extract_text(pathtoebook):
     if not input_plugin:
         return ans
     input_plugin = plugin_for_input_format(input_fmt)
+    if input_fmt == 'PDF':
+        return pdftotext(pathtoebook)
     with TemporaryDirectory() as tdir:
         texts = []
         book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
diff --git a/src/calibre/db/tests/fts.py b/src/calibre/db/tests/fts.py
index dd278a6bfc..5c4c0abd23 100644
--- a/src/calibre/db/tests/fts.py
+++ b/src/calibre/db/tests/fts.py
@@ -3,12 +3,14 @@
 
 
 import builtins
+import os
 import sys
+import tempfile
 from apsw import Connection
 
 from calibre.constants import plugins
-from calibre.db.tests.base import BaseTest
 from calibre.db.annotations import unicode_normalize
+from calibre.db.tests.base import BaseTest
 
 
 def print(*args, **kwargs):
@@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
 
 
 def tokenize(text, flags=None, remove_diacritics=True):
-    from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
+    from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize
     if flags is None:
         flags = FTS5_TOKENIZE_DOCUMENT
     return tokenize(unicode_normalize(text), remove_diacritics, flags)
@@ -183,6 +185,74 @@ class FTSTest(BaseTest):
 
     # }}}
 
+    def test_pdftotext(self):
+        pdf_data = '''\
+%PDF-1.1
+%¥±ë
+
+1 0 obj
+  << /Type /Catalog
+     /Pages 2 0 R
+  >>
+endobj
+
+2 0 obj
+  << /Type /Pages
+     /Kids [3 0 R]
+     /Count 1
+     /MediaBox [0 0 300 144]
+  >>
+endobj
+
+3 0 obj
+  <<  /Type /Page
+      /Parent 2 0 R
+      /Resources
+       << /Font
+           << /F1
+               << /Type /Font
+                  /Subtype /Type1
+                  /BaseFont /Times-Roman
+               >>
+           >>
+       >>
+      /Contents 4 0 R
+  >>
+endobj
+
+4 0 obj
+  << /Length 55 >>
+stream
+  BT
+    /F1 18 Tf
+    0 0 Td
+    (Hello World) Tj
+  ET
+endstream
+endobj
+
+xref
+0 5
+0000000000 65535 f
+0000000018 00000 n
+0000000077 00000 n
+0000000178 00000 n
+0000000457 00000 n
+trailer
+  <<  /Root 1 0 R
+      /Size 5
+  >>
+startxref
+565
+%%EOF'''
+        with tempfile.TemporaryDirectory() as tdir:
+            pdf = os.path.join(tdir, 'test.pdf')
+            # the xref offsets in pdf_data are for UTF-8 bytes with \n line endings
+            with open(pdf, 'wb') as f:
+                f.write(pdf_data.encode('utf-8'))
+            from calibre.db.fts.text import pdftotext
+            self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
+
 
 def find_tests():
     import unittest
diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py
index 7fb4345840..3e8145b977 100644
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -33,6 +33,8 @@ if iswindows and hasattr(sys, 'frozen'):
     PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
 if (islinux or isbsd) and getattr(sys, 'frozen', False):
     PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
+# pdftotext is looked for in the same directory as pdftohtml
+PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))
 
 
 def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index d366edf7f5..c0f5029216 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
     @unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
     def test_executables(self):
         from calibre.utils.ipc.launch import Worker
-        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
+        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
         w = Worker({})
         self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
         self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
         self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
+        self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
         if iswindows:
             from calibre.devices.usbms.device import eject_exe
             self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())