mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Use pdftotext to index PDF files
Much faster and less resource intensive than pdftohtml
This commit is contained in:
parent
1d67f1e923
commit
cde3ff211c
@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))
|
||||
|
||||
def binary_includes():
|
||||
return [
|
||||
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [
|
||||
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [
|
||||
|
||||
j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
|
||||
] + list(map(
|
||||
|
@ -484,7 +484,7 @@ class Freeze:
|
||||
print('\nAdding poppler')
|
||||
for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
|
||||
self.install_dylib(join(PREFIX, 'lib', x))
|
||||
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
|
||||
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
|
||||
self.install_dylib(
|
||||
join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)
|
||||
|
||||
|
@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
|
||||
shutil.copy2(x + '.manifest', env.dll_dir)
|
||||
|
||||
bindir = os.path.join(PREFIX, 'bin')
|
||||
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
|
||||
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
|
||||
copybin(os.path.join(bindir, x + '.exe'))
|
||||
for f in glob.glob(os.path.join(bindir, '*.dll')):
|
||||
if re.search(r'(easylzma|icutest)', f.lower()) is None:
|
||||
|
@ -3,6 +3,7 @@
|
||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
@ -61,6 +62,19 @@ def is_fmt_ok(input_fmt):
|
||||
return input_plugin
|
||||
|
||||
|
||||
def pdftotext(path):
    '''
    Extract the text of the PDF file at ``path`` using the pdftotext executable.

    The tool is run with the PDF's directory as its working directory and
    only the file's base name on the command line; its output is read from
    stdout (``-`` as the output file).

    :param path: Path to the PDF file.
    :return: The extracted text as a str, decoded as UTF-8 with undecodable
             bytes replaced, after passing the raw bytes through
             clean_ascii_chars(). An empty string if pdftotext exits with a
             non-zero status.
    '''
    import subprocess

    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
    from calibre.utils.cleantext import clean_ascii_chars

    # Resolve to an absolute path so that dirname() is never '' for a bare
    # file name: an empty cwd makes process creation fail.
    path = os.path.abspath(path)
    cmd = [PDFTOTEXT, '-enc', 'UTF-8', '-nodiag', '-eol', 'unix', os.path.basename(path), '-']
    p = popen(cmd, cwd=os.path.dirname(path), stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
    try:
        raw = p.stdout.read()
    finally:
        # Always release the pipe and reap the child, even if read() fails,
        # so neither a file descriptor nor a zombie process is leaked.
        p.stdout.close()
        returncode = p.wait()
    if returncode != 0:
        return ''
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
|
||||
|
||||
|
||||
def extract_text(pathtoebook):
|
||||
input_fmt = pathtoebook.rpartition('.')[-1].upper()
|
||||
ans = ''
|
||||
@ -68,6 +82,8 @@ def extract_text(pathtoebook):
|
||||
if not input_plugin:
|
||||
return ans
|
||||
input_plugin = plugin_for_input_format(input_fmt)
|
||||
if input_fmt == 'PDF':
|
||||
return pdftotext(pathtoebook)
|
||||
with TemporaryDirectory() as tdir:
|
||||
texts = []
|
||||
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
||||
|
@ -3,12 +3,14 @@
|
||||
|
||||
|
||||
import builtins
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from apsw import Connection
|
||||
|
||||
from calibre.constants import plugins
|
||||
from calibre.db.tests.base import BaseTest
|
||||
from calibre.db.annotations import unicode_normalize
|
||||
from calibre.db.tests.base import BaseTest
|
||||
|
||||
|
||||
def print(*args, **kwargs):
|
||||
@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
||||
|
||||
|
||||
def tokenize(text, flags=None, remove_diacritics=True):
    '''
    Run ``text`` through the tokenize() function of the sqlite_extension
    module after passing it through unicode_normalize().

    :param text: The text to tokenize.
    :param flags: Flags handed to the extension's tokenize(); when None,
                  FTS5_TOKENIZE_DOCUMENT is used.
    :param remove_diacritics: Passed through unchanged to the extension's
                              tokenize().
    :return: Whatever the extension's tokenize() returns.
    '''
    from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize as _ext_tokenize

    effective_flags = FTS5_TOKENIZE_DOCUMENT if flags is None else flags
    normalized = unicode_normalize(text)
    return _ext_tokenize(normalized, remove_diacritics, effective_flags)
|
||||
@ -183,6 +185,73 @@ class FTSTest(BaseTest):
|
||||
|
||||
# }}}
|
||||
|
||||
def test_pdftotext(self):
    # Check that pdftotext() extracts the single text run from a minimal,
    # hand-written one page PDF.
    #
    # The object indentation inside the literal is significant: the xref
    # table and startxref below record byte offsets (18, 77, 178, 457, 565)
    # that only hold for exactly this layout when the data is encoded as
    # UTF-8 with plain '\n' line endings.
    pdf_data = '''\
%PDF-1.1
%¥±ë

1 0 obj
  << /Type /Catalog
     /Pages 2 0 R
  >>
endobj

2 0 obj
  << /Type /Pages
     /Kids [3 0 R]
     /Count 1
     /MediaBox [0 0 300 144]
  >>
endobj

3 0 obj
  <<  /Type /Page
      /Parent 2 0 R
      /Resources
       << /Font
           << /F1
               << /Type /Font
                  /Subtype /Type1
                  /BaseFont /Times-Roman
               >>
           >>
       >>
      /Contents 4 0 R
  >>
endobj

4 0 obj
  << /Length 55 >>
stream
  BT
    /F1 18 Tf
    0 0 Td
    (Hello World) Tj
  ET
endstream
endobj

xref
0 5
0000000000 65535 f
0000000018 00000 n
0000000077 00000 n
0000000178 00000 n
0000000457 00000 n
trailer
<< /Root 1 0 R
/Size 5
>>
startxref
565
%%EOF'''
    with tempfile.TemporaryDirectory() as tdir:
        pdf = os.path.join(tdir, 'test.pdf')
        # Write bytes, not text: text mode would use the platform's default
        # encoding and newline translation, changing the byte offsets the
        # xref table depends on.
        with open(pdf, 'wb') as f:
            f.write(pdf_data.encode('utf-8'))
        from calibre.db.fts.text import pdftotext
        self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
|
||||
|
||||
|
||||
def find_tests():
|
||||
import unittest
|
||||
|
@ -33,6 +33,7 @@ if iswindows and hasattr(sys, 'frozen'):
|
||||
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
|
||||
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
||||
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
||||
PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))
|
||||
|
||||
|
||||
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
||||
|
@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
|
||||
@unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
|
||||
def test_executables(self):
|
||||
from calibre.utils.ipc.launch import Worker
|
||||
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
|
||||
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
|
||||
w = Worker({})
|
||||
self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
|
||||
self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
|
||||
self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
|
||||
self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
|
||||
if iswindows:
|
||||
from calibre.devices.usbms.device import eject_exe
|
||||
self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())
|
||||
|
Loading…
x
Reference in New Issue
Block a user