mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Use pdftotext to index PDF files
Much faster and less resource intensive than pdftohtml
This commit is contained in:
parent
1d67f1e923
commit
cde3ff211c
@ -38,7 +38,7 @@ qt_get_dll_path = partial(get_dll_path, loc=os.path.join(QT_PREFIX, 'lib'))
|
|||||||
|
|
||||||
def binary_includes():
|
def binary_includes():
|
||||||
return [
|
return [
|
||||||
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'optipng', 'JxrDecApp')] + [
|
j(PREFIX, 'bin', x) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'optipng', 'JxrDecApp')] + [
|
||||||
|
|
||||||
j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
|
j(PREFIX, 'private', 'mozjpeg', 'bin', x) for x in ('jpegtran', 'cjpeg')] + [
|
||||||
] + list(map(
|
] + list(map(
|
||||||
|
@ -484,7 +484,7 @@ class Freeze:
|
|||||||
print('\nAdding poppler')
|
print('\nAdding poppler')
|
||||||
for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
|
for x in ('libopenjp2.7.dylib', 'libpoppler.115.dylib',):
|
||||||
self.install_dylib(join(PREFIX, 'lib', x))
|
self.install_dylib(join(PREFIX, 'lib', x))
|
||||||
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
|
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo', 'pdftotext'):
|
||||||
self.install_dylib(
|
self.install_dylib(
|
||||||
join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)
|
join(PREFIX, 'bin', x), set_id=False, dest=self.helpers_dir)
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ def freeze(env, ext_dir, incdir):
|
|||||||
shutil.copy2(x + '.manifest', env.dll_dir)
|
shutil.copy2(x + '.manifest', env.dll_dir)
|
||||||
|
|
||||||
bindir = os.path.join(PREFIX, 'bin')
|
bindir = os.path.join(PREFIX, 'bin')
|
||||||
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
|
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm', 'pdftotext', 'jpegtran-calibre', 'cjpeg-calibre', 'optipng-calibre', 'JXRDecApp-calibre'):
|
||||||
copybin(os.path.join(bindir, x + '.exe'))
|
copybin(os.path.join(bindir, x + '.exe'))
|
||||||
for f in glob.glob(os.path.join(bindir, '*.dll')):
|
for f in glob.glob(os.path.join(bindir, '*.dll')):
|
||||||
if re.search(r'(easylzma|icutest)', f.lower()) is None:
|
if re.search(r'(easylzma|icutest)', f.lower()) is None:
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
@ -61,6 +62,19 @@ def is_fmt_ok(input_fmt):
|
|||||||
return input_plugin
|
return input_plugin
|
||||||
|
|
||||||
|
|
||||||
|
def pdftotext(path):
    '''
    Extract the text of the PDF file at ``path`` by running the pdftotext
    executable.

    The executable is started with the directory containing the PDF as its
    working directory and is passed only the base name of the file. Its
    standard output is captured; standard input is connected to the null
    device.

    :param path: Path to the PDF file.
    :return: The captured output, passed through clean_ascii_chars() and
             decoded as UTF-8 with undecodable bytes replaced. An empty
             string is returned if pdftotext exits with a non-zero status.
    '''
    import subprocess

    from calibre.ebooks.pdf.pdftohtml import PDFTOTEXT, popen
    from calibre.utils.cleantext import clean_ascii_chars
    cmd = [PDFTOTEXT] + '-enc UTF-8 -nodiag -eol unix'.split() + [os.path.basename(path), '-']
    # A bare file name has an empty dirname and an empty cwd makes process
    # creation fail, so fall back to None (inherit the current directory).
    workdir = os.path.dirname(path) or None
    p = popen(cmd, cwd=workdir, stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
    try:
        raw = p.stdout.read()
    finally:
        # Always release the pipe and reap the child, even if reading failed,
        # so no file descriptor or zombie process is left behind.
        p.stdout.close()
        returncode = p.wait()
    if returncode != 0:
        return ''
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
|
||||||
|
|
||||||
|
|
||||||
def extract_text(pathtoebook):
|
def extract_text(pathtoebook):
|
||||||
input_fmt = pathtoebook.rpartition('.')[-1].upper()
|
input_fmt = pathtoebook.rpartition('.')[-1].upper()
|
||||||
ans = ''
|
ans = ''
|
||||||
@ -68,6 +82,8 @@ def extract_text(pathtoebook):
|
|||||||
if not input_plugin:
|
if not input_plugin:
|
||||||
return ans
|
return ans
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
|
if input_fmt == 'PDF':
|
||||||
|
return pdftotext(pathtoebook)
|
||||||
with TemporaryDirectory() as tdir:
|
with TemporaryDirectory() as tdir:
|
||||||
texts = []
|
texts = []
|
||||||
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
|
|
||||||
|
|
||||||
import builtins
|
import builtins
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
from apsw import Connection
|
from apsw import Connection
|
||||||
|
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
from calibre.db.tests.base import BaseTest
|
|
||||||
from calibre.db.annotations import unicode_normalize
|
from calibre.db.annotations import unicode_normalize
|
||||||
|
from calibre.db.tests.base import BaseTest
|
||||||
|
|
||||||
|
|
||||||
def print(*args, **kwargs):
|
def print(*args, **kwargs):
|
||||||
@ -51,7 +53,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
|||||||
|
|
||||||
|
|
||||||
def tokenize(text, flags=None, remove_diacritics=True):
|
def tokenize(text, flags=None, remove_diacritics=True):
|
||||||
from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
|
from calibre_extensions.sqlite_extension import FTS5_TOKENIZE_DOCUMENT, tokenize
|
||||||
if flags is None:
|
if flags is None:
|
||||||
flags = FTS5_TOKENIZE_DOCUMENT
|
flags = FTS5_TOKENIZE_DOCUMENT
|
||||||
return tokenize(unicode_normalize(text), remove_diacritics, flags)
|
return tokenize(unicode_normalize(text), remove_diacritics, flags)
|
||||||
@ -183,6 +185,73 @@ class FTSTest(BaseTest):
|
|||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
def test_pdftotext(self):
|
||||||
|
pdf_data = '''\
|
||||||
|
%PDF-1.1
|
||||||
|
%¥±ë
|
||||||
|
|
||||||
|
1 0 obj
|
||||||
|
<< /Type /Catalog
|
||||||
|
/Pages 2 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
2 0 obj
|
||||||
|
<< /Type /Pages
|
||||||
|
/Kids [3 0 R]
|
||||||
|
/Count 1
|
||||||
|
/MediaBox [0 0 300 144]
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
3 0 obj
|
||||||
|
<< /Type /Page
|
||||||
|
/Parent 2 0 R
|
||||||
|
/Resources
|
||||||
|
<< /Font
|
||||||
|
<< /F1
|
||||||
|
<< /Type /Font
|
||||||
|
/Subtype /Type1
|
||||||
|
/BaseFont /Times-Roman
|
||||||
|
>>
|
||||||
|
>>
|
||||||
|
>>
|
||||||
|
/Contents 4 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
4 0 obj
|
||||||
|
<< /Length 55 >>
|
||||||
|
stream
|
||||||
|
BT
|
||||||
|
/F1 18 Tf
|
||||||
|
0 0 Td
|
||||||
|
(Hello World) Tj
|
||||||
|
ET
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
xref
|
||||||
|
0 5
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000018 00000 n
|
||||||
|
0000000077 00000 n
|
||||||
|
0000000178 00000 n
|
||||||
|
0000000457 00000 n
|
||||||
|
trailer
|
||||||
|
<< /Root 1 0 R
|
||||||
|
/Size 5
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
565
|
||||||
|
%%EOF'''
|
||||||
|
with tempfile.TemporaryDirectory() as tdir:
|
||||||
|
pdf = os.path.join(tdir, 'test.pdf')
|
||||||
|
with open(pdf, 'w') as f:
|
||||||
|
f.write(pdf_data)
|
||||||
|
from calibre.db.fts.text import pdftotext
|
||||||
|
self.assertEqual(pdftotext(pdf).strip(), 'Hello World')
|
||||||
|
|
||||||
|
|
||||||
def find_tests():
|
def find_tests():
|
||||||
import unittest
|
import unittest
|
||||||
|
@ -33,6 +33,7 @@ if iswindows and hasattr(sys, 'frozen'):
|
|||||||
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
|
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
|
||||||
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
||||||
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
||||||
|
PDFTOTEXT = os.path.join(os.path.dirname(PDFTOHTML), 'pdftotext' + ('.exe' if iswindows else ''))
|
||||||
|
|
||||||
|
|
||||||
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
||||||
|
@ -409,11 +409,12 @@ class BuildTest(unittest.TestCase):
|
|||||||
@unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
|
@unittest.skipUnless(getattr(sys, 'frozen', False), 'Only makes sense to test executables in frozen builds')
|
||||||
def test_executables(self):
|
def test_executables(self):
|
||||||
from calibre.utils.ipc.launch import Worker
|
from calibre.utils.ipc.launch import Worker
|
||||||
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
|
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML, PDFTOTEXT
|
||||||
w = Worker({})
|
w = Worker({})
|
||||||
self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
|
self.assertTrue(os.path.exists(w.executable), 'calibre-parallel (%s) does not exist' % w.executable)
|
||||||
self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
|
self.assertTrue(os.path.exists(w.gui_executable), 'calibre-parallel-gui (%s) does not exist' % w.gui_executable)
|
||||||
self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
|
self.assertTrue(os.path.exists(PDFTOHTML), 'pdftohtml (%s) does not exist' % PDFTOHTML)
|
||||||
|
self.assertTrue(os.path.exists(PDFTOTEXT), 'pdftotext (%s) does not exist' % PDFTOTEXT)
|
||||||
if iswindows:
|
if iswindows:
|
||||||
from calibre.devices.usbms.device import eject_exe
|
from calibre.devices.usbms.device import eject_exe
|
||||||
self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())
|
self.assertTrue(os.path.exists(eject_exe()), 'calibre-eject.exe (%s) does not exist' % eject_exe())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user