Basic test to tokenizer

This commit is contained in:
Kovid Goyal 2021-06-15 13:07:18 +05:30
parent c819fcb870
commit 79ea88ddb8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -3,25 +3,49 @@
# License: GPLv3 Copyright: 2021, Kovid Goyal <kovid at kovidgoyal.net>
from calibre.db.tests.base import BaseTest
import builtins
import sys
from apsw import Connection
from calibre.constants import plugins
from calibre.db.tests.base import BaseTest
def print(*args, **kwargs):
kwargs['file'] = sys.__stdout__
builtins.print(*args, **kwargs)
class TestConn(Connection):
def __init__(self):
def __init__(self, remove_diacritics=True):
super().__init__(':memory:')
plugins.load_apsw_extension(self, 'sqlite_extension')
self.cursor().execute("CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 remove_diacritics 2')")
options = []
if remove_diacritics:
options.append('remove_diacritics'), options.append('2')
options = ' '.join(options)
self.execute(f'''
CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 {options}');
CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
''')
def execute(self, *a):
return self.cursor().execute(*a)
def insert_text(self, text):
self.cursor().execute('INSERT INTO fts_table(t) VALUES (?)', (text,))
self.execute('INSERT INTO fts_table(t) VALUES (?)', (text,))
def term_row_counts(self):
return dict(self.execute('SELECT term,doc FROM fts_row'))
class FTSTest(BaseTest):
ae = BaseTest.assertEqual
def test_basic_fts(self): # {{{
conn = TestConn()
conn.insert_text('two words, and a period. With another.')
conn.insert_text('and another')
self.ae(conn.term_row_counts(), {'a': 1, 'and': 2, 'another': 2, 'period': 1, 'two': 1, 'with': 1, 'words': 1})
# }}}