mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Refactor hunspell to allow virtualization of file I/O
This commit is contained in:
parent
5f4110524b
commit
da7ef93e27
@ -72,7 +72,7 @@ extensions = [
|
|||||||
|
|
||||||
Extension('hunspell',
|
Extension('hunspell',
|
||||||
['hunspell/'+x for x in
|
['hunspell/'+x for x in
|
||||||
'affentry.cxx affixmgr.cxx csutil.cxx dictmgr.cxx filemgr.cxx hashmgr.cxx hunspell.cxx hunzip.cxx phonet.cxx replist.cxx suggestmgr.cxx'.split()
|
'affentry.cxx affixmgr.cxx csutil.cxx dictmgr.cxx filemgr.cxx hashmgr.cxx hunspell.cxx phonet.cxx replist.cxx suggestmgr.cxx'.split()
|
||||||
] + ['calibre/utils/spell/hunspell_wrapper.cpp',],
|
] + ['calibre/utils/spell/hunspell_wrapper.cpp',],
|
||||||
inc_dirs=['hunspell'],
|
inc_dirs=['hunspell'],
|
||||||
cflags='/DHUNSPELL_STATIC /D_CRT_SECURE_NO_WARNINGS /DUNICODE /D_UNICODE'.split() if iswindows else ['-DHUNSPELL_STATIC'],
|
cflags='/DHUNSPELL_STATIC /D_CRT_SECURE_NO_WARNINGS /DUNICODE /D_UNICODE'.split() if iswindows else ['-DHUNSPELL_STATIC'],
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
* Distributed under terms of the GPL3 license.
|
* Distributed under terms of the GPL3 license.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#define PY_SSIZE_T_CLEAN 1
|
||||||
#include <Python.h>
|
#include <Python.h>
|
||||||
#include <new>
|
#include <new>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -21,15 +22,16 @@ static PyObject *HunspellError = NULL;
|
|||||||
|
|
||||||
static int
|
static int
|
||||||
init_type(Dictionary *self, PyObject *args, PyObject *kwds) {
|
init_type(Dictionary *self, PyObject *args, PyObject *kwds) {
|
||||||
char *dpath = NULL, *apath = NULL;
|
char *dic = NULL, *aff = NULL;
|
||||||
|
Py_ssize_t diclen, afflen;
|
||||||
|
|
||||||
self->handle = NULL;
|
self->handle = NULL;
|
||||||
self->encoding = NULL;
|
self->encoding = NULL;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "ss", &dpath, &apath)) return 1;
|
if (!PyArg_ParseTuple(args, "s#s#", &dic, &diclen, &aff, &afflen)) return 1;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
self->handle = new (std::nothrow) Hunspell(apath, dpath);
|
self->handle = new (std::nothrow) Hunspell(aff, afflen, dic, diclen);
|
||||||
} catch (const std::exception &ex) {
|
} catch (const std::exception &ex) {
|
||||||
PyErr_SetString(HunspellError, ex.what());
|
PyErr_SetString(HunspellError, ex.what());
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
|
|
||||||
#include "csutil.hxx"
|
#include "csutil.hxx"
|
||||||
|
|
||||||
AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
|
AffixMgr::AffixMgr(const char *aff_data, const size_t aff_len, HashMgr** ptr, int * md)
|
||||||
{
|
{
|
||||||
// register hash manager and load affix data from aff file
|
// register hash manager and load affix data from aff file
|
||||||
pHMgr = ptr[0];
|
pHMgr = ptr[0];
|
||||||
@ -110,8 +110,8 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * k
|
|||||||
contclasses[j] = 0;
|
contclasses[j] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parse_file(affpath, key)) {
|
if (parse_file(aff_data, aff_len)) {
|
||||||
HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
|
HUNSPELL_WARNING(stderr, "Failure loading aff file\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cpdmin == -1) cpdmin = MINCPDLEN;
|
if (cpdmin == -1) cpdmin = MINCPDLEN;
|
||||||
@ -255,7 +255,7 @@ AffixMgr::~AffixMgr()
|
|||||||
|
|
||||||
|
|
||||||
// read in aff file and build up prefix and suffix entry objects
|
// read in aff file and build up prefix and suffix entry objects
|
||||||
int AffixMgr::parse_file(const char * affpath, const char * key)
|
int AffixMgr::parse_file(const char *aff_data, const size_t aff_len)
|
||||||
{
|
{
|
||||||
char * line; // io buffers
|
char * line; // io buffers
|
||||||
char ft; // affix type
|
char ft; // affix type
|
||||||
@ -268,9 +268,9 @@ int AffixMgr::parse_file(const char * affpath, const char * key)
|
|||||||
int firstline = 1;
|
int firstline = 1;
|
||||||
|
|
||||||
// open the affix file
|
// open the affix file
|
||||||
FileMgr * afflst = new FileMgr(affpath, key);
|
FileMgr * afflst = new FileMgr(aff_data, aff_len);
|
||||||
if (!afflst) {
|
if (!afflst) {
|
||||||
HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
|
HUNSPELL_WARNING(stderr, "error: could not open affix description file \n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -109,8 +109,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
AffixMgr(const char * affpath, HashMgr** ptr, int * md,
|
AffixMgr(const char *aff_data, const size_t aff_len, HashMgr** ptr, int * md);
|
||||||
const char * key = NULL);
|
|
||||||
~AffixMgr();
|
~AffixMgr();
|
||||||
struct hentry * affix_check(const char * word, int len,
|
struct hentry * affix_check(const char * word, int len,
|
||||||
const unsigned short needflag = (unsigned short) 0,
|
const unsigned short needflag = (unsigned short) 0,
|
||||||
@ -217,7 +216,7 @@ public:
|
|||||||
int get_fullstrip() const;
|
int get_fullstrip() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int parse_file(const char * affpath, const char * key);
|
int parse_file(const char *aff_data, const size_t aff_len);
|
||||||
int parse_flag(char * line, unsigned short * out, FileMgr * af);
|
int parse_flag(char * line, unsigned short * out, FileMgr * af);
|
||||||
int parse_num(char * line, int * out, FileMgr * af);
|
int parse_num(char * line, int * out, FileMgr * af);
|
||||||
int parse_cpdsyllable(char * line, FileMgr * af);
|
int parse_cpdsyllable(char * line, FileMgr * af);
|
||||||
|
@ -3,45 +3,42 @@
|
|||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#include "filemgr.hxx"
|
#include "filemgr.hxx"
|
||||||
|
|
||||||
int FileMgr::fail(const char * err, const char * par) {
|
FileMgr::FileMgr(const char *data, const size_t dlen) {
|
||||||
fprintf(stderr, err, par);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
FileMgr::FileMgr(const char * file, const char * key) {
|
|
||||||
linenum = 0;
|
linenum = 0;
|
||||||
hin = NULL;
|
last = 0;
|
||||||
fin = fopen(file, "r");
|
buf = new char[dlen+1];
|
||||||
if (!fin) {
|
memcpy(buf, data, dlen);
|
||||||
// check hzipped file
|
buf[dlen] = 0;
|
||||||
char * st = (char *) malloc(strlen(file) + strlen(HZIP_EXTENSION) + 1);
|
pos = buf;
|
||||||
if (st) {
|
buflen = dlen;
|
||||||
strcpy(st, file);
|
|
||||||
strcat(st, HZIP_EXTENSION);
|
|
||||||
hin = new Hunzip(st, key);
|
|
||||||
free(st);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!fin && !hin) fail(MSG_OPEN, file);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Release the private copy of the file data. getline() may already have
// freed it on reaching end of input, in which case buf is NULL here.
FileMgr::~FileMgr()
{
    delete[] buf;   // delete[] on a null pointer is a no-op
    buf = NULL;
    pos = NULL;
}
|
||||||
|
|
||||||
char * FileMgr::getline() {
|
char * FileMgr::getline() {
|
||||||
const char * l;
|
if (buf == NULL) return NULL;
|
||||||
|
if (((size_t)(pos - buf)) >= buflen) {
|
||||||
|
// free up the memory as it will not be needed anymore
|
||||||
|
delete[] buf; buf = NULL; pos = NULL; return NULL;
|
||||||
|
}
|
||||||
|
if (pos != buf) *pos = last; // Restore the character that was previously replaced by null
|
||||||
|
char *ans = pos;
|
||||||
|
// Move pos to the start of the next line
|
||||||
|
pos = (char *)memchr(pos, 10, buflen - (pos - buf));
|
||||||
|
if (pos == NULL) pos = buf + buflen + 1;
|
||||||
|
else pos++;
|
||||||
|
// Ensure the current line is null terminated
|
||||||
|
last = *pos;
|
||||||
|
*pos = 0;
|
||||||
linenum++;
|
linenum++;
|
||||||
if (fin) return fgets(in, BUFSIZE - 1, fin);
|
return ans;
|
||||||
if (hin && (l = hin->getline())) return strcpy(in, l);
|
|
||||||
linenum--;
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int FileMgr::getlinenum() {
|
int FileMgr::getlinenum() {
|
||||||
|
@ -4,20 +4,17 @@
|
|||||||
|
|
||||||
#include "hunvisapi.h"
|
#include "hunvisapi.h"
|
||||||
|
|
||||||
#include "hunzip.hxx"
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
class LIBHUNSPELL_DLL_EXPORTED FileMgr
|
class LIBHUNSPELL_DLL_EXPORTED FileMgr
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
FILE * fin;
|
char *buf;
|
||||||
Hunzip * hin;
|
char *pos;
|
||||||
char in[BUFSIZE + 50]; // input buffer
|
size_t buflen;
|
||||||
int fail(const char * err, const char * par);
|
char last;
|
||||||
int linenum;
|
int linenum;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
FileMgr(const char * filename, const char * key = NULL);
|
FileMgr(const char *data, const size_t dlen);
|
||||||
~FileMgr();
|
~FileMgr();
|
||||||
char * getline();
|
char * getline();
|
||||||
int getlinenum();
|
int getlinenum();
|
||||||
|
@ -10,9 +10,11 @@
|
|||||||
#include "csutil.hxx"
|
#include "csutil.hxx"
|
||||||
#include "atypes.hxx"
|
#include "atypes.hxx"
|
||||||
|
|
||||||
|
#define BUFSIZE 65536
|
||||||
|
|
||||||
// build a hash table from a munched word list
|
// build a hash table from a munched word list
|
||||||
|
|
||||||
HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
|
HashMgr::HashMgr(const char *aff_data, const size_t aff_len, const char *dic_data, const size_t dic_len)
|
||||||
{
|
{
|
||||||
tablesize = 0;
|
tablesize = 0;
|
||||||
tableptr = NULL;
|
tableptr = NULL;
|
||||||
@ -31,8 +33,8 @@ HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
|
|||||||
numaliasm = 0;
|
numaliasm = 0;
|
||||||
aliasm = NULL;
|
aliasm = NULL;
|
||||||
forbiddenword = FORBIDDENWORD; // forbidden word signing flag
|
forbiddenword = FORBIDDENWORD; // forbidden word signing flag
|
||||||
load_config(apath, key);
|
load_config(aff_data, aff_len);
|
||||||
int ec = load_tables(tpath, key);
|
int ec = load_tables(dic_data, dic_len);
|
||||||
if (ec) {
|
if (ec) {
|
||||||
/* error condition - what should we do here */
|
/* error condition - what should we do here */
|
||||||
HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
|
HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
|
||||||
@ -349,7 +351,7 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
// load a munched word list and build a hash table on the fly
|
// load a munched word list and build a hash table on the fly
|
||||||
int HashMgr::load_tables(const char * tpath, const char * key)
|
int HashMgr::load_tables(const char *dic_data, const size_t dic_len)
|
||||||
{
|
{
|
||||||
int al;
|
int al;
|
||||||
char * ap;
|
char * ap;
|
||||||
@ -359,7 +361,7 @@ int HashMgr::load_tables(const char * tpath, const char * key)
|
|||||||
char * ts;
|
char * ts;
|
||||||
|
|
||||||
// open dictionary file
|
// open dictionary file
|
||||||
FileMgr * dict = new FileMgr(tpath, key);
|
FileMgr * dict = new FileMgr(dic_data, dic_len);
|
||||||
if (dict == NULL) return 1;
|
if (dict == NULL) return 1;
|
||||||
|
|
||||||
// first read the first line of file to get hash table size */
|
// first read the first line of file to get hash table size */
|
||||||
@ -601,15 +603,15 @@ char * HashMgr::encode_flag(unsigned short f) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// read in aff file and set flag mode
|
// read in aff file and set flag mode
|
||||||
int HashMgr::load_config(const char * affpath, const char * key)
|
int HashMgr::load_config(const char *aff_data, const size_t aff_len)
|
||||||
{
|
{
|
||||||
char * line; // io buffers
|
char * line; // io buffers
|
||||||
int firstline = 1;
|
int firstline = 1;
|
||||||
|
|
||||||
// open the affix file
|
// open the affix file
|
||||||
FileMgr * afflst = new FileMgr(affpath, key);
|
FileMgr * afflst = new FileMgr(aff_data, aff_len);
|
||||||
if (!afflst) {
|
if (!afflst) {
|
||||||
HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
|
HUNSPELL_WARNING(stderr, "Error - could not open affix description file");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr
|
|||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
HashMgr(const char * tpath, const char * apath, const char * key = NULL);
|
HashMgr(const char *aff_data, const size_t aff_len, const char *dic_data, const size_t dic_len);
|
||||||
~HashMgr();
|
~HashMgr();
|
||||||
|
|
||||||
struct hentry * lookup(const char *) const;
|
struct hentry * lookup(const char *) const;
|
||||||
@ -54,10 +54,10 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
int get_clen_and_captype(const char * word, int wbl, int * captype);
|
int get_clen_and_captype(const char * word, int wbl, int * captype);
|
||||||
int load_tables(const char * tpath, const char * key);
|
int load_tables(const char *dic_data, const size_t dic_len);
|
||||||
int add_word(const char * word, int wbl, int wcl, unsigned short * ap,
|
int add_word(const char * word, int wbl, int wcl, unsigned short * ap,
|
||||||
int al, const char * desc, bool onlyupcase);
|
int al, const char * desc, bool onlyupcase);
|
||||||
int load_config(const char * affpath, const char * key);
|
int load_config(const char *aff_data, const size_t aff_len);
|
||||||
int parse_aliasf(char * line, FileMgr * af);
|
int parse_aliasf(char * line, FileMgr * af);
|
||||||
int add_hidden_capitalized_word(char * word, int wbl, int wcl,
|
int add_hidden_capitalized_word(char * word, int wbl, int wcl,
|
||||||
unsigned short * flags, int al, char * dp, int captype);
|
unsigned short * flags, int al, char * dp, int captype);
|
||||||
|
@ -6,28 +6,26 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "hunspell.hxx"
|
#include "hunspell.hxx"
|
||||||
#include "hunspell.h"
|
|
||||||
#ifndef MOZILLA_CLIENT
|
#ifndef MOZILLA_CLIENT
|
||||||
# include "config.h"
|
# include "config.h"
|
||||||
#endif
|
#endif
|
||||||
#include "csutil.hxx"
|
#include "csutil.hxx"
|
||||||
|
|
||||||
Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
|
Hunspell::Hunspell(const char *affix_data, const size_t aff_len, const char *dic_data, const size_t dic_len)
|
||||||
{
|
{
|
||||||
encoding = NULL;
|
encoding = NULL;
|
||||||
csconv = NULL;
|
csconv = NULL;
|
||||||
utf8 = 0;
|
utf8 = 0;
|
||||||
complexprefixes = 0;
|
complexprefixes = 0;
|
||||||
affixpath = mystrdup(affpath);
|
|
||||||
maxdic = 0;
|
maxdic = 0;
|
||||||
|
|
||||||
/* first set up the hash manager */
|
/* first set up the hash manager */
|
||||||
pHMgr[0] = new HashMgr(dpath, affpath, key);
|
pHMgr[0] = new HashMgr(affix_data, aff_len, dic_data, dic_len);
|
||||||
if (pHMgr[0]) maxdic = 1;
|
if (pHMgr[0]) maxdic = 1;
|
||||||
|
|
||||||
/* next set up the affix manager */
|
/* next set up the affix manager */
|
||||||
/* it needs access to the hash manager lookup methods */
|
/* it needs access to the hash manager lookup methods */
|
||||||
pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
|
pAMgr = new AffixMgr(affix_data, aff_len, pHMgr, &maxdic);
|
||||||
|
|
||||||
/* get the preferred try string and the dictionary */
|
/* get the preferred try string and the dictionary */
|
||||||
/* encoding from the Affix Manager for that dictionary */
|
/* encoding from the Affix Manager for that dictionary */
|
||||||
@ -59,16 +57,6 @@ Hunspell::~Hunspell()
|
|||||||
csconv= NULL;
|
csconv= NULL;
|
||||||
if (encoding) free(encoding);
|
if (encoding) free(encoding);
|
||||||
encoding = NULL;
|
encoding = NULL;
|
||||||
if (affixpath) free(affixpath);
|
|
||||||
affixpath = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// load extra dictionaries
|
|
||||||
int Hunspell::add_dic(const char * dpath, const char * key) {
|
|
||||||
if (maxdic == MAXDIC || !affixpath) return 1;
|
|
||||||
pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
|
|
||||||
if (pHMgr[maxdic]) maxdic++; else return 1;
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// make a copy of src at destination while removing all leading
|
// make a copy of src at destination while removing all leading
|
||||||
@ -1919,88 +1907,4 @@ char * Hunspell::morph_with_correction(const char * word)
|
|||||||
|
|
||||||
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
|
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
|
||||||
|
|
||||||
Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
|
|
||||||
{
|
|
||||||
return (Hunhandle*)(new Hunspell(affpath, dpath));
|
|
||||||
}
|
|
||||||
|
|
||||||
Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
|
|
||||||
const char * key)
|
|
||||||
{
|
|
||||||
return (Hunhandle*)(new Hunspell(affpath, dpath, key));
|
|
||||||
}
|
|
||||||
|
|
||||||
void Hunspell_destroy(Hunhandle *pHunspell)
|
|
||||||
{
|
|
||||||
delete (Hunspell*)(pHunspell);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_spell(Hunhandle *pHunspell, const char *word)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->spell(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
char *Hunspell_get_dic_encoding(Hunhandle *pHunspell)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->get_dic_encoding();
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->suggest(slst, word);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->analyze(slst, word);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->stem(slst, word);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->stem(slst, desc, n);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
|
|
||||||
const char * word2)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->generate(slst, word, word2);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word,
|
|
||||||
char** desc, int n)
|
|
||||||
{
|
|
||||||
return ((Hunspell*)pHunspell)->generate(slst, word, desc, n);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* functions for run-time modification of the dictionary */
|
|
||||||
|
|
||||||
/* add word to the run-time dictionary */
|
|
||||||
|
|
||||||
int Hunspell_add(Hunhandle *pHunspell, const char * word) {
|
|
||||||
return ((Hunspell*)pHunspell)->add(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* add word to the run-time dictionary with affix flags of
|
|
||||||
* the example (a dictionary word): Hunspell will recognize
|
|
||||||
* affixed forms of the new word, too.
|
|
||||||
*/
|
|
||||||
|
|
||||||
int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
|
|
||||||
const char * example) {
|
|
||||||
return ((Hunspell*)pHunspell)->add_with_affix(word, example);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* remove word from the run-time dictionary */
|
|
||||||
|
|
||||||
int Hunspell_remove(Hunhandle *pHunspell, const char * word) {
|
|
||||||
return ((Hunspell*)pHunspell)->remove(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Hunspell_free_list(Hunhandle *, char *** slst, int n) {
|
|
||||||
freelist(slst, n);
|
|
||||||
}
|
|
||||||
|
@ -1,95 +0,0 @@
|
|||||||
#ifndef _MYSPELLMGR_H_
|
|
||||||
#define _MYSPELLMGR_H_
|
|
||||||
|
|
||||||
#include "hunvisapi.h"
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef struct Hunhandle Hunhandle;
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED Hunhandle *Hunspell_create(const char * affpath, const char * dpath);
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
|
|
||||||
const char * key);
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED void Hunspell_destroy(Hunhandle *pHunspell);
|
|
||||||
|
|
||||||
/* spell(word) - spellcheck word
|
|
||||||
* output: 0 = bad word, not 0 = good word
|
|
||||||
*/
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_spell(Hunhandle *pHunspell, const char *);
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED char *Hunspell_get_dic_encoding(Hunhandle *pHunspell);
|
|
||||||
|
|
||||||
/* suggest(suggestions, word) - search suggestions
|
|
||||||
* input: pointer to an array of strings pointer and the (bad) word
|
|
||||||
* array of strings pointer (here *slst) may not be initialized
|
|
||||||
* output: number of suggestions in string array, and suggestions in
|
|
||||||
* a newly allocated array of strings (*slts will be NULL when number
|
|
||||||
* of suggestion equals 0.)
|
|
||||||
*/
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word);
|
|
||||||
|
|
||||||
/* morphological functions */
|
|
||||||
|
|
||||||
/* analyze(result, word) - morphological analysis of the word */
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word);
|
|
||||||
|
|
||||||
/* stem(result, word) - stemmer function */
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word);
|
|
||||||
|
|
||||||
/* stem(result, analysis, n) - get stems from a morph. analysis
|
|
||||||
* example:
|
|
||||||
* char ** result, result2;
|
|
||||||
* int n1 = Hunspell_analyze(result, "words");
|
|
||||||
* int n2 = Hunspell_stem2(result2, result, n1);
|
|
||||||
*/
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n);
|
|
||||||
|
|
||||||
/* generate(result, word, word2) - morphological generation by example(s) */
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
|
|
||||||
const char * word2);
|
|
||||||
|
|
||||||
/* generate(result, word, desc, n) - generation by morph. description(s)
|
|
||||||
* example:
|
|
||||||
* char ** result;
|
|
||||||
* char * affix = "is:plural"; // description depends from dictionaries, too
|
|
||||||
* int n = Hunspell_generate2(result, "word", &affix, 1);
|
|
||||||
* for (int i = 0; i < n; i++) printf("%s\n", result[i]);
|
|
||||||
*/
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word,
|
|
||||||
char** desc, int n);
|
|
||||||
|
|
||||||
/* functions for run-time modification of the dictionary */
|
|
||||||
|
|
||||||
/* add word to the run-time dictionary */
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_add(Hunhandle *pHunspell, const char * word);
|
|
||||||
|
|
||||||
/* add word to the run-time dictionary with affix flags of
|
|
||||||
* the example (a dictionary word): Hunspell will recognize
|
|
||||||
* affixed forms of the new word, too.
|
|
||||||
*/
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, const char * example);
|
|
||||||
|
|
||||||
/* remove word from the run-time dictionary */
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED int Hunspell_remove(Hunhandle *pHunspell, const char * word);
|
|
||||||
|
|
||||||
/* free suggestion lists */
|
|
||||||
|
|
||||||
LIBHUNSPELL_DLL_EXPORTED void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
@ -23,7 +23,6 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell
|
|||||||
HashMgr* pHMgr[MAXDIC];
|
HashMgr* pHMgr[MAXDIC];
|
||||||
int maxdic;
|
int maxdic;
|
||||||
SuggestMgr* pSMgr;
|
SuggestMgr* pSMgr;
|
||||||
char * affixpath;
|
|
||||||
char * encoding;
|
char * encoding;
|
||||||
struct cs_info * csconv;
|
struct cs_info * csconv;
|
||||||
int langnum;
|
int langnum;
|
||||||
@ -34,15 +33,12 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
/* Hunspell(aff, dic) - constructor of Hunspell class
|
/* Hunspell(aff, dic) - constructor of Hunspell class
|
||||||
* input: path of affix file and dictionary file
|
* input: The affix and dictionary data as bytes
|
||||||
*/
|
*/
|
||||||
|
|
||||||
Hunspell(const char * affpath, const char * dpath, const char * key = NULL);
|
Hunspell(const char *affix_data, const size_t affix_len, const char *dic_data, const size_t dic_len);
|
||||||
~Hunspell();
|
~Hunspell();
|
||||||
|
|
||||||
/* load extra dictionaries (only dic files) */
|
|
||||||
int add_dic(const char * dpath, const char * key = NULL);
|
|
||||||
|
|
||||||
/* spell(word) - spellcheck word
|
/* spell(word) - spellcheck word
|
||||||
* output: 0 = bad word, not 0 = good word
|
* output: 0 = bad word, not 0 = good word
|
||||||
*
|
*
|
||||||
|
@ -1,193 +0,0 @@
|
|||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#include "hunzip.hxx"
|
|
||||||
|
|
||||||
#define CODELEN 65536
|
|
||||||
#define BASEBITREC 5000
|
|
||||||
|
|
||||||
#define UNCOMPRESSED '\002'
|
|
||||||
#define MAGIC "hz0"
|
|
||||||
#define MAGIC_ENCRYPT "hz1"
|
|
||||||
#define MAGICLEN (sizeof(MAGIC) - 1)
|
|
||||||
|
|
||||||
int Hunzip::fail(const char * err, const char * par) {
|
|
||||||
fprintf(stderr, err, par);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
Hunzip::Hunzip(const char * file, const char * key) {
|
|
||||||
bufsiz = 0;
|
|
||||||
lastbit = 0;
|
|
||||||
inc = 0;
|
|
||||||
outc = 0;
|
|
||||||
dec = NULL;
|
|
||||||
fin = NULL;
|
|
||||||
filename = (char *) malloc(strlen(file) + 1);
|
|
||||||
if (filename) strcpy(filename, file);
|
|
||||||
if (getcode(key) == -1) bufsiz = -1;
|
|
||||||
else bufsiz = getbuf();
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunzip::getcode(const char * key) {
|
|
||||||
unsigned char c[2];
|
|
||||||
int i, j, n, p;
|
|
||||||
int allocatedbit = BASEBITREC;
|
|
||||||
const char * enc = key;
|
|
||||||
|
|
||||||
if (!filename) return -1;
|
|
||||||
|
|
||||||
fin = fopen(filename, "rb");
|
|
||||||
if (!fin) return -1;
|
|
||||||
|
|
||||||
// read magic number
|
|
||||||
if ((fread(in, 1, 3, fin) < MAGICLEN)
|
|
||||||
|| !(strncmp(MAGIC, in, MAGICLEN) == 0 ||
|
|
||||||
strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) {
|
|
||||||
return fail(MSG_FORMAT, filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
// check encryption
|
|
||||||
if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) {
|
|
||||||
unsigned char cs;
|
|
||||||
if (!key) return fail(MSG_KEY, filename);
|
|
||||||
if (fread(&c, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename);
|
|
||||||
for (cs = 0; *enc; enc++) cs ^= *enc;
|
|
||||||
if (cs != c[0]) return fail(MSG_KEY, filename);
|
|
||||||
enc = key;
|
|
||||||
} else key = NULL;
|
|
||||||
|
|
||||||
// read record count
|
|
||||||
if (fread(&c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename);
|
|
||||||
|
|
||||||
if (key) {
|
|
||||||
c[0] ^= *enc;
|
|
||||||
if (*(++enc) == '\0') enc = key;
|
|
||||||
c[1] ^= *enc;
|
|
||||||
}
|
|
||||||
|
|
||||||
n = ((int) c[0] << 8) + c[1];
|
|
||||||
dec = (struct bit *) malloc(BASEBITREC * sizeof(struct bit));
|
|
||||||
if (!dec) return fail(MSG_MEMORY, filename);
|
|
||||||
dec[0].v[0] = 0;
|
|
||||||
dec[0].v[1] = 0;
|
|
||||||
|
|
||||||
// read codes
|
|
||||||
for (i = 0; i < n; i++) {
|
|
||||||
unsigned char l;
|
|
||||||
if (fread(c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename);
|
|
||||||
if (key) {
|
|
||||||
if (*(++enc) == '\0') enc = key;
|
|
||||||
c[0] ^= *enc;
|
|
||||||
if (*(++enc) == '\0') enc = key;
|
|
||||||
c[1] ^= *enc;
|
|
||||||
}
|
|
||||||
if (fread(&l, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename);
|
|
||||||
if (key) {
|
|
||||||
if (*(++enc) == '\0') enc = key;
|
|
||||||
l ^= *enc;
|
|
||||||
}
|
|
||||||
if (fread(in, 1, l/8+1, fin) < (size_t) l/8+1) return fail(MSG_FORMAT, filename);
|
|
||||||
if (key) for (j = 0; j <= l/8; j++) {
|
|
||||||
if (*(++enc) == '\0') enc = key;
|
|
||||||
in[j] ^= *enc;
|
|
||||||
}
|
|
||||||
p = 0;
|
|
||||||
for (j = 0; j < l; j++) {
|
|
||||||
int b = (in[j/8] & (1 << (7 - (j % 8)))) ? 1 : 0;
|
|
||||||
int oldp = p;
|
|
||||||
p = dec[p].v[b];
|
|
||||||
if (p == 0) {
|
|
||||||
lastbit++;
|
|
||||||
if (lastbit == allocatedbit) {
|
|
||||||
allocatedbit += BASEBITREC;
|
|
||||||
dec = (struct bit *) realloc(dec, allocatedbit * sizeof(struct bit));
|
|
||||||
}
|
|
||||||
dec[lastbit].v[0] = 0;
|
|
||||||
dec[lastbit].v[1] = 0;
|
|
||||||
dec[oldp].v[b] = lastbit;
|
|
||||||
p = lastbit;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
dec[p].c[0] = c[0];
|
|
||||||
dec[p].c[1] = c[1];
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
Hunzip::~Hunzip()
|
|
||||||
{
|
|
||||||
if (dec) free(dec);
|
|
||||||
if (fin) fclose(fin);
|
|
||||||
if (filename) free(filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Hunzip::getbuf() {
|
|
||||||
int p = 0;
|
|
||||||
int o = 0;
|
|
||||||
do {
|
|
||||||
if (inc == 0) inbits = fread(in, 1, BUFSIZE, fin) * 8;
|
|
||||||
for (; inc < inbits; inc++) {
|
|
||||||
int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0;
|
|
||||||
int oldp = p;
|
|
||||||
p = dec[p].v[b];
|
|
||||||
if (p == 0) {
|
|
||||||
if (oldp == lastbit) {
|
|
||||||
fclose(fin);
|
|
||||||
fin = NULL;
|
|
||||||
// add last odd byte
|
|
||||||
if (dec[lastbit].c[0]) out[o++] = dec[lastbit].c[1];
|
|
||||||
return o;
|
|
||||||
}
|
|
||||||
out[o++] = dec[oldp].c[0];
|
|
||||||
out[o++] = dec[oldp].c[1];
|
|
||||||
if (o == BUFSIZE) return o;
|
|
||||||
p = dec[p].v[b];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
inc = 0;
|
|
||||||
} while (inbits == BUFSIZE * 8);
|
|
||||||
return fail(MSG_FORMAT, filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
const char * Hunzip::getline() {
|
|
||||||
char linebuf[BUFSIZE];
|
|
||||||
int l = 0, eol = 0, left = 0, right = 0;
|
|
||||||
if (bufsiz == -1) return NULL;
|
|
||||||
while (l < bufsiz && !eol) {
|
|
||||||
linebuf[l++] = out[outc];
|
|
||||||
switch (out[outc]) {
|
|
||||||
case '\t': break;
|
|
||||||
case 31: { // escape
|
|
||||||
if (++outc == bufsiz) {
|
|
||||||
bufsiz = getbuf();
|
|
||||||
outc = 0;
|
|
||||||
}
|
|
||||||
linebuf[l - 1] = out[outc];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ' ': break;
|
|
||||||
default: if (((unsigned char) out[outc]) < 47) {
|
|
||||||
if (out[outc] > 32) {
|
|
||||||
right = out[outc] - 31;
|
|
||||||
if (++outc == bufsiz) {
|
|
||||||
bufsiz = getbuf();
|
|
||||||
outc = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (out[outc] == 30) left = 9; else left = out[outc];
|
|
||||||
linebuf[l-1] = '\n';
|
|
||||||
eol = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (++outc == bufsiz) {
|
|
||||||
outc = 0;
|
|
||||||
bufsiz = fin ? getbuf(): -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (right) strcpy(linebuf + l - 1, line + strlen(line) - right - 1);
|
|
||||||
else linebuf[l] = '\0';
|
|
||||||
strcpy(line + left, linebuf);
|
|
||||||
return line;
|
|
||||||
}
|
|
@ -1,45 +0,0 @@
|
|||||||
/* hunzip: file decompression for sorted dictionaries with optional encryption,
|
|
||||||
* algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
|
|
||||||
|
|
||||||
#ifndef _HUNZIP_HXX_
|
|
||||||
#define _HUNZIP_HXX_
|
|
||||||
|
|
||||||
#include "hunvisapi.h"
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#define BUFSIZE 65536
|
|
||||||
#define HZIP_EXTENSION ".hz"
|
|
||||||
|
|
||||||
#define MSG_OPEN "error: %s: cannot open\n"
|
|
||||||
#define MSG_FORMAT "error: %s: not in hzip format\n"
|
|
||||||
#define MSG_MEMORY "error: %s: missing memory\n"
|
|
||||||
#define MSG_KEY "error: %s: missing or bad password\n"
|
|
||||||
|
|
||||||
struct bit {
|
|
||||||
unsigned char c[2];
|
|
||||||
int v[2];
|
|
||||||
};
|
|
||||||
|
|
||||||
class LIBHUNSPELL_DLL_EXPORTED Hunzip
|
|
||||||
{
|
|
||||||
|
|
||||||
protected:
|
|
||||||
char * filename;
|
|
||||||
FILE * fin;
|
|
||||||
int bufsiz, lastbit, inc, inbits, outc;
|
|
||||||
struct bit * dec; // code table
|
|
||||||
char in[BUFSIZE]; // input buffer
|
|
||||||
char out[BUFSIZE + 1]; // Huffman-decoded buffer
|
|
||||||
char line[BUFSIZE + 50]; // decoded line
|
|
||||||
int getcode(const char * key);
|
|
||||||
int getbuf();
|
|
||||||
int fail(const char * err, const char * par);
|
|
||||||
|
|
||||||
public:
|
|
||||||
Hunzip(const char * filename, const char * key = NULL);
|
|
||||||
~Hunzip();
|
|
||||||
const char * getline();
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
Loading…
x
Reference in New Issue
Block a user