mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-02-07 11:33:30 -05:00
111 lines
3.7 KiB
C
111 lines
3.7 KiB
C
/*
 * unicode_names.c
 * Copyright (C) 2018 Kovid Goyal <kovid at kovidgoyal.net>
 *
 * Distributed under terms of the GPL3 license.
 */
|
|
|
|
#include "names.h"
|
|
|
|
static inline void
|
|
add_matches(const word_trie *wt, char_type *codepoints, size_t *pos, const size_t sz) {
|
|
size_t num = mark_groups[wt->match_offset];
|
|
for (size_t i = wt->match_offset + 1; i < wt->match_offset + 1 + num && *pos < sz; i++, (*pos)++) {
|
|
codepoints[*pos] = mark_to_cp[mark_groups[i]];
|
|
}
|
|
}
|
|
|
|
static void
|
|
process_trie_node(const word_trie *wt, char_type *codepoints, size_t *pos, const size_t sz) {
|
|
if (wt->match_offset) add_matches(wt, codepoints, pos, sz);
|
|
size_t num_children = children_array[wt->children_offset];
|
|
if (!num_children) return;
|
|
for (size_t c = wt->children_offset + 1; c < wt->children_offset + 1 + num_children; c++) {
|
|
if (*pos > sz) return;
|
|
uint32_t x = children_array[c];
|
|
process_trie_node(&all_trie_nodes[x >> 8], codepoints, pos, sz);
|
|
}
|
|
}
|
|
|
|
static inline PyObject*
|
|
codepoints_for_word(const char *word, size_t len) {
|
|
const word_trie *wt = all_trie_nodes;
|
|
for (size_t i = 0; i < len; i++) {
|
|
unsigned char ch = word[i];
|
|
size_t num_children = children_array[wt->children_offset];
|
|
if (!num_children) return PyFrozenSet_New(NULL);
|
|
bool found = false;
|
|
for (size_t c = wt->children_offset + 1; c < wt->children_offset + 1 + num_children; c++) {
|
|
uint32_t x = children_array[c];
|
|
if ((x & 0xff) == ch) {
|
|
found = true;
|
|
wt = &all_trie_nodes[x >> 8];
|
|
break;
|
|
}
|
|
}
|
|
if (!found) return PyFrozenSet_New(NULL);
|
|
}
|
|
static char_type codepoints[1024];
|
|
size_t cpos = 0;
|
|
process_trie_node(wt, codepoints, &cpos, arraysz(codepoints));
|
|
PyObject *ans = PyFrozenSet_New(NULL); if (ans == NULL) return NULL;
|
|
for (size_t i = 0; i < cpos; i++) {
|
|
PyObject *t = PyLong_FromUnsignedLong(codepoints[i]); if (t == NULL) { Py_DECREF(ans); return NULL; }
|
|
int ret = PySet_Add(ans, t); Py_DECREF(t); if (ret != 0) { Py_DECREF(ans); return NULL; }
|
|
}
|
|
return ans;
|
|
}
|
|
|
|
/*
 * Python entry point for codepoints_for_word().
 *
 * Parses a single string argument ("s" format) and passes it, together with
 * its strlen(), to codepoints_for_word(). Returns NULL when argument parsing
 * fails; otherwise returns whatever codepoints_for_word() returns.
 */
static PyObject*
cfw(PyObject *self UNUSED, PyObject *args) {
    const char *text = NULL;

    if (PyArg_ParseTuple(args, "s", &text)) {
        return codepoints_for_word(text, strlen(text));
    }
    return NULL;
}
|
|
|
|
/*
 * Python entry point for name_for_codepoint().
 *
 * Parses a single unsigned integer argument ("I" format) and looks it up with
 * name_for_codepoint(). Returns the name as a Python string, None when the
 * lookup yields NULL, or NULL when argument parsing fails.
 */
static PyObject*
nfc(PyObject *self UNUSED, PyObject *args) {
    unsigned int codepoint;

    if (!PyArg_ParseTuple(args, "I", &codepoint)) {
        return NULL;
    }

    const char *name = name_for_codepoint(codepoint);
    if (name != NULL) {
        return PyUnicode_FromString(name);
    }
    Py_RETURN_NONE;
}
|
|
|
|
static PyMethodDef unicode_names_methods[] = {
|
|
{"codepoints_for_word", (PyCFunction)cfw, METH_VARARGS,
|
|
"Return a set of integer codepoints for where each codepoint's name "
|
|
"contains ``word``,"},
|
|
{"name_for_codepoint", (PyCFunction)nfc, METH_VARARGS,
|
|
"Returns the given codepoint's name"},
|
|
{NULL, NULL, 0, NULL} /* Sentinel */
|
|
};
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
#define INITERROR return NULL
|
|
static struct PyModuleDef unicode_names_module = {
|
|
/* m_base */ PyModuleDef_HEAD_INIT,
|
|
/* m_name */ "unicode_names",
|
|
/* m_doc */ "A library to assist with selecting special characters",
|
|
/* m_size */ -1,
|
|
/* m_methods */ unicode_names_methods,
|
|
/* m_slots */ 0,
|
|
/* m_traverse */ 0,
|
|
/* m_clear */ 0,
|
|
/* m_free */ 0,
|
|
};
|
|
|
|
CALIBRE_MODINIT_FUNC PyInit_unicode_names(void) {
|
|
#else
|
|
#define INITERROR return
|
|
CALIBRE_MODINIT_FUNC initunicode_names(void) {
|
|
#endif
|
|
// Create the module
|
|
#if PY_MAJOR_VERSION >= 3
|
|
PyObject *mod = PyModule_Create(&unicode_names_module);
|
|
return mod;
|
|
#else
|
|
Py_InitModule3("unicode_names", unicode_names_methods, "");
|
|
#endif
|
|
}
|