如何在 C 代码中操作 Python 的集合(set)对象?

how to tackle the python set object operations in C?

我想在Python中引入C代码,C代码有如下语句:

#include<Python.h>

PyObject *getFeature(wchar_t *text,
                     PyObject *unigram);
// where the unigram is a Set Object with type 'PySetObject'
#include<test.h>

PyObject *getFeature(wchar_t *text,
                     PyObject *unigram)
{
    int ret = -1;
    PyObject *featureList = PyList_New(0);

    PyObject *curString = PyUnicode_FromWideChar(text, 2);
    ret = PySet_Contains(unigram, curString);
    printf("## res: `nc`, %d.\n", ret);
    ret = PyList_Append(featureList, curString);

    return featureList;
}

然后我编译它,得到一个名为 libtest.so 的共享库。这样我就可以在 Python 代码中通过 ctypes 加载这个 .so 文件,如下所示:

import ctypes

dir_path = 'path/to/the/libtest.so'
feature_extractor = ctypes.cdll.LoadLibrary(
    os.path.join(dir_path, 'libtest.so'))
get_feature_c = feature_extractor.getFeature
get_feature_c.argtypes = [
    ctypes.c_wchar_p, ctypes.py_object]
get_feature_c.restype = ctypes.py_object

unigram = {'据','nc', 'kls'}
print(hash('据'))
print(hash('nc'))
print(hash('kls'))
res = get_feature_c('nc', unigram)


执行这个test.py文件,我会遇到以下错误:

6875335301337518411
6875335301337518411
-5567445891360670268
Segmentation fault

我认为这个错误是由于不同的字符串 '据' 和 'nc' 具有相同的哈希值 6875335301337518411 而发生冲突引起的。Python 使用二级哈希表来解决哈希值相同的字符串之间的冲突。

那么如何解决这个问题,将二级冲突哈希表导入到C代码中呢?

哈希值相同只是一个误导性的线索,与崩溃无关。真正的问题在于没有使用 PyDLL:通过 ctypes.CDLL(cdll)加载的库在调用函数时会释放 GIL,而调用 CPython API 时必须持有 GIL。改用 ctypes.PyDLL 加载,调用期间就不会释放 GIL。

test.c

#include <Python.h>

#ifdef _WIN32
#   define API __declspec(dllexport)
#else
#   define API
#endif

API PyObject *getFeature(wchar_t *text, PyObject *unigram)
{
    int ret = -1;
    PyObject *featureList = PyList_New(0);

    PyObject *curString = PyUnicode_FromWideChar(text, 2);
    ret = PySet_Contains(unigram, curString);
    printf("## res: `nc`, %d.\n", ret);
    ret = PyList_Append(featureList, curString);
    Py_DECREF(curString); // fix reference leak
    return featureList;
}

test.py

import ctypes as ct

# PyDLL (unlike CDLL) keeps the GIL held while the foreign function runs,
# which is required because getFeature calls into the CPython API.
dll = ct.PyDLL('./test')

# Describe the C signature once on the function handle:
# (wchar_t *, PyObject *) -> PyObject *
get_feature = dll.getFeature
get_feature.restype = ct.py_object
get_feature.argtypes = [ct.c_wchar_p, ct.py_object]

unigram = {'据', 'nc', 'kls'}

# Show the hash of every member, in the same order as before.
for token in ('据', 'nc', 'kls'):
    print(hash(token))

print(get_feature('nc', unigram))

输出:

5393181648594783828
5393181648594783828
-5015907635941537187
## res: `nc`, 1.
['nc']