Source code for nlpir.native.ictclas

# coding=utf-8
from nlpir.native.nlpir_base import NLPIRBase
from ctypes import c_bool, c_char, c_char_p, c_double, c_int, c_uint, POINTER, Structure, byref
import typing


[docs]class ResultT(Structure): """The NLPIR ``result_t`` structure. copy from pynlpir""" _fields_ = [ # The start position of the word in the source Chinese text string. ('start', c_int), # The detected word's length. ('length', c_int), # A string representing the word's part of speech. ('sPOS', c_char * 40), ('iPOS', c_int), ('word_ID', c_int), # If the word is found in the user's dictionary. ('word_type', c_int), # The weight of the detected word. ('weight', c_int) ]
[docs]class ICTCLAS(NLPIRBase): """ A dynamic link library native class for Chinese Segmentation """ POS_MAP_NUMBER = 4 # add by qp 2008.11.25 ICT_POS_MAP_FIRST = 1 # 计算所一级标注集 ICT_POS_MAP_SECOND = 0 # 计算所二级标注集 PKU_POS_MAP_SECOND = 2 # 北大二级标注集 PKU_POS_MAP_FIRST = 3 # 北大一级标注集 POS_SIZE = 40 @property def dll_name(self) -> str: return "NLPIR"
[docs] @NLPIRBase.byte_str_transform def init_lib(self, data_path: str, encode: int, license_code: str) -> int: """ Call **NLPIR_Init** :param str data_path: :param int encode: :param str license_code: :return: 1 success 0 fail """ return self.get_func('NLPIR_Init', [c_char_p, c_int, c_char_p], c_int)(data_path, encode, license_code)
[docs] def exit_lib(self) -> bool: """ Call **NLPIR_Exit** :return: exit success or not """ return self.get_func('NLPIR_Exit', restype=c_bool)()
[docs] @NLPIRBase.byte_str_transform def paragraph_process(self, paragraph: str, pos_tagged: int = 1) -> str: """ Call **NLPIR_ParagraphProcessing** Chinese word segment, segment paragraph to a string :param str paragraph: the string want to be segmented :param int pos_tagged: show the pos tag or not 1-> True, 0-> False :return: segmented string """ return self.get_func('NLPIR_ParagraphProcess', [c_char_p, c_int], c_char_p)(paragraph, pos_tagged)
[docs] @NLPIRBase.byte_str_transform def paragraph_process_a(self, paragraph: str, user_dict: bool = True) -> typing.Tuple[ResultT, int]: """ Call **ParagraphProcessingA** Segment paragraph to an Array of ResultT, get more detail info :param str paragraph: the string want to be segmented :param bool user_dict: use user dictionary or not :return: a result of segment, an array of ResultT and the length of the ResultT """ self.logger.warning("not recommended, use paragraph_process instead") result_count = c_int() result = self.get_func('NLPIR_ParagraphProcessA', [c_char_p, POINTER(c_int), c_bool], POINTER(ResultT))( paragraph, byref(result_count), user_dict ) return result, result_count.value
[docs] @NLPIRBase.byte_str_transform def get_paragraph_process_a_word_count(self, paragraph: str) -> int: raise NotImplementedError("Not recommended, use paragraph_process")
[docs] @NLPIRBase.byte_str_transform def paragraph_process_aw(self, count: int, result: ResultT) -> None: raise NotImplementedError("Not recommended, use paragraph_process")
[docs] @NLPIRBase.byte_str_transform def file_process(self, source_filename: str, result_filename: str, pos_tagged: int = 1) -> float: """ Call **NLPIR_FileProcess** Segment a text file and save to a file. :param str source_filename: the path of a text file that want to be segmented :param str result_filename: the path to save the result of segmentation :param int pos_tagged: show the pos tag or not 1->True, 0->False """ return self.get_func('NLPIR_FileProcess', [c_char_p, c_char_p, c_int], c_double)( source_filename, result_filename, pos_tagged )
[docs] @NLPIRBase.byte_str_transform def import_user_dict(self, filename: str, overwrite: bool = False) -> int: """ Call **NLPIR_ImportUserDict** Import a user dict to the system, the format of the dict file:: word1 pos_tag word2 pos_tag If you import a user dict to the system, the user dict will save to the system (in Data directory). You cannot delete the word in the user dict from the system use :func:`clean_user_word` or :func:`del_usr_word`. **TODO** add more comment for clean the user dict and add the function to the high-level method :param str filename: the path of user dict file :param bool overwrite: overwrite the current user dict or not :return: import success or not 1->True 2->False """ return self.get_func('NLPIR_ImportUserDict', [c_char_p, c_bool], c_uint)(filename, overwrite)
[docs] @NLPIRBase.byte_str_transform def add_user_word(self, word: str) -> int: """ Call **NLPIR_AddUserWord** Add a word to the user dictionary ,example:: 单词 词性 or:: 单词 (default n) The added word only add in memory and will not affect the user dict, you can use :func:`clean_user_word` or :func:`del_usr_word` to delete the word or all the words in memory. If you want to save to the user dict ,use :func:`save_the_usr_dic` to save to the *Data* directory. :param str word: :return: 1,true ; 0,false """ return self.get_func('NLPIR_AddUserWord', [c_char_p], c_int)(word)
[docs] @NLPIRBase.byte_str_transform def clean_user_word(self) -> int: """ Call **NLPIR_CleanUserWord** Clean all temporary added user words, more info see :func:`add_user_word` TODO figure out the return value :return: 1,true ; 0,false """ return self.get_func('NLPIR_CleanUserWord', None, c_int)()
[docs] @NLPIRBase.byte_str_transform def clean_current_user_word(self) -> int: """ Call **NLPIR_CleanCurrentUserWord** Clean all Current temporary added user words and restore previous stored data ** Now Only for win and linux x64 ** :return: 1,true; 2,false """ return self.get_func('NLPIR_CleanCurrentUserWord', None, c_int)()
[docs] @NLPIRBase.byte_str_transform def save_the_usr_dic(self) -> int: """ Call **NLPIR_SaveTheUsrDic** Save in-memory dict to user dict, more info see :func:`add_user_word` :return: 1,true; 2,false """ return self.get_func('NLPIR_SaveTheUsrDic', None, c_int)()
[docs] @NLPIRBase.byte_str_transform def del_usr_word(self, word: str) -> int: """ Call **NLPIR_DelUsrWord** Delete a word from the user dictionary, more info see :func:`add_user_word` :param str word: the word to be delete :return: -1, the word not exist in the user dictionary; else, the handle of the word deleted """ return self.get_func('NLPIR_DelUsrWord', [c_char_p], c_int)(word)
[docs] @NLPIRBase.byte_str_transform def get_uni_prob(self, word) -> float: """ Call **NLPIR_GetUniProb** Get Unigram Probability :param str word: input word :return: The unitary probability of a word. """ return self.get_func("NLPIR_GetUniProb", [c_char_p], c_double)(word)
[docs] @NLPIRBase.byte_str_transform def is_word(self, word: str) -> int: """ Call **NLPIR_IsWord** Judge whether the word is included in the core dictionary :param str word: input word :return: 1: exists; 0: no exists """ return self.get_func("NLPIR_IsWord", [c_char_p], c_int)(word)
[docs] @NLPIRBase.byte_str_transform def is_user_word(self, word: str, is_ascii: bool = False) -> int: """ Call **NLPIR_IsUserWord** Judge whether the word is included in the user-defined dictionary :param str word: input word :param bool is_ascii: is ascii encode or not :return: 1: exists; 0: no exists """ return self.get_func("NLPIR_IsUserWord", [c_char_p], c_int)(word, is_ascii)
[docs] @NLPIRBase.byte_str_transform def get_word_pos(self, word: str) -> str: """ Call **NLPIR_GetWordPOS** Get the word Part-Of-Speech information :param str word: input word :return: pos tagging """ return self.get_func("NLPIR_GetWordPOS", [c_char_p], c_char_p)(word)
[docs] def set_pos_map(self, pos_map: int) -> int: """ Call **NLPIR_SetPOSmap** Select which pos map will use: - :attr:`ICT_POS_MAP_FIRST` 计算所一级标注集 - :attr:`ICT_POS_MAP_SECOND` 计算所二级标注集 - :attr:`PKU_POS_MAP_SECOND` 北大二级标注集 - :attr:`PKU_POS_MAP_FIRST` 北大一级标注集 Default is :attr:`ICT_POS_MAP_SECOND` :param int pos_map: :return: 0, failed; else, success """ return self.get_func("NLPIR_SetPOSmap")(pos_map)
[docs] @NLPIRBase.byte_str_transform def finer_segment(self, line: str) -> str: """ Call **NLPIR_FinerSegment** 当前的切分结果过大时,如“中华人民共和国”, 需要执行该函数,将切分结果细分为“中华 人民 共和国” 细分粒度最大为三个汉字,如果不能细分,则返回为空字符串 :param str line: string need to be segmented :return: segmented string, return null string if line cannot be segmented """ return self.get_func("NLPIR_FinerSegment", [c_char_p], c_char_p)(line)
[docs] @NLPIRBase.byte_str_transform def get_eng_word_origin(self, word: str) -> str: """ Call **NLPIR_GetEngWordOrign** 获取各类英文单词的原型,考虑了过去分词、单复数等情况:: driven->drive drives->drive drove-->drive :param str word: word to be stemmed :return: the stemmed word """ return self.get_func("NLPIR_GetEngWordOrign", [c_char_p], c_char_p)(word)
[docs] @NLPIRBase.byte_str_transform def word_freq_stat(self, text: str, stop_word_remove: bool = True) -> str: """ Call **NLPIR_WordFreqStat** 获取输入文本的词,词性,频统计结果,按照词频大小排序 :param str text: 输入的文本内容 :param bool stop_word_remove: true-去除停用词 false-不去除停用词 :return: 返回的是词频统计结果形式如下 :: 张华平/nr/10#博士/n/9#分词/n/8 """ return self.get_func("NLPIR_WordFreqStat", [c_char_p, c_bool], c_char_p)(text, stop_word_remove)
[docs] @NLPIRBase.byte_str_transform def file_word_freq_stat(self, filename: str, stop_word_remove: bool = True) -> str: """ Call **NLPIR_FileWordFreqStat** Same as :func:`word_freq_stat` :param str filename: path of text file :param bool stop_word_remove: remove stop words or not :return: same as :func:`word_freq_stat` """ return self.get_func("NLPIR_FileWordFreqStat", [c_char_p, c_bool], c_char_p)(filename, stop_word_remove)
[docs] @NLPIRBase.byte_str_transform def get_last_error_msg(self) -> str: """ Call **NLPIR_GetLastErrorMsg** :return: error message """ return self.get_func("NLPIR_GetLastErrorMsg", None, c_char_p)()
[docs] @NLPIRBase.byte_str_transform def tokenizer_for_ir(self, sline: str) -> str: """ Call **NLPIR_Tokenizer4IR** 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词 :param str sline: The source paragraph :param bool stop_word_remove: remove stop words or not :return: same as :func:`word_freq_stat` """ return self.get_func("NLPIR_Tokenizer4IR", [c_char_p, c_bool], c_char_p)(sline)