# Source code for nlpir.native.new_word_finder

# coding=utf-8
from nlpir.native.nlpir_base import NLPIRBase
from nlpir.native import nlpir_base
from ctypes import c_bool, c_char_p, c_int, c_uint, c_ulong


class NewWordFinder(NLPIRBase):
    """
    Thin Python wrappers around the ``NWF_*`` functions of the native NewWordFinder library.

    Every method resolves its native function through :func:`get_func`, passing the
    ctypes argument types and return type declared in the method body, and calls it
    immediately with the given arguments.
    """

    @property
    def dll_name(self) -> str:
        """Base name of the native library wrapped by this class."""
        return "NewWordFinder"

    @NLPIRBase.byte_str_transform
    def init_lib(self, data_path: str, encode: int, license_code: str) -> int:
        """
        Call **NWF_Init**

        :param str data_path: data directory passed to the native initializer
        :param int encode: encoding code passed to the native initializer
        :param str license_code: license code passed to the native initializer
        :return: 1 success 0 fail
        """
        return self.get_func("NWF_Init", [c_char_p, c_int, c_char_p], c_int)(data_path, encode, license_code)

    @NLPIRBase.byte_str_transform
    def exit_lib(self) -> bool:
        """
        Call **NWF_Exit**

        :return: exit success or not
        """
        # The native function takes no arguments, so no argtypes are declared.
        # ``None`` is used (as in the other zero-argument wrappers of this class)
        # because ``[None]`` is not a valid ctypes ``argtypes`` sequence.
        return self.get_func("NWF_Exit", None, c_bool)()

    @NLPIRBase.byte_str_transform
    def get_new_words(
            self,
            line: str,
            max_key_limit: int = 50,
            format_opt: int = nlpir_base.OUTPUT_FORMAT_SHARP
    ) -> str:
        """
        Call **NWF_GetNewWords**

        Extract new words from a paragraph of text.

        :param str line: the input paragraph. The input must not be very large
            (less than 60MB); for large inputs the batch functions
            (:func:`batch_start` and following) are recommended instead
        :param int max_key_limit: maximum number of new words to return, up to 50
        :param int format_opt: output format option, there are three options:

            - :data:`nlpir.native.nlpir_base.OUTPUT_FORMAT_SHARP` get string split by sharp
            - :data:`nlpir.native.nlpir_base.OUTPUT_FORMAT_JSON` get json format
            - :data:`nlpir.native.nlpir_base.OUTPUT_FORMAT_EXCEL` get csv format

        :return: new words list

        ::

            Sharp format, with weight:

            "科学发展观/23.80/1#屌丝/12.20/2"

            JSON format:

            [
               {
                  "freq" : 152,
                  "pos" : "n_new",
                  "weight" : 77.884208081632579,
                  "word" : "公允价值"
               },
               {
                  "freq" : 71,
                  "pos" : "n_new",
                  "weight" : 75.102183562405372,
                  "word" : "长期股权投资"
               }
            ]
        """
        return self.get_func("NWF_GetNewWords", [c_char_p, c_int, c_int], c_char_p)(line, max_key_limit, format_opt)

    @NLPIRBase.byte_str_transform
    def get_file_new_words(
            self,
            file_name: str,
            max_key_limit: int = 50,
            format_opt: int = nlpir_base.OUTPUT_FORMAT_SHARP
    ) -> str:
        """
        Call **NWF_GetFileNewWords**

        Extract new words from a text file.

        :param str file_name: the path of the text file
        :param int max_key_limit: maximum number of new words to return
        :param int format_opt: same as :func:`get_new_words`
        :return: same as :func:`get_new_words`
        """
        return self.get_func("NWF_GetFileNewWords", [c_char_p, c_int, c_int], c_char_p)(
            file_name, max_key_limit, format_opt
        )

    # The functions below belong to the dedicated new-word discovery process
    # introduced in the 2013 version of the library. It is generally recommended
    # to run them offline rather than as part of online processing.
    # Once new-word identification is finished, the result can be imported into
    # the word segmentation system (see result2user_dict).
    # The original documentation refers to these functions by the prefix NWF_NWI
    # (New Word Identification); in this wrapper they are called as NWF_Batch_*.

    @NLPIRBase.byte_str_transform
    def batch_start(self) -> int:
        """
        Call **NWF_Batch_Start**

        Start new-word identification; intended for very large amounts of data.

        :return: integer status of the native call (declared as ``c_int``);
            non-zero means success, 0 means fail
        """
        # Zero-argument native function: declare no argtypes (``[None]`` is not
        # a valid ctypes ``argtypes`` sequence).
        return self.get_func("NWF_Batch_Start", None, c_int)()

    @NLPIRBase.byte_str_transform
    def batch_addfile(self, filename: str) -> int:
        """
        Call **NWF_Batch_AddFile**

        Add a text file to be scanned for new words. Only valid after
        :func:`batch_start` has been called.

        :param str filename: the path of the file
        :return: 1 success 0 fail
        """
        return self.get_func("NWF_Batch_AddFile", [c_char_p], c_ulong)(filename)

    @NLPIRBase.byte_str_transform
    def batch_addmen(self, text: str) -> int:
        """
        Call **NWF_Batch_AddMem**

        Add an in-memory piece of text to be scanned for new words. Only valid
        after :func:`batch_start` has been called.

        :param str text: text string
        :return: 1 success 0 fail
        """
        return self.get_func("NWF_Batch_AddMem", [c_char_p], c_ulong)(text)

    # Correctly spelled alias matching the native name (AddMem). The original
    # ``batch_addmen`` spelling is kept so existing callers continue to work.
    batch_addmem = batch_addmen

    @NLPIRBase.byte_str_transform
    def batch_complete(self) -> int:
        """
        Call **NWF_Batch_Complete**

        Signal that no more content will be added to the current new-word
        identification run. Only valid after :func:`batch_start` has been called.

        :return: 1 success 0 fail
        """
        return self.get_func("NWF_Batch_Complete", None, c_int)()

    @NLPIRBase.byte_str_transform
    def batch_getresult(self, format_json: bool = False) -> str:
        """
        Call **NWF_Batch_GetResult**

        Get the result of new-word identification. Only valid after
        :func:`batch_complete` has been called.

        :param bool format_json: get json format or not
        :return: the identified new words

        ::

            Default format:

            【new word 1】 【weight 1】 【new word 2】 【weight 2】 ...

            JSON format:

            [
               {
                  "freq" : 152,
                  "pos" : "n_new",
                  "weight" : 77.884208081632579,
                  "word" : "公允价值"
               },
               {
                  "freq" : 71,
                  "pos" : "n_new",
                  "weight" : 75.102183562405372,
                  "word" : "长期股权投资"
               }
            ]
        """
        return self.get_func("NWF_Batch_GetResult", [c_bool], c_char_p)(format_json)

    @NLPIRBase.byte_str_transform
    def result2user_dict(self) -> int:
        """
        Call **NWF_Result2UserDict**

        Import the new-word identification result into the user dictionary.
        Only valid after :func:`batch_complete` has been called.

        To keep the new words permanently, the original documentation recommends
        calling ``NLPIR_SaveTheUsrDic`` afterwards.

        :return: unsigned integer status of the native call (declared as ``c_uint``);
            non-zero means success, 0 means fail
        """
        return self.get_func("NWF_Result2UserDict", None, c_uint)()

    @NLPIRBase.byte_str_transform
    def get_last_error_msg(self) -> str:
        """
        Call **NWF_GetLastErrorMsg**

        :return: the last error message reported by the native library
        """
        return self.get_func("NWF_GetLastErrorMsg", None, c_char_p)()