Source code for nlpir.native.key_extract
# coding=utf-8
from nlpir.native.nlpir_base import NLPIRBase
from nlpir.native import nlpir_base
from ctypes import c_bool, c_char_p, c_int, c_uint, c_ulong
import typing
[docs]class KeyExtract(NLPIRBase):
"""
A dynamic link library native class for Key Words Extract
"""
@property
def dll_name(self) -> str:
return "KeyExtract"
[docs] @NLPIRBase.byte_str_transform
def init_lib(self, data_path: str, encode: int, license_code: str) -> int:
"""
Call **KeyExtract_Init**
:param str data_path:
:param int encode:
:param str license_code:
:return: 1 success 0 fail
"""
return self.get_func('KeyExtract_Init', [c_char_p, c_int, c_char_p], c_int)(data_path, encode, license_code)
[docs] def exit_lib(self) -> bool:
"""
Call **KeyExtract_Exit**
:return: exit success or not
"""
return self.get_func('KeyExtract_Exit', restype=c_bool)()
[docs] @NLPIRBase.byte_str_transform
def get_keywords(self, line: str, max_key_limit: int = 50, format_opt: int = nlpir_base.OUTPUT_FORMAT_SHARP) -> str:
"""
Call **KeyExtract_GetKeyWords**
Extract keyword from text, 从文本中获取关键词
:param line: the input paragraph
:param max_key_limit: maximum of key words, up to 50
:param format_opt: output format option, there three options:
- :data:`nlpir.native.nlpir_base.OUTPUT_FORMAT_SHARP` get string split by sharp
- :data:`nlpir.native.nlpir_base.OUTPUT_FORMAT_JSON` get json format
- :data:`nlpir.native.nlpir_base.OUTPUT_FORMAT_EXCEL` get csv format
:return: the keyword with weight
Split with ``#``:
::
科学发展观/n/23.80/12#宏观经济/n/12.20/12#
JSON形式:
::
[
{
'freq': 2,
'pos': 'n_new',
'weight': 7.771335980376418,
'word': '国家权力'
},{
'freq': 7,
'pos': 'n',
'weight': 7.438759706600493,
'word': '权力'
},{
'freq': 1,
'pos': 'nrf',
'weight': 5.280000338096665,
'word': '孟德斯鸠'
},{ ...
}, ...
]
"""
return self.get_func('KeyExtract_GetKeyWords', [c_char_p, c_int, c_int], c_char_p)(
line, max_key_limit, format_opt)
[docs] @NLPIRBase.byte_str_transform
def get_file_keywords(
self,
filename: str,
max_key_limit: int = 50,
format_opt: int = nlpir_base.OUTPUT_FORMAT_SHARP
) -> str:
"""
Call **KeyExtract_GetKeyWords**
Extract keyword from file, 从文本文件中获取关键词
:param filename: the input text file
:param max_key_limit: maximum of key words, up to 50
:param format_opt: same as :func:`get_keywords`
:return: the keyword with weight
Split with ``#``
::
科学发展观/n/23.80/12#宏观经济/n/12.20/12#
JSON形式:
::
[
{
'freq': 2,
'pos': 'n_new',
'weight': 7.771335980376418,
'word': '国家权力'
},{
'freq': 7,
'pos': 'n',
'weight': 7.438759706600493,
'word': '权力'
},{
'freq': 1,
'pos': 'nrf',
'weight': 5.280000338096665,
'word': '孟德斯鸠'
},{ ...
}, ...
]
"""
return self.get_func('KeyExtract_GetFileKeyWords', [c_char_p, c_int, c_int], c_char_p)(
filename, max_key_limit, format_opt)
[docs] @NLPIRBase.byte_str_transform
def import_user_dict(self, filename: str, overwrite: bool = False):
"""
Call **KeyExtract_ImportUserDict**
Import a user dict to the system, the format of the dict file::
word1 pos_tag
word2 pos_tag
If you import a user dict to the system, the user dict will save to the system (in Data directory).
You cannot delete the word in the user dict from the system use :func:`clean_user_word` or :func:`del_usr_word`.
:param str filename: the path of user dict file
:param bool overwrite: overwrite the current user dict or not
:return: import success or not 1->True 2->False
"""
return self.get_func('KeyExtract_ImportUserDict', [c_char_p, c_bool], c_uint)(filename, overwrite)
[docs] @NLPIRBase.byte_str_transform
def add_user_word(self, word: str) -> int:
"""
Call **KeyExtract_AddUserWord**
Add a word to the user dictionary ,example::
单词 词性
or::
单词 (default n)
The added word only add in memory and will not affect the user dict, you can use :func:`clean_user_word` or
:func:`del_usr_word` to delete the word or all the words in memory. If you want to save to the user dict ,use
:func:`save_the_usr_dic` to save to the *Data* directory.
:param str word:
:return: 1,true ; 0,false
"""
return self.get_func('KeyExtract_AddUserWord', [c_char_p], c_int)(word)
[docs] @NLPIRBase.byte_str_transform
def clean_user_word(self) -> int:
"""
Call **KeyExtract_CleanUserWord**
Clean all temporary added user words, more info see :func:`add_user_word`
:return: 1,true ; 0,false
"""
return self.get_func('KeyExtract_CleanUserWord', None, c_int)()
[docs] @NLPIRBase.byte_str_transform
def save_the_usr_dic(self) -> int:
"""
Call **KeyExtract_SaveTheUsrDic**
Save in-memory dict to user dict, more info see :func:`add_user_word`
:return: 1,true; 2,false
"""
return self.get_func('KeyExtract_SaveTheUsrDic', None, c_int)()
[docs] @NLPIRBase.byte_str_transform
def del_usr_word(self, word: str) -> int:
"""
Call **KeyExtract_DelUsrWord**
Delete a word from the user dictionary, more info see :func:`add_user_word`
:param str word: the word to be delete
:return: -1, the word not exist in the user dictionary; else, the handle of the word deleted
"""
return self.get_func('KeyExtract_DelUsrWord', [c_char_p], c_int)(word)
[docs] @NLPIRBase.byte_str_transform
def import_key_blacklist(self, filename: str, pos_blacklist: typing.Optional[str] = None) -> int:
"""
Call **KeyExtract_ImportKeyBlackList**
Import keyword black list
This function will save words to KeyBlackList.pdat , if you want to remove the words form the system
need to backup it before use this function. Or use the function :func:`nlpir.key_extract.import_blacklist` ,
That function will backup that file automatically and you can use :func:`nlpir.key_extract.clean_blacklist` to
clean current blacklist and restore the origin file.
This list of word will not affect the key word extract and segmentation
:param filename: A word list that the words want to import to the blacklist (stop word list),
一个停用词词表,里面为想进行屏蔽的词,也可以包括别的词,是否不进行抽取是按照词表中的词性来确定的.
:param pos_blacklist: A list of pos that want to block in the system, 想要屏蔽的词的词性
:return: number of words that import to the systems
"""
return self.get_func('KeyExtract_ImportKeyBlackList', [c_char_p, c_char_p], c_uint)(filename, pos_blacklist)
"""
/*********************************************************************
*
以下函数为2013版本专门针对关键词批量发现的过程,一般建议脱机实现,不宜在线处理
*********************************************************************/
"""
[docs] @NLPIRBase.byte_str_transform
def batch_start(self) -> int:
"""
Call **KeyExtract_Batch_Start**
启动关键词识别
:return: rue:success, false:fail
"""
return self.get_func('KeyExtract_Batch_Start', None, c_int)()
[docs] @NLPIRBase.byte_str_transform
def batch_add_file(self, filename) -> int:
"""
Call **KeyExtract_Batch_AddFile**
往关键词识别系统中添加待识别关键词的文本文件, 需要在运行 :func:`batch_start` 之后,才有效
:param filename: 文件名
:return: true:success, false:fail
"""
return self.get_func('KeyExtract_Batch_AddFile', [c_char_p], c_ulong)(filename)
[docs] @NLPIRBase.byte_str_transform
def batch_addmen(self, txt: str) -> bool:
"""
Call **KeyExtract_Batch_AddMem**
往关键词识别系统中添加一段待识别关键词的内存,需要在运行 :func:`batch_start` 之后,才有效
:param txt: 文件名
:return: true:success, false:fail
"""
return self.get_func('KeyExtract_Batch_AddMem', [c_char_p], c_bool)(txt)
[docs] @NLPIRBase.byte_str_transform
def batch_complete(self) -> int:
"""
Call **KeyExtract_Batch_Complete**
关键词识别添加内容结束,需要在运行 :func:`batch_start` 之后,才有效
:return: true:success, false:fail
"""
return self.get_func('KeyExtract_Batch_Complete', None, c_int)()
[docs] @NLPIRBase.byte_str_transform
def batch_getresult(self, weight_out: bool) -> str:
"""
Call **KeyExtract_Batch_GetResult**
获取关键词识别的结果,需要在运行 :func:`batch_complete` 之后,才有效
:param weight_out: 是否需要输出每个关键词的权重参数
:return: 输出格式为 【关键词1】 【权重1】 【关键词2】 【权重2】 ...
"""
return self.get_func('KeyExtract_Batch_GetResult', [c_bool], str)(weight_out)
[docs] @NLPIRBase.byte_str_transform
def get_last_error_msg(self) -> str:
"""
Call **KeyExtract_GetLastErrorMsg**
:return: error message
"""
return self.get_func("KeyExtract_GetLastErrorMsg", None, c_char_p)()