# Source code for nlpir.native.doc_extractor
# coding=utf-8
from nlpir.native.nlpir_base import NLPIRBase
from ctypes import c_bool, c_char_p, c_int, c_uint, c_size_t
# Result-type ids; DocExtractor.get_result documents its ``doc_extract_type``
# argument as one of these values, starting from DOC_EXTRACT_TYPE_PERSON.
DOC_EXTRACT_TYPE_PERSON = 0 #: Person names
DOC_EXTRACT_TYPE_LOCATION = 1 #: Location names
DOC_EXTRACT_TYPE_ORGANIZATION = 2 #: Organization names
DOC_EXTRACT_TYPE_KEYWORD = 3 #: Keywords
DOC_EXTRACT_TYPE_AUTHOR = 4 #: Author of the article
DOC_EXTRACT_TYPE_MEDIA = 5 #: Media
DOC_EXTRACT_TYPE_COUNTRY = 6 #: Country the article relates to
DOC_EXTRACT_TYPE_PROVINCE = 7 #: Province the article relates to
DOC_EXTRACT_TYPE_ABSTRACT = 8 #: Abstract of the article
DOC_EXTRACT_TYPE_POSITIVE = 9 #: Positive sentiment words of the article
DOC_EXTRACT_TYPE_NEGATIVE = 10 #: Negative sentiment words of the article
DOC_EXTRACT_TYPE_TEXT = 11 #: Body text of the article after web-page (HTML) tags are removed
DOC_EXTRACT_TYPE_TIME = 12 #: Time words
#: User-defined word category; this is the first user-defined category.
#: Further user-defined categories are numbered :data:`DOC_EXTRACT_TYPE_USER` + 1 , :data:`DOC_EXTRACT_TYPE_USER` + 2 , ...
DOC_EXTRACT_TYPE_USER = 13
# Feature-selection bit flags: every constant below sets exactly one bit, and
# ALL_REQUIRED (all 16 low bits set) is the default ``func_required`` argument
# of DocExtractor.pares_doc_e.
PERSON_REQUIRED = 1 << 0
LOCATION_REQUIRED = 1 << 1
ORGANIZATION_REQUIRED = 1 << 2
KEYWORD_REQUIRED = 1 << 3
AUTHOR_REQUIRED = 1 << 4
# Bits 5-7 are not assigned to any named flag in this module.
MEDIA_REQUIRED = 1 << 8
COUNTRY_REQUIRED = 1 << 9
PROVINCE_REQUIRED = 1 << 10
ABSTRACT_REQUIRED = 1 << 11
SENTIWORD_REQUIRED = 1 << 12
SENTIMENT_REQUIRED = 1 << 13
TIME_REQUIRED = 1 << 14
HTML_REMOVER_REQUIRED = 1 << 15  #: Option flag: whether web-page (HTML) tag removal is required
ALL_REQUIRED = (1 << 16) - 1  # 0xffff
class DocExtractor(NLPIRBase):
    """
    A dynamic link library native class for Document Extractor.

    Every method looks up one ``DE_*`` function of the native library through
    ``get_func`` (passing the ctypes argument and return types) and calls it
    immediately with the given arguments.
    """

    DOC_EXTRACT_DELIMITER = "#"  #: Delimiter
    DOC_EXTRACT_TYPE_MAX_LENGTH = 600  # Maximum length

    @property
    def dll_name(self) -> str:
        """
        Name of the native library wrapped by this class.

        :return: the constant string ``"DocExtractor"``
        """
        return "DocExtractor"

    @NLPIRBase.byte_str_transform
    def init_lib(self, data_path: str, encode: int, license_code: str) -> int:
        """
        Call **DE_Init**

        :param str data_path:
        :param int encode:
        :param str license_code:
        :return: 1 success 0 fail
        """
        return self.get_func('DE_Init', [c_char_p, c_int, c_char_p], c_int)(data_path, encode, license_code)

    def exit_lib(self) -> bool:
        """
        Call **DE_Exit**

        :return: exit success or not
        """
        return self.get_func('DE_Exit', restype=c_bool)()

    @NLPIRBase.byte_str_transform
    def get_last_error_msg(self) -> str:
        """
        Call **DE_GetLastErrorMsg**

        :return: error message
        """
        return self.get_func("DE_GetLastErrorMsg", None, c_char_p)()

    @NLPIRBase.byte_str_transform
    def pares_doc_e(
            self, text: str,
            user_def_pos: str,
            summary_needed: bool = True,
            func_required: int = ALL_REQUIRED
    ) -> int:
        """
        Call **DE_ParseDocE**

        Generate the summary of a single document.

        The method name keeps its historical spelling for backward
        compatibility; ``parse_doc_e`` is an alias of this method.

        :param text: document content
        :param user_def_pos: user-defined part-of-speech tags, at most three kinds (person names, location names,
            organization names, media, etc. are built in and need not be set); different word categories are
            separated by ``#``, e.g. ``gms#gjtgj#g``
        :param summary_needed: whether the abstract should be computed
        :param func_required: bit mask built from the module-level ``*_REQUIRED`` flags, defaults to
            ``ALL_REQUIRED``
        :return: handle used to fetch the results; once all results have been fetched the handle should be
            released with :func:`release_handle`
        """
        return self.get_func("DE_ParseDocE", [c_char_p, c_char_p, c_bool, c_uint], c_size_t)(
            text, user_def_pos, summary_needed, func_required
        )

    # Correctly spelled alias; the original docstrings already referred to this name.
    parse_doc_e = pares_doc_e

    @NLPIRBase.byte_str_transform
    def release_handle(self, handle: int) -> None:
        """
        Call **DE_ReleaseHandle**

        Release the space occupied by a :func:`pares_doc_e` result.

        :param handle: the HANDLE returned by :func:`pares_doc_e`
        :return:
        """
        return self.get_func("DE_ReleaseHandle", [c_size_t], None)(handle)

    @NLPIRBase.byte_str_transform
    def get_result(self, handle: int, doc_extract_type: int) -> str:
        """
        Call **DE_GetResult**

        Fetch the content of one extraction type from a finished :func:`pares_doc_e` result.

        :param handle: the HANDLE returned by :func:`pares_doc_e`
        :param doc_extract_type: extraction type to fetch, one of the ``DOC_EXTRACT_TYPE_*`` values starting
            from ``DOC_EXTRACT_TYPE_PERSON``
        :return:
        """
        return self.get_func("DE_GetResult", [c_size_t, c_int], c_char_p)(handle, doc_extract_type)

    @NLPIRBase.byte_str_transform
    def get_sentiment_score(self, handle: int) -> int:
        """
        Call **DE_GetSentimentScore**

        Fetch the sentiment score of the article from a finished :func:`pares_doc_e` result.

        :param handle: the HANDLE returned by :func:`pares_doc_e`
        :return: positive/negative sentiment score
        """
        return self.get_func("DE_GetSentimentScore", [c_size_t], c_int)(handle)

    @NLPIRBase.byte_str_transform
    def compute_sentiment_doc(self, text: str) -> int:
        """
        Call **DE_ComputeSentimentDoc**

        Generate the sentiment analysis result of a single document.

        :param text: document content
        :return:
        """
        return self.get_func("DE_ComputeSentimentDoc", [c_char_p], c_int)(text)

    @NLPIRBase.byte_str_transform
    def import_sentiment_dict(self, filename: str) -> int:
        """
        Call **DE_ImportSentimentDict**

        Import a user-defined sentiment word list: one word per line, followed by a space and a positive or
        negative weight, e.g. ``语焉不详 -2``.

        If an imported sentiment word is a new word it has to be imported into the user dictionary first,
        otherwise sentiment recognition skips it.

        :param filename:
        :return:
        """
        # The file name is handed over as a C string, so the argument type must be
        # c_char_p (as for every other file-name parameter in this class); the
        # previous c_size_t declaration made ctypes reject the path argument.
        return self.get_func("DE_ImportSentimentDict", [c_char_p], c_int)(filename)

    @NLPIRBase.byte_str_transform
    def import_user_dict(self, filename: str, overwrite: bool = False) -> int:
        """
        Call **DE_ImportUserDict**

        Import a user dictionary, see :func:`nlpir.native.ictclas.ICTCLAS.import_user_dict`

        :param filename:
        :param overwrite:
        :return:
        """
        return self.get_func("DE_ImportUserDict", [c_char_p, c_bool], c_uint)(filename, overwrite)

    @NLPIRBase.byte_str_transform
    def add_user_word(self, word: str) -> int:
        """
        Call **DE_AddUserWord**

        Add a word to the user dictionary, see :func:`nlpir.native.ictclas.ICTCLAS.add_user_word`

        :param word:
        :return:
        """
        return self.get_func("DE_AddUserWord", [c_char_p], c_int)(word)

    @NLPIRBase.byte_str_transform
    def clean_user_word(self) -> int:
        """
        Call **DE_CleanUserWord**

        Clean all temporary added user words, see :func:`nlpir.native.ictclas.ICTCLAS.clean_user_word`

        :return:
        """
        return self.get_func("DE_CleanUserWord", None, c_int)()

    @NLPIRBase.byte_str_transform
    def save_the_usr_dic(self) -> int:
        """
        Call **DE_SaveTheUsrDic**

        Save in-memory dict to user dict, see :func:`nlpir.native.ictclas.ICTCLAS.save_the_usr_dic`

        :return:
        """
        return self.get_func("DE_SaveTheUsrDic", None, c_int)()

    @NLPIRBase.byte_str_transform
    def del_usr_word(self, word: str) -> int:
        """
        Call **DE_DelUsrWord**

        Delete a word from the user dictionary, see :func:`nlpir.native.ictclas.ICTCLAS.del_usr_word`

        :param word:
        :return:
        """
        return self.get_func("DE_DelUsrWord", [c_char_p], c_int)(word)

    @NLPIRBase.byte_str_transform
    def import_key_blacklist(self, filename: str, pos_blacklist: str) -> int:
        """
        Call **DE_ImportKeyBlackList**

        Import keyword black list, see :func:`nlpir.native.key_extract.KeyExtract.import_key_blacklist`

        :param filename:
        :param pos_blacklist:
        :return:
        """
        return self.get_func("DE_ImportKeyBlackList", [c_char_p, c_char_p], c_uint)(filename, pos_blacklist)