Source code for nlpir.doc_extractor

#! coding=utf-8
"""
high-level toolbox for Document Extractor
"""
import re
import typing
import nlpir
from nlpir import get_instance as __get_instance__
from nlpir import native

# class and class instance
__cls__ = native.doc_extractor.DocExtractor
__instance__: typing.Optional[native.doc_extractor.DocExtractor] = None
# Location of DLL
__lib__ = None
# Data directory
__data__ = None
# license_code
__license_code__ = None
# encode
__nlpir_encode__ = native.UTF8_CODE


[docs]class ExtractResult: """ A class for retrieve result from Document Extractor's handle """ #: Types map can be retrieved from DocExtractor retrieve_type_map: typing.Dict[str, int] = { "person": native.doc_extractor.DOC_EXTRACT_TYPE_PERSON, "location": native.doc_extractor.DOC_EXTRACT_TYPE_LOCATION, "organization": native.doc_extractor.DOC_EXTRACT_TYPE_ORGANIZATION, "keyword": native.doc_extractor.DOC_EXTRACT_TYPE_KEYWORD, "author": native.doc_extractor.DOC_EXTRACT_TYPE_AUTHOR, "media": native.doc_extractor.DOC_EXTRACT_TYPE_MEDIA, "country": native.doc_extractor.DOC_EXTRACT_TYPE_COUNTRY, "province": native.doc_extractor.DOC_EXTRACT_TYPE_PROVINCE, "abstract": native.doc_extractor.DOC_EXTRACT_TYPE_ABSTRACT, "positive": native.doc_extractor.DOC_EXTRACT_TYPE_POSITIVE, "negative": native.doc_extractor.DOC_EXTRACT_TYPE_NEGATIVE, "text": native.doc_extractor.DOC_EXTRACT_TYPE_TEXT, "time": native.doc_extractor.DOC_EXTRACT_TYPE_TIME, "user": native.doc_extractor.DOC_EXTRACT_TYPE_USER } def __init__(self, handle: int, user_retrieve_type: typing.List[str]): self.handle: int = handle # add user defined pos self.user_retrieve_type_map: typing.Dict[str, int] = { _: self.retrieve_type_map["user"] + i for i, _ in user_retrieve_type } self.retrieve_types: typing.List[int] = [ native.doc_extractor.DOC_EXTRACT_TYPE_PERSON, native.doc_extractor.DOC_EXTRACT_TYPE_LOCATION, native.doc_extractor.DOC_EXTRACT_TYPE_ORGANIZATION, native.doc_extractor.DOC_EXTRACT_TYPE_KEYWORD, native.doc_extractor.DOC_EXTRACT_TYPE_AUTHOR, native.doc_extractor.DOC_EXTRACT_TYPE_MEDIA, native.doc_extractor.DOC_EXTRACT_TYPE_COUNTRY, native.doc_extractor.DOC_EXTRACT_TYPE_PROVINCE, native.doc_extractor.DOC_EXTRACT_TYPE_ABSTRACT, native.doc_extractor.DOC_EXTRACT_TYPE_POSITIVE, native.doc_extractor.DOC_EXTRACT_TYPE_NEGATIVE, native.doc_extractor.DOC_EXTRACT_TYPE_TEXT, native.doc_extractor.DOC_EXTRACT_TYPE_TIME, ] self.retrieve_types += self.user_retrieve_type_map.values() self.all_available_type_map = self.get_available_retrieve_types() self.__retrieve_type_reverse_map = { self.all_available_type_map[retrieve_type]: retrieve_type for retrieve_type in self.all_available_type_map } self.re_result = re.compile(r"(.+?)/([a-z0-9A-Z]+?)/([.\d]+?)/(\d+)?#")
[docs] def get_available_retrieve_types(self) -> typing.Dict[str, int]: """ Get a set of types_name and types available for current extraction result :return: """ return {**self.retrieve_type_map, **self.user_retrieve_type_map}
[docs] def set_retrieve_types(self, retrieve_type_list: typing.List[int]) -> bool: """ Set what type of data want to get from :func:`get_result` , can be set multi-times :param retrieve_type_list: list of retrieve types :return: """ self.retrieve_types = retrieve_type_list return True
[docs] @__get_instance__ def get_result( self, retrieve_types: typing.Optional[typing.List[int]] = None ) -> typing.Dict[str, typing.List[typing.Dict[str, typing.Union[str, int, float]]]]: """ Get result from current result, can be retrieved multi-times. :param retrieve_types: option, a list of retrieve types want to get, default is all types can be retrieved or certain types set by :func:`set_retrieve_types` :return: a dict of result : ``{type_name: [result}]}`` , example :: { "person": [ { "word": "卢梭", "pos": "n", "weight": 1.5, "frq": 100 } ] } """ if retrieve_types is not None: self.set_retrieve_types(retrieve_types) result_dict = dict() for retrieve_type in self.retrieve_types: result = __instance__.get_result( handle=self.handle, doc_extract_type=retrieve_type ) result = self.re_result.findall(result) result_dict[self.__retrieve_type_reverse_map[retrieve_type]] = [ { "word": string_tuple[0], "pos": string_tuple[1], "weight": float(string_tuple[2]), "frq": int(string_tuple[3]) } for string_tuple in result ] return result_dict
[docs] @__get_instance__ def get_sentiment_result(self) -> int: """ Get sentiment point from current extraction result :return: """ return __instance__.get_sentiment_score(self.handle)
@__get_instance__ def __del__(self): return __instance__.release_handle(self.handle)
[docs]@__get_instance__ def get_native_instance() -> native.doc_extractor.DocExtractor: """ 返回原生NLPIR接口,使用更多函数 :return: The singleton instance """ return __instance__
[docs]@__get_instance__ def extract(text: str, user_define_pos: typing.List[str]) -> ExtractResult: """ :param text: :param user_define_pos: :return: """ handle = __instance__.pares_doc_e(text, "#".join(user_define_pos)) return ExtractResult(handle=handle, user_retrieve_type=user_define_pos)
[docs]@__get_instance__ def import_dict(word_list: list) -> list: """ See :func:`nlpir.import_dict` :param word_list: list of words want to add to NLPIR :return: the word fail to add to the NLPIR """ return nlpir.import_dict(word_list=word_list, instance=__instance__)
[docs]@__get_instance__ def clean_user_dict() -> bool: """ See :func:`nlpir.clean_user_dict` :return: success or not """ return nlpir.clean_user_dict(instance=__instance__)
[docs]@__get_instance__ def delete_user_word(word_list: list): """ See :func:`nlpir.delete_user_word` :param word_list: list of words want to delete """ return nlpir.delete_user_word(word_list=word_list, instance=__instance__)
[docs]@__get_instance__ def save_user_dict() -> bool: """ See :func:`nlpir.save_user_dict` :return: Success or not """ return nlpir.save_user_dict(instance=__instance__)
[docs]@__get_instance__ def clean_saved_user_dict(): """ See :func:`nlpir.clean_saved_user_dict` :return: Delete success or not """ return nlpir.clean_saved_user_dict()
[docs]@__get_instance__ def import_blacklist(filename: str, pos_blacklist=typing.List[str]) -> bool: """ Import Blacklist to system, see :func:`nlpir.import_blacklist` """ return nlpir.import_blacklist(__instance__, filename, pos_blacklist)
[docs]@__get_instance__ def clean_blacklist() -> bool: """ 清除黑名单词表, see :func:`nlpir.clean_blacklist` :return: clean success or not """ return nlpir.clean_blacklist()
[docs]@__get_instance__ def recover_blacklist() -> bool: """ 恢复黑名单词表,仅在被重命名的词表存在时才起作用, see :func:`nlpir.recover_blacklist` :return: """ return nlpir.recover_blacklist()