Source code for nlpir.key_scanner

#! coding=utf-8
"""
high-level toolbox for Chinese Key-word Extraction
"""
# pylint: disable=duplicate-code

from nlpir import get_instance as __get_instance__
from nlpir import native
import typing
import re
import json

# class and class instance
__cls__ = native.key_scanner.KeyScanner
__instance__: typing.Optional[native.key_scanner.KeyScanner] = None
# Location of DLL
__lib__ = None
# Data directory
__data__ = None
# license_code
__license_code__ = None
# encode
__nlpir_encode__ = native.UTF8_CODE


[docs]@__get_instance__ def get_native_instance() -> native.key_scanner.KeyScanner: """ 返回原生NLPIR接口,使用更多函数 :return: The singleton instance """ return __instance__
[docs]class KeyScanProcessor: """ 关键词过滤处理单元, 需要指定使用的过滤器, 若要新建 一过滤器, 选择没有被使用的编号实例化本类, 并使用 :func:`import_user_dict` 导入词典后便可使用,下次使用时仅需 实例化对应编号的本类即可. 导入词典后的过滤器会保存在Data文件夹下,若不删除则会一直存在,和导入 用户词典相似,不会消失. :param filter_type_index: 需要使用的过滤器 """ @__get_instance__ def __init__(self, filter_type_index: int = 0): self.handle = __instance__.new_instance(filter_type_index=filter_type_index) @__get_instance__ def __del__(self): __instance__.delete_instance(self.handle)
[docs] def scan(self, text: str): """ Scan text :param text: :return: :: [ { "class": class_name, "freq": frequent_of_class_hit },{ ... }, ... ] """ result = __instance__.scan(text, self.handle) result = re.findall(r"(.+?)/(\d+)#", result) return [{"class": _[0], "freq": _[1]} for _ in result]
[docs] def scan_detail(self, text: str): """ Scan text get detail :: { 'Rules': ['傻逼'], 'illegal':{ 'classes': [ {'freq': 1, 'word': '粗言秽语'}, {'freq': 1, 'word': '污言秽语'}, {'freq': 1, 'word': '新华社禁用'} ], 'hit_count': 3, 'keys': ['傻逼'], 'scan_val': 3.333333333333333 }, 'line_id': 0, 'org_file': '', 'score': 3.333333333333333 } :param text: :return: """ result = __instance__.scan_detail(text, handle=self.handle) result = json.loads(result) if len(result) > 2 else dict() result["rules"] = result.pop("Rules", None) result.pop("Details", None) result.pop("filename", None) result.pop("legal", None) return result
[docs]@__get_instance__ def import_user_dict( user_dict: str, filter_index: int = 0, pinyin_abbrev_needed: bool = True, over_write: bool = False ) -> bool: """ 导入词典, 对应参数参考 :func:`nlpir.native.key_scanner.KeyScanner.import_user_dict` :param user_dict: :param filter_index: :param pinyin_abbrev_needed: :param over_write: :return: """ handle = __instance__.new_instance(filter_index) __instance__.import_user_dict( filename=user_dict, pinyin_abbrev_needed=pinyin_abbrev_needed, over_write=over_write, handle=handle ) __instance__.delete_instance(handle) return True
[docs]@__get_instance__ def delete_user_dict( user_dict: str, filter_index: int = 0 ) -> bool: """ 删除词典中的某些单词, 对应参数参考 :func:`nlpir.native.key_scanner.KeyScanner.delete_user_dict` :param user_dict: :param filter_index: :return: """ handle = __instance__.new_instance(filter_index) __instance__.delete_user_dic(user_dict, handle=handle) __instance__.delete_instance(handle) return True