Source code for nlpir.native.key_scanner

# coding=utf-8
from nlpir.native.nlpir_base import NLPIRBase
from ctypes import c_bool, c_char_p, c_int, c_float

#: UTF-8 encoding with automatic Traditional/Simplified Chinese conversion during transcoding;
#: recommended for the scan/filter functions
ENCODING_UTF8_FJ = 5
#: Normal scan mode
SCAN_MODE_NORMAL = 0
#: Shape-variant (deformed character) scan mode
SCAN_MODE_SHAPE = 1
#: Pinyin scan mode
SCAN_MODE_PINYIN = 2
#: Proofreading scan mode
SCAN_MODE_CHECK = 3


class KeyScanner(NLPIRBase):
    """
    A dynamic link library native class for Keyword Scan.

    Every method looks up one exported ``KS_*`` function through ``self.get_func`` (passing the
    ctypes argument types and return type) and calls it immediately with the given arguments.
    """

    @property
    def dll_name(self) -> str:
        """Name of the native library wrapped by this class."""
        return "KeyScanAPI"

    @NLPIRBase.byte_str_transform
    def init_lib(self, data_path: str, encode: int, license_code: str) -> int:
        """
        Call **KS_Init**

        :param str data_path: data directory path handed to the native library
        :param int encode: encoding code, e.g. ``ENCODING_UTF8_FJ``
        :param str license_code: license code handed to the native library
        :return: 1 success 0 fail
        """
        return self.get_func('KS_Init', [c_char_p, c_int, c_char_p], c_int)(
            data_path, encode, license_code)

    def exit_lib(self) -> bool:
        """
        Call **KS_Exit**

        :return: exit success or not
        """
        return self.get_func('KS_Exit', restype=c_bool)()

    @NLPIRBase.byte_str_transform
    def get_last_error_msg(self) -> str:
        """
        Call **KS_GetLastErrorMsg**

        :return: error message
        """
        return self.get_func("KS_GetLastErrorMsg", None, c_char_p)()

    @NLPIRBase.byte_str_transform
    def new_instance(self, filter_type_index: int = 0) -> int:
        """
        Call **KS_NewInstance**

        Get an instance from the system for executing other functions.
        This function must be invoked before using multiple keyword scan filters.

        The native call allocates memory; release it with :func:`delete_instance`
        once all work with the returned handle is finished.

        :param filter_type_index: number of the filter to use for this instance.
            The filter file is saved under `Data/KeyScanner/filter{no}*`
        :return: a handle from the system on success; otherwise -1
        """
        return self.get_func("KS_NewInstance", [c_int], c_int)(filter_type_index)

    @NLPIRBase.byte_str_transform
    def delete_instance(self, handle: int) -> int:
        """
        Call **KS_DeleteInstance**

        Delete a handle created by :func:`new_instance`.
        Once a handle has been deleted it must not be used again in any situation,
        otherwise critical errors will occur.

        :param handle: the handle to delete
        :return: success or not
        """
        return self.get_func("KS_DeleteInstance", [c_int], c_int)(handle)

    @NLPIRBase.byte_str_transform
    def import_user_dict(
            self,
            filename: str,
            over_write: bool = False,
            pinyin_abbrev_needed: bool = False,
            handle: int = 0
    ) -> int:
        """
        Call **KS_ImportUserDict**

        Import a user-defined dictionary. This is a global operation and affects the
        filtering done by other instances.

        Each line of the text file has the format ``word category weight``
        (at most 255 categories may be defined), e.g.::

            some_word category_a 2
            other_word category_b 8

        NOTE(review): the upstream sample appears to show an entry with an extra trailing
        field after the weight; its meaning is not documented here -- confirm against the
        vendor documentation.

        Complex filter conditions support AND / OR / NOT: ``;`` means OR, ``+`` means AND,
        ``-`` means NOT. The format is::

            {key11;key12;key13;...;key1N}+{key21;key22;key23;...;key2N}+...+{keyM1;keyM2;keyM3;...;keyMN}-{keyN}

        Example::

            {a1;a2;a3}+{b1;b2;b3}-{c1;c2} category_a 5

        which matches text that contains at least one of ``a1;a2;a3`` together with at least
        one of ``b1;b2;b3``, but none of ``c1;c2``.

        :param filename: path of user dictionary
        :param over_write: if true, overwrite the word list already held by the system;
            otherwise append to it
        :param pinyin_abbrev_needed: passed through to the native call unchanged
        :param handle: handle of KeyScanner
        :return: success or not
        """
        return self.get_func("KS_ImportUserDict", [c_char_p, c_bool, c_bool, c_int], c_int)(
            filename, over_write, pinyin_abbrev_needed, handle)

    @NLPIRBase.byte_str_transform
    def delete_user_dic(self, text: str, handle: int) -> int:
        """
        Call **KS_DeleteUserDict**

        Delete user-defined dictionary entries. This is a global operation: it removes the
        dictionary file and affects all instances.

        Each line of the text has the format ``word``, e.g.::

            some_word
            other_word

        :param text: Text of user dictionary
        :param handle: handle of KeyScanner
        :return: The number of lexical entries deleted successfully
        """
        return self.get_func("KS_DeleteUserDict", [c_char_p, c_int], c_int)(text, handle)

    @NLPIRBase.byte_str_transform
    def delete_user_dic_from_file(self, filename: str, handle: int) -> int:
        """
        Call **KS_DeleteUserDict**

        Delete user-defined dictionary entries. This is a global operation: it removes the
        dictionary file and affects all instances.

        Each line of the text file has the format ``word``, e.g.::

            some_word
            other_word

        NOTE(review): this calls the same native export, with the same argument types, as
        :func:`delete_user_dic`; whether the library tells a file name apart from inline
        text cannot be determined from this module -- confirm against the vendor header.

        :param filename: Text filename for user dictionary
        :param handle: handle of KeyScanner
        :return: The number of lexical entries deleted successfully
        """
        return self.get_func("KS_DeleteUserDict", [c_char_p, c_int], c_int)(filename, handle)

    @NLPIRBase.byte_str_transform
    def scan(self, content: str, handle: int = 0) -> str:
        """
        Call **KS_Scan**

        Scan the given text content.

        :param content: text content
        :param handle: handle of KeyScanner
        :return: every matched category with its weight, sorted by weight, in the form
            ``category_a/10#category_b/1#``; ``""`` means the scan hit nothing
        """
        return self.get_func("KS_Scan", [c_char_p, c_int], c_char_p)(content, handle)

    @NLPIRBase.byte_str_transform
    def scan_detail(self, content: str, scan_mode: int = SCAN_MODE_NORMAL, handle: int = 0) -> str:
        """
        Call **KS_ScanDetail**

        Scan the given text content and obtain a detailed result.

        :param content: text content
        :param scan_mode: scan mode, one of the ``SCAN_MODE_*`` constants
        :param handle: handle of KeyScanner
        :return: text containing the scan result details, shaped like:

        ::

            {
                "Details": ["...matched text..."],
                "Rules": ["rule_1", "rule_2"],
                "filename": "",
                "illegal": {
                    "classes": [
                        {"freq": 1, "word": "category_a"},
                        {"freq": 1, "word": "category_b"},
                        {"freq": 1, "word": "category_c"},
                        {"freq": 1, "word": "category_d"}
                    ],
                    "hit_count": 4,
                    "keys": ["rule_1", "rule_2"],
                    "scan_val": 13.333333333333332
                },
                "legal": {
                    "hit_count": 0,
                    "scan_val": 0.0
                },
                "line_id": 0,
                "org_file": "",
                "score": 13.333333333333332
            }
        """
        return self.get_func("KS_ScanDetail", [c_char_p, c_int, c_int], c_char_p)(content, scan_mode, handle)

    @NLPIRBase.byte_str_transform
    def scan_file(self, filename: str, handle: int = 0) -> str:
        """
        Call **KS_ScanFile**

        Scan the content of the given text file.

        :param filename: name of the text file
        :param handle: handle of KeyScanner
        :return: same as :func:`scan`
        """
        return self.get_func("KS_ScanFile", [c_char_p, c_int], c_char_p)(filename, handle)

    @NLPIRBase.byte_str_transform
    def scan_file_detail(self, filename: str, handle: int = 0) -> str:
        """
        Call **KS_ScanFileDetail**

        Scan the content of the given text file and obtain a detailed result.

        :param filename: name of the text file
        :param handle: handle of KeyScanner
        :return: same as :func:`scan_detail`
        """
        return self.get_func("KS_ScanFileDetail", [c_char_p, c_int], c_char_p)(filename, handle)

    @NLPIRBase.byte_str_transform
    def scan_line(
            self,
            filename: str,
            result_filename: str,
            handle: int = 0,
            encrypt: int = 0,
            scan_mode: int = SCAN_MODE_NORMAL
    ) -> int:
        """
        Call **KS_ScanLine**

        Scan the content of the given text file line by line.

        :param filename: name of the input text file
        :param result_filename: name of the output result file
        :param handle: handle of KeyScanner
        :param encrypt: 0 no encryption; 1 encrypt
        :param scan_mode: scan mode, one of the ``SCAN_MODE_*`` constants
        :return: the integer returned by the native call (declared as ``c_int``); its exact
            meaning is not documented in this module
        """
        return self.get_func("KS_ScanLine", [c_char_p, c_char_p, c_int, c_int, c_int], c_int)(
            filename, result_filename, handle, encrypt, scan_mode
        )

    @NLPIRBase.byte_str_transform
    def scan_stat(self, result_file: str, handle: int = 0) -> int:
        """
        Call **KS_ScanStat**

        Output a hit-statistics report of the scan results, to help further analysis and review.

        :param result_file: file that receives the output
        :param handle: handle of KeyScanner
        :return: number of files in which problems were found
        """
        return self.get_func("KS_ScanStat", [c_char_p, c_int], c_int)(result_file, handle)

    @NLPIRBase.byte_str_transform
    def scan_dir(
            self,
            input_dir_path: str,
            result_path: str,
            filter: str,
            thread_count: int = 10,
            encrypt: bool = False,
            scan_mode: int = SCAN_MODE_NORMAL
    ) -> int:
        """
        Call **KS_ScanDir**

        Multi-threaded, line-by-line scan of the files inside a directory.

        :param input_dir_path: path of the input directory
        :param result_path: path of the directory that receives the results
        :param filter: file name suffix of the input files
        :param thread_count: number of threads, 10 by default
        :param encrypt: 0 no encryption; 1 encrypt (the value is passed to the native call
            as a ``c_int``)
        :param scan_mode: scan mode, one of the ``SCAN_MODE_*`` constants
        :return: number of files in which problems were found
        """
        return self.get_func("KS_ScanDir", [c_char_p, c_char_p, c_char_p, c_int, c_int, c_int], c_int)(
            input_dir_path, result_path, filter, thread_count, encrypt, scan_mode
        )

    @NLPIRBase.byte_str_transform
    def merge_result(self, path: str) -> None:
        """
        Call **KS_MergeResult**

        Merge the scan results produced by multiple threads.

        :param path: path handed to the native call
        :return: nothing (the native function is declared with no return type)
        """
        return self.get_func("KS_MergeResult", [c_char_p], None)(path)

    @NLPIRBase.byte_str_transform
    def scan_add_stat(self, result_file: str, handle: int) -> int:
        """
        Call **KS_ScanAddStat**

        Merge the scan results of the thread identified by ``handle`` into thread 0.

        :param result_file: result file handed to the native call
        :param handle: handle of KeyScanner
        :return: the integer returned by the native call (declared as ``c_int``)
        """
        return self.get_func("KS_ScanAddStat", [c_char_p, c_int], c_int)(result_file, handle)

    @NLPIRBase.byte_str_transform
    def stat_result_filter(self, input_filename: str, result_filename: str, threshold: float = 5.0) -> int:
        """
        Call **KS_StatResultFilter**

        Filter and analyse the statistics produced by a scan.

        :param input_filename: name of the input result file
        :param result_filename: name of the output result file
        :param threshold: threshold of the "bad content" score
        :return: number of files in which problems were found
        """
        return self.get_func("KS_StatResultFilter", [c_char_p, c_char_p, c_float], c_int)(
            input_filename, result_filename, c_float(threshold))

    @NLPIRBase.byte_str_transform
    def scan_result_filter(self, input_filename: str, result_filename: str, threshold: float = 9.0) -> int:
        """
        Call **KS_ScanResultFilter**

        Filter and analyse the detailed result file produced by a scan.

        :param input_filename: name of the input result file
        :param result_filename: name of the output result file
        :param threshold: threshold of the "bad content" score
        :return: number of files in which problems were found
        """
        return self.get_func("KS_ScanResultFilter", [c_char_p, c_char_p, c_float], c_int)(
            input_filename, result_filename, c_float(threshold))

    @NLPIRBase.byte_str_transform
    def decrypt(self, input_dir_path: str, result_path: str) -> int:
        """
        Call **KS_Decrypt**

        Multi-threaded conversion of scan results.

        :param input_dir_path: path of the input directory
        :param result_path: path of the directory that receives the results
        :return: the integer returned by the native call (declared as ``c_int``)
        """
        return self.get_func("KS_Decrypt", [c_char_p, c_char_p], c_int)(input_dir_path, result_path)

    @NLPIRBase.byte_str_transform
    def export_dict(self, filename: str, handle: int = 0) -> int:
        """
        Call **KS_ExportDict**

        Export the dictionary of already-defined "bad" words. To protect intellectual
        property, this function is restricted to internal use by administrators.

        The text file has the format ``word category weight``
        (at most 255 categories may be defined), e.g.::

            some_word category_a 2
            other_word category_b 8

        :param filename: Text filename for user dictionary
        :param handle: handle of KeyScanner
        :return: number of lexical entries reported by the native call (presumably the
            number exported -- TODO confirm)
        """
        return self.get_func("KS_ExportDict", [c_char_p, c_int], c_int)(filename, handle)