Source code for nlpir.new_word_finder

#! coding=utf-8
"""
high-level toolbox for Chinese New Word Finder
"""
# pylint: disable=duplicate-code

from nlpir import get_instance as __get_instance__
from nlpir import native, logger
import typing
import json

# class and class instance
__cls__ = native.new_word_finder.NewWordFinder
__instance__: typing.Optional[native.new_word_finder.NewWordFinder] = None
# Location of DLL
__lib__ = None
# Data directory
__data__ = None
# license_code
__license_code__ = None
# encode
__nlpir_encode__ = native.UTF8_CODE


[docs]@__get_instance__ def get_native_instance() -> native.new_word_finder.NewWordFinder: """ 返回原生NLPIR接口,使用更多函数 :return: The singleton instance """ return __instance__
[docs]@__get_instance__ def find_new_words(text: str, max_key: int) -> typing.Optional[typing.List[dict]]: """ 获取新词,返回新词和对应权重词性,此函数适合较少数量的文本的新词发现功能(不超过20M), 若数量较大可以使用 :func:`find_new_words_batch` 进行处理 :param text: 文本内容 :param max_key: 新词发现返回的新词数量最大值 :return: 格式如下 :: [ { "freq" : 225, "pos" : "n_new", "weight" : 126.28066602434734, "word" : "主权者" }, { "freq" : 100, "pos" : "n_new", "weight" : 60.782738270597314, "word" : "行政官" }, { "freq" : 103, "pos" : "n_new", "weight" : 45.549023266744136, "word" : "卢梭" }, { ... } ... ] """ return json.loads(__instance__.get_new_words(line=text, max_key_limit=max_key, format_opt=native.OUTPUT_FORMAT_JSON))
[docs]@__get_instance__ def find_new_words_batch(text_iter: typing.Iterable[str]) -> typing.Optional[typing.List[dict]]: """ :param text_iter: 文本迭代器, 迭代器内容为文本 :return: 同 :func:`find_new_words` """ logger.debug("batch new word finder start") if not __instance__.batch_start(): logger.error("batch start fail") return None for text in text_iter: logger.debug("adding text to system") if not __instance__.batch_addmen(text=text): logger.error("fail to add file") if not __instance__.batch_complete(): logger.error("complete batch process fail") return None try: return json.loads(__instance__.batch_getresult(format_json=True)) except json.decoder.JSONDecodeError: logger.error("get result fail") return []
[docs]def export_dict(words_result: typing.List[dict], save_path: str) -> bool: """ 将 :func:`find_new_words_batch` 和 :func:`find_new_words` 获得的结果保存为词典 :param words_result: 新词发现返回结果 :param save_path: 保存词典位置 :return: 是否保存成功 """ dict_txt = "" for i in words_result: item = f"{i['word']}\t{i['pos']}\n" dict_txt += item with open(save_path, "w") as f: f.write(dict_txt) return True