Source code for nlpir.cluster

#! coding=utf-8
"""
high-level toolbox for Text Cluster
"""
from nlpir import get_instance as __get_instance__
from nlpir import native
import typing
from xml.etree.ElementTree import fromstring
import hashlib

# Native wrapper class backing this module, and the module-wide singleton
# instance of it (None until one is created; presumably populated by the
# ``__get_instance__`` decorator -- confirm in the nlpir package)
__cls__ = native.cluster.Cluster
__instance__: typing.Optional[native.Cluster] = None
# Location of the native DLL / shared library (None here; presumably means
# "use the packaged default" -- TODO confirm)
__lib__ = None
# Data directory (None here; presumably falls back to a default -- TODO confirm)
__data__ = None
# License code (None here)
__license_code__ = None
# Encoding constant for this module, set to the UTF-8 code from ``native``
# (presumably handed to the native library on initialisation -- confirm)
__nlpir_encode__ = native.UTF8_CODE


@__get_instance__
def get_native_instance() -> native.Cluster:
    """
    Return the native NLPIR Cluster interface, which exposes more functions
    than the high-level helpers of this module.

    The value returned is the module-level ``__instance__``; it is presumably
    created by the ``__get_instance__`` decorator before this body runs
    (confirm against the ``nlpir`` package).

    :return: The singleton instance
    """
    return __instance__
def hash_text(text: str) -> str:
    """
    Build the document id used by :func:`cluster` for a piece of text.

    The id is the hexadecimal MD5 digest of the UTF-8 encoded text. MD5 is
    used here only as a content fingerprint, not for any security purpose.

    :param text: the document text
    :return: 32-character lowercase hexadecimal digest
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()
@__get_instance__
def cluster(
        iter_text,
        max_doc: int,
        max_cls: int
) -> typing.Tuple[typing.List[typing.Dict], typing.Dict[str, str]]:
    """
    Cluster a set of texts and return the result in the structure below

    ::

        [
            {
                "cluster_id": "1",
                "feature": ["aa", "bb", "cc"],
                "doc_num": 3,
                "doc_ids": ["abc", "cfg", "cca"]
            },
            ...
        ]

    ``doc_ids`` are generated by :func:`hash_text` from each document's text.
    Every ``doc_id`` and its text are also returned in ``text_dict`` as
    key/value pairs, so texts with identical content share one entry

    ::

        {
            doc_id: text
        }

    The native instance is cleared before the texts are added and again
    afterwards, including when adding a text or fetching the result raises,
    so no documents are left behind in the shared singleton.

    :param iter_text: an iterable yielding the texts to cluster
    :param max_doc: the maximum count of texts, passed to the native
        ``set_parameter``
    :param max_cls: the maximum count of classes for clustering, passed to
        the native ``set_parameter``
    :return: ``result``, ``text_dict``
    """
    text_dict = dict()
    __instance__.clean_data()
    try:
        __instance__.set_parameter(max_cls, max_doc)
        for text in iter_text:
            signature = hash_text(text)
            __instance__.add_content(text=text, signature=signature)
            text_dict[signature] = text
        xml_result = __instance__.get_latest_result_e()
    finally:
        # Always release the accumulated documents, even on failure, so the
        # next call on the shared instance starts from a clean state.
        __instance__.clean_data()
    return __xml2dict__(xml_result), text_dict
def __xml2dict__(xml_txt: str) -> typing.List[typing.Dict]:
    """
    Transform the XML clustering result into a list of dicts

    ::

        [
            {
                "cluster_id": "1",
                "feature": ["aa", "bb", "cc"],
                "doc_num": 3,
                "doc_ids": ["abc", "cfg", "cca"]
            },
            ...
        ]

    Every ``clus`` element directly under the root yields one dict.
    ``cluster_id`` is the element's ``id`` attribute, kept as a string.
    A ``feature`` child supplies the whitespace-separated feature words; any
    other child is treated as the document list, whose ``num`` attribute
    becomes ``doc_num`` and whose children's texts become ``doc_ids``.
    All four keys are always present: an absent or empty ``feature`` gives
    ``[]`` and an absent document list gives ``[]`` / ``0``.

    :param xml_txt: XML text of the clustering result
    :return: list of cluster dicts as shown above
    """
    xml_ele = fromstring(xml_txt)
    list_result = list()
    for item in xml_ele.findall("clus"):
        item_dict = {
            "cluster_id": item.attrib["id"],
            "feature": [],
            "doc_ids": [],
            "doc_num": 0,
        }
        for child in item:
            if child.tag == "feature":
                # Element.text is None for an empty element such as
                # <feature/>; treat that as "no feature words".
                item_dict["feature"] = (child.text or "").split()
            else:
                item_dict["doc_ids"] = [doc.text for doc in child]
                item_dict["doc_num"] = int(child.get("num"))
        list_result.append(item_dict)
    return list_result
def searchable_result(
        cluster_result: typing.List[typing.Dict]
) -> typing.Tuple[
    typing.Dict[str, typing.List[str]],
    typing.Dict[str, typing.List[str]],
    typing.Dict[str, typing.Set[str]]
]:
    """
    The result of :func:`cluster` is not easy to search. This function
    transforms it into a set of dicts that make lookups easier.

    Three dicts are returned.

    ``cluster_cls`` maps every class to its features

    ::

        {
            class_id: [class_feature]
        }

    ``feature_dict`` is the reverse lookup: which classes have this feature.
    Each class id is listed at most once per word, in first-seen order

    ::

        {
            word: [class_id]
        }

    ``result_dict`` maps a document's hash (from :func:`hash_text`) to the
    classes it belongs to

    ::

        {
            doc_id: {class_id_1, class_id_2}
        }

    A cluster dict without a ``"feature"`` or ``"doc_ids"`` key is treated as
    having an empty list for that key.

    :param cluster_result: list of cluster dicts, each with a ``"cluster_id"``
        key and optionally ``"feature"`` and ``"doc_ids"`` lists
    :return: ``cluster_cls``, ``feature_dict``, ``result_dict``
    """
    # class_id: [class_feature]
    cluster_cls = dict()
    # word: [class_id]
    feature_dict = dict()
    # doc_id: {class_id}
    result_dict = dict()
    for cls in cluster_result:
        cluster_id = cls["cluster_id"]
        features = cls.get("feature", [])
        cluster_cls[cluster_id] = features
        for word in features:
            class_ids = feature_dict.setdefault(word, [])
            # A word repeated inside one cluster must not list that
            # cluster twice in the reverse index.
            if cluster_id not in class_ids:
                class_ids.append(cluster_id)
        for doc_id in cls.get("doc_ids", []):
            result_dict.setdefault(doc_id, set()).add(cluster_id)
    return cluster_cls, feature_dict, result_dict