# Source code for nlpir.cluster

#! coding=utf-8
"""
high-level toolbox for Text Cluster
"""
from nlpir import get_instance as __get_instance__
from nlpir import native
import typing
from xml.etree.ElementTree import fromstring
import hashlib

# Module-level configuration for the clustering toolbox.
# NOTE(review): these globals are presumably read and filled in by the
# ``__get_instance__`` decorator imported from ``nlpir`` -- confirm there.

# Native wrapper class, and the shared instance of it used by the functions
# in this module (``None`` until an instance has been assigned).
__cls__ = native.cluster.Cluster
__instance__: typing.Optional[native.Cluster] = None
# Location of the DLL (``None`` here; presumably means "use the default").
__lib__ = None
# Data directory (``None`` here; presumably means "use the default").
__data__ = None
# License code (``None`` here; presumably means "use the default").
__license_code__ = None
# Text encoding constant handed to the native library: UTF-8.
__nlpir_encode__ = native.UTF8_CODE


@__get_instance__
def get_native_instance() -> native.Cluster:
    """
    Return the native NLPIR Cluster interface, which gives access to more
    functions than the high-level helpers in this module.

    :return: The singleton instance
    """
    return __instance__


def hash_text(text: str) -> str:
    """
    Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    The digest is used as a document id, not for any security purpose;
    identical texts always map to the same id.

    :param text: the document text to hash
    :return: 32-character lowercase hexadecimal digest
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()


@__get_instance__
def cluster(
        iter_text: typing.Iterable[str],
        max_doc: int,
        max_cls: int
) -> typing.Tuple[typing.List[typing.Dict], typing.Dict[str, str]]:
    """
    Cluster a set of texts and return the result in the structure below

    ::

        [
            {
                "cluster_id": "1",
                "feature": ["aa", "bb", "cc"],
                "doc_num": 3,
                "doc_ids": ["abc", "cfg", "cca"]
            },
            ...
        ]


    ``doc_ids`` are generated by :func:`hash_text` from the document text.
    Every ``doc_id`` and its text is stored in ``text_dict`` as a key/value
    pair, so texts with identical content share a single entry

    ::

        {
            doc_ids: text
        }


    :param iter_text: an iterable yielding the texts to cluster
    :param max_doc: the maximum count for text
    :param max_cls: the maximum count of class for clustering
    :return: ``result``, ``text_dict``
    """
    text_dict = {}
    # Start from an empty document set on the shared native instance.
    __instance__.clean_data()
    try:
        __instance__.set_parameter(max_cls, max_doc)
        for text in iter_text:
            signature = hash_text(text)
            __instance__.add_content(text=text, signature=signature)
            text_dict[signature] = text
        xml_result = __instance__.get_latest_result_e()
    finally:
        # Always clear the added documents, even when a step above raised,
        # so a failed run does not leave data behind in the shared instance.
        __instance__.clean_data()
    return __xml2dict__(xml_result), text_dict


def __xml2dict__(xml_txt: str) -> typing.List[typing.Dict]:
    """
    Transform the XML clustering result into a list of dicts

    ::

        [
            {
                "cluster_id": "1",
                "feature": ["aa", "bb", "cc"],
                "doc_num": 3,
                "doc_ids": ["abc", "cfg", "cca"]
            },
            ...
        ]

    ``cluster_id`` is kept as the string found in the ``id`` attribute;
    ``doc_num`` is converted to ``int``.

    :param xml_txt: XML text whose root element contains ``clus`` elements
    :return: one dict per ``clus`` element, in document order
    """
    root = fromstring(xml_txt)
    clusters = []
    for clus in root.findall("clus"):
        entry = {"cluster_id": clus.attrib["id"]}
        for child in clus:
            if child.tag == "feature":
                # An empty <feature/> element has text None; treat it as
                # "no feature words" instead of failing on None.split().
                entry["feature"] = (child.text or "").split()
            else:
                # Any other child is treated as the container of documents.
                doc_ids = [doc.text for doc in child]
                entry["doc_ids"] = doc_ids
                num = child.get("num")
                # Fall back to the actual number of listed documents when
                # the "num" attribute is absent.
                entry["doc_num"] = len(doc_ids) if num is None else int(num)
        clusters.append(entry)
    return clusters


def searchable_result(
        cluster_result: typing.List[typing.Dict]
) -> typing.Tuple[
    typing.Dict[str, typing.List[str]],
    typing.Dict[str, typing.List[str]],
    typing.Dict[str, typing.Set[str]]
]:
    """
    The result of :func:`cluster` is not easy to search.
    This function transforms it into a set of lookup dicts that make
    searching the result easier.

    Three dicts are returned,

    ``cluster_cls``, gives every class's features ::

        {
            class_id: [class_feature]
        }

    ``feature_dict``, the reverse of the above, gives the classes that have a feature ::

        {
            word: [class_id]
        }

    ``result_dict``, gives a doc's classes from the doc's hash made by :func:`hash_text` ::

        {
            doc_id: {class_id_1, class_id_2}
        }

    :param cluster_result: list of cluster dicts, each with the keys
        ``cluster_id``, ``feature`` and ``doc_ids``
    :return: ``cluster_cls``, ``feature_dict``, ``result_dict``
    """
    # class_id: [class_feature]
    cluster_cls = {}
    # word: [class_id]
    feature_dict = {}
    # doc_id: {class_id}
    result_dict = {}
    for cls in cluster_result:
        cluster_id = cls["cluster_id"]
        features = cls["feature"]
        cluster_cls[cluster_id] = features
        for word in features:
            feature_dict.setdefault(word, []).append(cluster_id)
        for doc_id in cls["doc_ids"]:
            result_dict.setdefault(doc_id, set()).add(cluster_id)
    return cluster_cls, feature_dict, result_dict