#! coding=utf-8
"""
high-level toolbox for Text Cluster
"""
from nlpir import get_instance as __get_instance__
from nlpir import native
import typing
from xml.etree.ElementTree import fromstring
import hashlib
# Module-level state for the cluster toolbox.
# NOTE(review): these names are presumably read and populated by the
# ``get_instance`` decorator imported above -- confirm in the ``nlpir`` package.
# Native wrapper class for the cluster component
__cls__ = native.cluster.Cluster
# Shared instance of the native wrapper; starts out as ``None``
__instance__: typing.Optional[native.Cluster] = None
# Location of the native DLL; ``None`` means no explicit location is set here
__lib__ = None
# Data directory; ``None`` means no explicit directory is set here
__data__ = None
# License code; ``None`` means no explicit code is set here
__license_code__ = None
# Encoding constant handed to NLPIR (UTF-8)
__nlpir_encode__ = native.UTF8_CODE
@__get_instance__
def get_native_instance() -> native.Cluster:
    """
    Return the native NLPIR cluster interface, for callers that need
    functions beyond the ones wrapped by this module.

    :return: The singleton instance
    """
    return __instance__
def hash_text(text: str) -> str:
    """
    Compute the id used to refer to a document.

    :param text: the document text
    :return: hexadecimal MD5 digest of the UTF-8 encoded ``text``
    """
    hasher = hashlib.md5()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
@__get_instance__
def cluster(iter_text, max_doc: int, max_cls: int) -> typing.Tuple[typing.List[typing.Dict], typing.Dict[str, str]]:
    """
    Cluster a set of texts and return the clusters together with a lookup
    table from document id to document text.

    ``result`` is the list built by :func:`__xml2dict__` from the XML that
    the native instance returns. ``cluster_id`` is the XML ``id`` attribute
    and is therefore a string
    ::

        [
            {
                "cluster_id": "1",
                "feature": ["aa", "bb", "cc"],
                "doc_num": 3,
                "doc_ids": ["abc", "cfg", "cca"]
            },
            ...
        ]

    Every ``doc_id`` is generated by :func:`hash_text` from the document
    text. All ``doc_id`` / text pairs are stored in ``text_dict`` as KV
    ::

        {
            doc_id: text
        }

    Identical texts hash to the same id, so they share a single entry in
    ``text_dict``.

    :param iter_text: an iterable yielding the texts to cluster
    :param max_doc: the maximum count for text
    :param max_cls: the maximum count of class for clustering
    :return: ``result``, ``text_dict``
    """
    text_dict = dict()
    # NOTE(review): clean_data presumably drops documents previously added to
    # the native instance -- confirm against native.Cluster.
    __instance__.clean_data()
    try:
        __instance__.set_parameter(max_cls, max_doc)
        for text in iter_text:
            signature = hash_text(text)
            __instance__.add_content(text=text, signature=signature)
            text_dict[signature] = text
        xml_result = __instance__.get_latest_result_e()
    finally:
        # Run the trailing cleanup even when feeding or clustering raises, so
        # a failed call does not leave its documents in the shared instance.
        __instance__.clean_data()
    return __xml2dict__(xml_result), text_dict
def __xml2dict__(xml_txt: str) -> typing.List[typing.Dict]:
    """
    Transform the XML cluster result into a list of dicts, one per ``clus``
    child of the root element
    ::

        [
            {
                "cluster_id": "1",
                "feature": ["aa", "bb", "cc"],
                "doc_num": 3,
                "doc_ids": ["abc", "cfg", "cca"]
            },
            ...
        ]

    ``cluster_id`` is the ``id`` attribute of the ``clus`` element, kept as a
    string. All four keys are always present; a ``clus`` element without the
    corresponding child yields an empty ``feature`` / ``doc_ids`` list and a
    ``doc_num`` of ``0``.

    :param xml_txt: XML text whose root element contains ``clus`` elements
    :return: the list of cluster dicts, in document order
    :raises KeyError: if a ``clus`` element has no ``id`` attribute
    :raises ValueError: if a ``num`` attribute is not an integer
    """
    clusters = []
    for clus in fromstring(xml_txt).findall("clus"):
        item = {
            "cluster_id": clus.attrib["id"],
            "feature": [],
            "doc_num": 0,
            "doc_ids": [],
        }
        for child in clus:
            if child.tag == "feature":
                # ``text`` is None for an empty element such as <feature/>
                item["feature"] = (child.text or "").split()
            else:
                # Any non-feature child is treated as the document container
                doc_ids = [doc.text for doc in child]
                item["doc_ids"] = doc_ids
                num = child.get("num")
                # Fall back to the number of listed documents when the
                # ``num`` attribute is absent
                item["doc_num"] = len(doc_ids) if num is None else int(num)
        clusters.append(item)
    return clusters
def searchable_result(
        cluster_result: typing.List[typing.Dict]
) -> typing.Tuple[
    typing.Dict[str, typing.List[str]],
    typing.Dict[str, typing.List[str]],
    typing.Dict[str, typing.Set[str]]
]:
    """
    Re-index the result of :func:`cluster` into three lookup tables, because
    the plain list of clusters is awkward to search.

    ``cluster_cls`` gives the features of every class ::

        {
            class_id: [class_feature]
        }

    ``feature_dict`` is the reverse lookup: which classes have a feature ::

        {
            word: [class_id]
        }

    ``result_dict`` gives the classes of a document, keyed by the document
    hash from :func:`hash_text` ::

        {
            doc_id: {class_id_1, class_id_2}
        }

    :param cluster_result: list of cluster dicts with the keys
        ``cluster_id``, ``feature`` and ``doc_ids``
    :return: ``cluster_cls``, ``feature_dict``, ``result_dict``
    """
    # class_id -> [class_feature]
    cluster_cls = {}
    # word -> [class_id]
    feature_dict = {}
    # doc_id -> {class_id}
    result_dict = {}
    for entry in cluster_result:
        features = entry["feature"]
        class_id = entry["cluster_id"]
        cluster_cls[class_id] = features
        for word in features:
            feature_dict.setdefault(word, []).append(class_id)
        for doc_id in entry["doc_ids"]:
            result_dict.setdefault(doc_id, set()).add(class_id)
    return cluster_cls, feature_dict, result_dict