# Source code for nlpir.native.ictclas

# coding=utf-8
from nlpir.native.nlpir_base import NLPIRBase
from ctypes import c_bool, c_char, c_char_p, c_double, c_int, c_uint, POINTER, Structure, byref
import typing


class ResultT(Structure):
    """
    ctypes mirror of the NLPIR ``result_t`` structure (layout copied from pynlpir).

    One instance describes one segmented word. The field order and types below define the
    binary layout shared with the native library, so they must not be reordered.
    """

    _fields_ = [
        # Start position of the word in the source Chinese text string.
        ('start', c_int),

        # Length of the detected word.
        ('length', c_int),

        # Part of speech of the word as a fixed 40-byte character buffer.
        ('sPOS', c_char * 40),

        # NOTE(review): presumably the numeric id of the part of speech - confirm against NLPIR.h.
        ('iPOS', c_int),

        # NOTE(review): presumably the id of the word in the dictionary - confirm against NLPIR.h.
        ('word_ID', c_int),

        # Whether the word was found in the user's dictionary.
        ('word_type', c_int),

        # Weight of the detected word.
        ('weight', c_int),
    ]


class ICTCLAS(NLPIRBase):
    """
    A dynamic link library native class for Chinese Segmentation.

    Every method resolves one exported ``NLPIR_*`` function through :func:`get_func`, declares its
    ctypes argument/return types and calls it. Methods are wrapped with
    ``NLPIRBase.byte_str_transform`` (defined in the base class; presumably it converts between
    ``str`` and ``bytes`` for the native call - see :class:`NLPIRBase`).
    """
    POS_MAP_NUMBER = 4  # add by qp 2008.11.25
    ICT_POS_MAP_FIRST = 1  # ICT (Institute of Computing Technology) first-level POS tag set
    ICT_POS_MAP_SECOND = 0  # ICT (Institute of Computing Technology) second-level POS tag set
    PKU_POS_MAP_SECOND = 2  # PKU (Peking University) second-level POS tag set
    PKU_POS_MAP_FIRST = 3  # PKU (Peking University) first-level POS tag set
    POS_SIZE = 40

    @property
    def dll_name(self) -> str:
        """Base name of the native library this class binds to."""
        return "NLPIR"

    @NLPIRBase.byte_str_transform
    def init_lib(self, data_path: str, encode: int, license_code: str) -> int:
        """
        Call **NLPIR_Init**

        :param str data_path: passed as the first ``char *`` argument of the native function
        :param int encode: passed as the ``int`` encoding argument of the native function
        :param str license_code: passed as the last ``char *`` argument of the native function
        :return: 1 success 0 fail
        """
        return self.get_func('NLPIR_Init', [c_char_p, c_int, c_char_p], c_int)(data_path, encode, license_code)

    def exit_lib(self) -> bool:
        """
        Call **NLPIR_Exit**

        :return: exit success or not
        """
        return self.get_func('NLPIR_Exit', restype=c_bool)()

    @NLPIRBase.byte_str_transform
    def paragraph_process(self, paragraph: str, pos_tagged: int = 1) -> str:
        """
        Call **NLPIR_ParagraphProcess**

        Chinese word segment, segment paragraph to a string

        :param str paragraph: the string want to be segmented
        :param int pos_tagged: show the pos tag or not 1-> True, 0-> False
        :return: segmented string
        """
        return self.get_func('NLPIR_ParagraphProcess', [c_char_p, c_int], c_char_p)(paragraph, pos_tagged)

    @NLPIRBase.byte_str_transform
    def paragraph_process_a(self, paragraph: str, user_dict: bool = True) -> typing.Tuple[ResultT, int]:
        """
        Call **NLPIR_ParagraphProcessA**

        Segment paragraph to an Array of ResultT, get more detail info.
        A warning is logged on every call because :func:`paragraph_process` is the recommended API.

        :param str paragraph: the string want to be segmented
        :param bool user_dict: use user dictionary or not
        :return: a result of segment: the value returned by the native call (declared as a pointer
            to :class:`ResultT`, i.e. an array of ResultT) and the number of entries the library
            wrote into the count out-parameter
        """
        self.logger.warning("not recommended, use paragraph_process instead")
        # The native function reports the number of results through an int out-parameter.
        result_count = c_int()
        result = self.get_func('NLPIR_ParagraphProcessA', [c_char_p, POINTER(c_int), c_bool], POINTER(ResultT))(
            paragraph,
            byref(result_count),
            user_dict
        )
        return result, result_count.value

    @NLPIRBase.byte_str_transform
    def get_paragraph_process_a_word_count(self, paragraph: str) -> int:
        """
        Not implemented on purpose, use :func:`paragraph_process` instead.

        :param str paragraph: unused
        :raises NotImplementedError: always
        """
        raise NotImplementedError("Not recommended, use paragraph_process")

    @NLPIRBase.byte_str_transform
    def paragraph_process_aw(self, count: int, result: ResultT) -> None:
        """
        Not implemented on purpose, use :func:`paragraph_process` instead.

        :param int count: unused
        :param ResultT result: unused
        :raises NotImplementedError: always
        """
        raise NotImplementedError("Not recommended, use paragraph_process")

    @NLPIRBase.byte_str_transform
    def file_process(self, source_filename: str, result_filename: str, pos_tagged: int = 1) -> float:
        """
        Call **NLPIR_FileProcess**

        Segment a text file and save to a file.

        :param str source_filename: the path of a text file that want to be segmented
        :param str result_filename: the path to save the result of segmentation
        :param int pos_tagged: show the pos tag or not 1->True, 0->False
        :return: the ``double`` returned by the native function
            (NOTE(review): its meaning is not documented here - confirm against NLPIR.h)
        """
        return self.get_func('NLPIR_FileProcess', [c_char_p, c_char_p, c_int], c_double)(
            source_filename,
            result_filename,
            pos_tagged
        )

    @NLPIRBase.byte_str_transform
    def import_user_dict(self, filename: str, overwrite: bool = False) -> int:
        """
        Call **NLPIR_ImportUserDict**

        Import a user dict to the system, the format of the dict file::

            word1 pos_tag
            word2 pos_tag

        If you import a user dict to the system, the user dict will save to the system (in Data directory).
        You cannot delete the word in the user dict from the system use :func:`clean_user_word` or :func:`del_usr_word`.

        **TODO** add more comment for clean the user dict and add the function to the high-level method

        :param str filename: the path of user dict file
        :param bool overwrite: overwrite the current user dict or not
        :return: the unsigned int returned by the native function. Earlier documentation said
            ``1->True 2->False`` (NOTE(review): the upstream header appears to describe it as the
            number of imported entries - confirm)
        """
        return self.get_func('NLPIR_ImportUserDict', [c_char_p, c_bool], c_uint)(filename, overwrite)

    @NLPIRBase.byte_str_transform
    def add_user_word(self, word: str) -> int:
        """
        Call **NLPIR_AddUserWord**

        Add a word to the user dictionary, example::

            word pos_tag

        or::

            word (the pos tag defaults to n)

        The added word only add in memory and will not affect the user dict, you can use :func:`clean_user_word` or
        :func:`del_usr_word` to delete the word or all the words in memory. If you want to save to the user dict ,use
        :func:`save_the_usr_dic` to save to the *Data* directory.

        :param str word: the word, optionally followed by its pos tag
        :return: 1,true ; 0,false
        """
        return self.get_func('NLPIR_AddUserWord', [c_char_p], c_int)(word)

    @NLPIRBase.byte_str_transform
    def clean_user_word(self) -> int:
        """
        Call **NLPIR_CleanUserWord**

        Clean all temporary added user words, more info see :func:`add_user_word`

        TODO figure out the return value

        :return: 1,true ; 0,false
        """
        return self.get_func('NLPIR_CleanUserWord', None, c_int)()

    @NLPIRBase.byte_str_transform
    def clean_current_user_word(self) -> int:
        """
        Call **NLPIR_CleanCurrentUserWord**

        Clean all Current temporary added user words and restore previous stored data

        **Now Only for win and linux x64**

        :return: 1,true; 2,false (NOTE(review): ``2`` may be a typo for ``0`` - confirm)
        """
        return self.get_func('NLPIR_CleanCurrentUserWord', None, c_int)()

    @NLPIRBase.byte_str_transform
    def save_the_usr_dic(self) -> int:
        """
        Call **NLPIR_SaveTheUsrDic**

        Save in-memory dict to user dict, more info see :func:`add_user_word`

        :return: 1,true; 2,false (NOTE(review): ``2`` may be a typo for ``0`` - confirm)
        """
        return self.get_func('NLPIR_SaveTheUsrDic', None, c_int)()

    @NLPIRBase.byte_str_transform
    def del_usr_word(self, word: str) -> int:
        """
        Call **NLPIR_DelUsrWord**

        Delete a word from the user dictionary, more info see :func:`add_user_word`

        :param str word: the word to be delete
        :return: -1, the word not exist in the user dictionary; else, the handle of the word deleted
        """
        return self.get_func('NLPIR_DelUsrWord', [c_char_p], c_int)(word)

    @NLPIRBase.byte_str_transform
    def get_uni_prob(self, word: str) -> float:
        """
        Call **NLPIR_GetUniProb**

        Get Unigram Probability

        :param str word: input word
        :return: The unitary probability of a word.
        """
        return self.get_func("NLPIR_GetUniProb", [c_char_p], c_double)(word)

    @NLPIRBase.byte_str_transform
    def is_word(self, word: str) -> int:
        """
        Call **NLPIR_IsWord**

        Judge whether the word is included in the core dictionary

        :param str word: input word
        :return: 1: exists; 0: no exists
        """
        return self.get_func("NLPIR_IsWord", [c_char_p], c_int)(word)

    @NLPIRBase.byte_str_transform
    def is_user_word(self, word: str, is_ascii: bool = False) -> int:
        """
        Call **NLPIR_IsUserWord**

        Judge whether the word is included in the user-defined dictionary

        :param str word: input word
        :param bool is_ascii: is ascii encode or not
        :return: 1: exists; 0: no exists
        """
        # Two arguments are passed, so both must be declared; otherwise ``is_ascii`` is not
        # converted as a C ``bool``.
        return self.get_func("NLPIR_IsUserWord", [c_char_p, c_bool], c_int)(word, is_ascii)

    @NLPIRBase.byte_str_transform
    def get_word_pos(self, word: str) -> str:
        """
        Call **NLPIR_GetWordPOS**

        Get the word Part-Of-Speech information

        :param str word: input word
        :return: pos tagging
        """
        return self.get_func("NLPIR_GetWordPOS", [c_char_p], c_char_p)(word)

    def set_pos_map(self, pos_map: int) -> int:
        """
        Call **NLPIR_SetPOSmap**

        Select which pos map will use:

        - :attr:`ICT_POS_MAP_FIRST`   ICT first-level POS tag set
        - :attr:`ICT_POS_MAP_SECOND`  ICT second-level POS tag set
        - :attr:`PKU_POS_MAP_SECOND`  PKU second-level POS tag set
        - :attr:`PKU_POS_MAP_FIRST`   PKU first-level POS tag set

        Default is :attr:`ICT_POS_MAP_SECOND`

        :param int pos_map: one of the ``*_POS_MAP_*`` constants above
        :return: 0, failed; else, success
        """
        # Declare the signature explicitly, like every other wrapper in this class.
        return self.get_func("NLPIR_SetPOSmap", [c_int], c_int)(pos_map)

    @NLPIRBase.byte_str_transform
    def finer_segment(self, line: str) -> str:
        """
        Call **NLPIR_FinerSegment**

        When a segmentation result is too coarse, e.g. "中华人民共和国" (People's Republic of China),
        call this function to split it further into "中华 人民 共和国".

        The finest granularity is three Chinese characters; if the input cannot be split any
        further an empty string is returned.

        :param str line: string need to be segmented
        :return: segmented string, return null string if line cannot be segmented
        """
        return self.get_func("NLPIR_FinerSegment", [c_char_p], c_char_p)(line)

    @NLPIRBase.byte_str_transform
    def get_eng_word_origin(self, word: str) -> str:
        """
        Call **NLPIR_GetEngWordOrign**

        Get the base form of an English word, taking past participles, singular/plural forms
        and similar inflections into account::

            driven->drive   drives->drive   drove-->drive

        :param str word: word to be stemmed
        :return: the stemmed word
        """
        return self.get_func("NLPIR_GetEngWordOrign", [c_char_p], c_char_p)(word)

    @NLPIRBase.byte_str_transform
    def word_freq_stat(self, text: str, stop_word_remove: bool = True) -> str:
        """
        Call **NLPIR_WordFreqStat**

        Get the word / part-of-speech / frequency statistics of the input text, sorted by
        word frequency.

        :param str text: the input text
        :param bool stop_word_remove: True - remove stop words, False - keep stop words
        :return: the word frequency statistics, in the following form

        ::

            张华平/nr/10#博士/n/9#分词/n/8
        """
        return self.get_func("NLPIR_WordFreqStat", [c_char_p, c_bool], c_char_p)(text, stop_word_remove)

    @NLPIRBase.byte_str_transform
    def file_word_freq_stat(self, filename: str, stop_word_remove: bool = True) -> str:
        """
        Call **NLPIR_FileWordFreqStat**

        Same as :func:`word_freq_stat`

        :param str filename: path of text file
        :param bool stop_word_remove: remove stop words or not
        :return: same as :func:`word_freq_stat`
        """
        return self.get_func("NLPIR_FileWordFreqStat", [c_char_p, c_bool], c_char_p)(filename, stop_word_remove)

    @NLPIRBase.byte_str_transform
    def get_last_error_msg(self) -> str:
        """
        Call **NLPIR_GetLastErrorMsg**

        :return: error message
        """
        return self.get_func("NLPIR_GetLastErrorMsg", None, c_char_p)()

    @NLPIRBase.byte_str_transform
    def tokenizer_for_ir(self, sline: str, pos_tagged: bool = False) -> str:
        """
        Call **NLPIR_Tokenizer4IR**

        Search-engine mode: on top of the precise segmentation, long words are segmented again
        to improve recall. Suited to tokenizing text for search engines.

        :param str sline: The source paragraph
        :param bool pos_tagged: value for the second (``bool``) argument of the native function
            (NOTE(review): presumably toggles POS tags in the output - confirm against NLPIR.h)
        :return: the string returned by the native function
        """
        # The native signature is declared with two arguments, so both must be supplied;
        # ctypes raises TypeError when fewer arguments than ``argtypes`` are passed.
        return self.get_func("NLPIR_Tokenizer4IR", [c_char_p, c_bool], c_char_p)(sline, pos_tagged)