#ifdef linux classCNLPIR { #else class__declspec(dllexport) CNLPIR { #endif
3)编译代码
python setup.py build_ext --inplace
4、编写 PyNLPIR.py
为了方便使用,对生成的 NLPIR.py 进行重新封装。
#!/usr/bin/env python #coding:utf-8 ''' Created on Fri 19, 2013 @author: killua @e-mail: [email protected] @Decription: Python for NLPIR ''' import NLPIR import os
def nlpir_init(init_dir='.', code_type='GBK'):
    '''
    Init the analyzer and prepare necessary data for NLPIR according to the
    configure file.

    init_dir  -- passed unchanged as the first argument of NLPIR.NLPIR_Init.
    code_type -- one of 'GBK', 'UTF-8', 'UTF8', 'BIG5' or 'GBK_FANTI'; selects
                 which NLPIR.*_CODE constant is passed to NLPIR.NLPIR_Init.

    Prints 'NLPIR Successful.' or 'NLPIR Failed.' depending on the truthiness
    of the value returned by NLPIR.NLPIR_Init, and returns that value.

    Raises ValueError when code_type is not one of the supported names.
    '''
    # Maps every accepted spelling to the name of the NLPIR constant to use.
    code_constants = {
        'GBK': 'GBK_CODE',
        'UTF-8': 'UTF8_CODE',
        'UTF8': 'UTF8_CODE',
        'BIG5': 'BIG5_CODE',
        'GBK_FANTI': 'GBK_FANTI_CODE',
    }
    # Fail with a clear message instead of hitting an unbound local below.
    if code_type not in code_constants:
        raise ValueError('Unsupported code_type: %r (expected one of %s)'
                         % (code_type, ', '.join(sorted(code_constants))))

    is_succeed = NLPIR.NLPIR_Init(init_dir,
                                  getattr(NLPIR, code_constants[code_type]))
    # A parenthesised single argument prints the same on Python 2 and 3.
    if is_succeed:
        print('NLPIR Successful.')
    else:
        print('NLPIR Failed.')
    return is_succeed
def nlpir_exit():
    '''
    Exit the program, free all resources and destroy all working buffers
    used in NLPIR.

    Returns whatever NLPIR.NLPIR_Exit() returns.
    '''
    return NLPIR.NLPIR_Exit()
def nlpir_import_user_dict(user_dict):
    '''
    Import a user-defined dictionary from a text file.

    user_dict -- passed unchanged to NLPIR.NLPIR_ImportUserDict
                 (presumably the path of the text file; confirm against the
                 NLPIR documentation).

    Returns whatever NLPIR.NLPIR_ImportUserDict returns.
    '''
    return NLPIR.NLPIR_ImportUserDict(user_dict)
def nlpir_paragraph_process(text, is_pos_tagged=False):
    '''
    Process a paragraph.

    text          -- the paragraph, passed unchanged to
                     NLPIR.NLPIR_ParagraphProcess.
    is_pos_tagged -- forwarded unchanged as the second argument (presumably
                     toggles part-of-speech tags in the result; confirm
                     against the NLPIR documentation).

    Returns whatever NLPIR.NLPIR_ParagraphProcess returns.
    '''
    return NLPIR.NLPIR_ParagraphProcess(text, is_pos_tagged)
def nlpir_file_process(source_file, target_file, is_pos_tagged=False):
    '''
    Process a text file.

    source_file, target_file and is_pos_tagged are forwarded, in that order,
    to NLPIR.NLPIR_FileProcess (presumably input path, output path and a
    part-of-speech tagging switch; confirm against the NLPIR documentation).

    Returns whatever NLPIR.NLPIR_FileProcess returns.
    '''
    return NLPIR.NLPIR_FileProcess(source_file, target_file, is_pos_tagged)
def nlpir_add_user_word(word):
    '''
    Add a word to the user dictionary.

    word -- passed unchanged to NLPIR.NLPIR_AddUserWord.

    Returns whatever NLPIR.NLPIR_AddUserWord returns.
    '''
    return NLPIR.NLPIR_AddUserWord(word)
def nlpir_save_user_dict():
    '''
    Save the user dictionary to disk.

    Returns whatever NLPIR.NLPIR_SaveTheUsrDic() returns.
    '''
    return NLPIR.NLPIR_SaveTheUsrDic()
def nlpir_delete_user_word(word):
    '''
    Delete a word from the user dictionary.

    word -- passed unchanged to NLPIR.NLPIR_DelUsrWord.

    Returns whatever NLPIR.NLPIR_DelUsrWord returns.
    '''
    return NLPIR.NLPIR_DelUsrWord(word)
P(w|c)的含义是,在试图拼写 c 的情况下,出现拼写错误 w
的概率。这需要统计数据的支持,但是为了简化问题,我们假设两个单词在字形上越接近,就越有可能拼错,P(w|c)就越大。举例来说,相差一个字母的拼法,就比相差两个字母的拼法,发生概率更高。你想拼写单词
hello,那么错误拼成 hallo(相差一个字母)的可能性,就比拼成 haallo
(相差两个字母)的可能性高。
def read_words():
    '''
    Read bag of words from file.

    Every non-blank line of the file 'wiktionary' (opened relative to the
    current working directory) is split on ':'. The first field becomes a
    key of the module-level DICTIONARY and the second field, converted with
    float(), becomes its value.

    Raises IndexError for a non-blank line without ':' and ValueError when
    the second field is not a number.
    '''
    # 'with' guarantees the file is closed even if a line fails to parse.
    with open('wiktionary', 'r') as word_file:
        for line in word_file:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            fields = line.split(':')
            DICTIONARY[fields[0]] = float(fields[1])
def generate_edit_distance1_words(word):
    '''
    Generate a set of all words that are one edit distance from word.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.

    word -- the string to edit; may be empty or a single character.

    Returns a set of strings. Replacing a character with itself is one of
    the generated edits, so a non-empty word is part of its own result.
    '''
    # Local import keeps this block self-contained; ascii_lowercase exists on
    # both Python 2 and 3 and, unlike string.lowercase, ignores the locale.
    import string
    letters = string.ascii_lowercase

    # Every way of cutting word in two; each edit is applied at the cut.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    # delete a character of word
    deletes = [head + tail[1:] for head, tail in splits if tail]
    # change position between two adjacent characters of word
    transposes = [head + tail[1] + tail[0] + tail[2:]
                  for head, tail in splits if len(tail) > 1]
    # replace one character
    replaces = [head + c + tail[1:]
                for head, tail in splits if tail for c in letters]
    # insert one character
    inserts = [head + c + tail for head, tail in splits for c in letters]

    return set(deletes + transposes + replaces + inserts)
def words_filter(words):
    '''
    Word filter.

    words -- an iterable of strings.

    Returns the set of those words that are keys of the module-level
    DICTIONARY.
    '''
    # Test membership on the dict itself: on Python 2, .keys() builds a list,
    # which makes every lookup a linear scan.
    return set(word for word in words if word in DICTIONARY)
def candidates(word):
    '''
    Get all candidates for word.

    Returns a set containing only word when word is a key of DICTIONARY.
    Otherwise returns the words one edit away from word that are keys of
    DICTIONARY, which may be an empty set.
    '''
    if word in DICTIONARY:
        return set([word])
    # word is known to be absent from DICTIONARY here, so filtering [word]
    # itself can only give an empty set; only the edits need filtering.
    return words_filter(generate_edit_distance1_words(word))
def correct(word):
    '''
    Correct the word.

    Returns the candidate (see candidates()) with the largest value in
    DICTIONARY. When there are no candidates, word is returned unchanged
    instead of letting max() fail on an empty sequence.
    '''
    candidate_words = candidates(word)
    if not candidate_words:
        return word
    return max(candidate_words, key=lambda item: DICTIONARY[item])
def read_words():
    '''
    Read bag of words from file.

    Every non-blank line of the file 'wiktionary' (opened relative to the
    current working directory) is split on ':'. The first field becomes a
    key of the module-level DICTIONARY and the second field, converted with
    float(), becomes its value.

    Raises IndexError for a non-blank line without ':' and ValueError when
    the second field is not a number.
    '''
    # 'with' guarantees the file is closed even if a line fails to parse.
    with open('wiktionary', 'r') as word_file:
        for line in word_file:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            fields = line.split(':')
            DICTIONARY[fields[0]] = float(fields[1])
def generate_edit_distance1_words(word):
    '''
    Generate a set of all words that are one edit distance from word.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.

    word -- the string to edit; may be empty or a single character.

    Returns a set of strings. Replacing a character with itself is one of
    the generated edits, so a non-empty word is part of its own result.
    '''
    # Local import keeps this block self-contained; ascii_lowercase exists on
    # both Python 2 and 3 and, unlike string.lowercase, ignores the locale.
    import string
    letters = string.ascii_lowercase

    # Every way of cutting word in two; each edit is applied at the cut.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    # delete a character of word
    deletes = [head + tail[1:] for head, tail in splits if tail]
    # change position between two adjacent characters of word
    transposes = [head + tail[1] + tail[0] + tail[2:]
                  for head, tail in splits if len(tail) > 1]
    # replace one character
    replaces = [head + c + tail[1:]
                for head, tail in splits if tail for c in letters]
    # insert one character
    inserts = [head + c + tail for head, tail in splits for c in letters]

    return set(deletes + transposes + replaces + inserts)
def words_filter(words):
    '''
    Word filter.

    words -- an iterable of strings.

    Returns the set of those words that are keys of the module-level
    DICTIONARY.
    '''
    # Test membership on the dict itself: on Python 2, .keys() builds a list,
    # which makes every lookup a linear scan.
    return set(word for word in words if word in DICTIONARY)
def candidates(word):
    '''
    Get all candidates for word.

    Returns a set containing only word when word is a key of DICTIONARY.
    Otherwise returns the words one edit away from word that are keys of
    DICTIONARY, which may be an empty set.
    '''
    if word in DICTIONARY:
        return set([word])
    # word is known to be absent from DICTIONARY here, so filtering [word]
    # itself can only give an empty set; only the edits need filtering.
    return words_filter(generate_edit_distance1_words(word))
def correct(word):
    '''
    Correct the word.

    Returns the candidate (see candidates()) with the largest value in
    DICTIONARY. When there are no candidates, word is returned unchanged
    instead of letting max() fail on an empty sequence.
    '''
    candidate_words = candidates(word)
    if not candidate_words:
        return word
    return max(candidate_words, key=lambda item: DICTIONARY[item])
if __name__ == '__main__':
    # Script entry point: load the word list, then print the correction of
    # the word given as the first command-line argument.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError on sys.argv[1].
        sys.exit('usage: python %s <word>' % sys.argv[0])
    read_words()
    # One pre-formatted argument prints identically on Python 2 and 3.
    print('%s -> %s' % (sys.argv[1], correct(sys.argv[1])))
4)Java 堆(Java Heap)
JVM 中最大的一块区域。Java Heap 被所有线程共享,在 JVM
启动时创建。
在 JVM 规范中的描述如下:The heap is the runtime data area from
which memory for all class instances and arrays is allocated.