Background

In many cases, analyzing a text properly takes more than simple word segmentation and stopword removal. Besides extracting keywords and new words, we often need extra information at each level of granularity, such as part-of-speech tags. In Python, NLPIR handles this task well. If you do not have NLPIR yet, refer to the earlier chapter on setting up NLPIR quickly, or directly download the Chinese natural language processing package I have prepared (the NLP source set).
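
For orientation, here is a minimal sketch of the core call before the full script below. It assumes the same nlpir wrapper module used throughout this article, whose seg() returns (word, POS-tag) pairs; the sample sentence is made up for illustration.

# -*- coding: utf-8 -*-
# Minimal sketch (Python 2, hedged): POS tagging via the nlpir wrapper
# module used in this article. Assumption: nlpir.seg(text) returns a
# list of (word, pos_tag) pairs, as the full script below relies on.
import nlpir

text = '我爱自然语言处理'  # "I love natural language processing"
for word, tag in nlpir.seg(text):
    print word, '/', tag, '|',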

Code (with my notes)

# -*- coding: utf-8 -*-
# Created: '2017/7/3'
# email: [email protected]
# CSDN: http://blog.csdn.net/fontthrone
import nltk
import sys
import nlpir

sys.path.append("../")
reload(sys)
sys.setdefaultencoding('utf-8')

import jieba
from jieba import posseg


def cutstrpos(txt):
    # segmentation + POS tagging with jieba
    cutstr = posseg.cut(txt)
    result = ""
    for word, flag in cutstr:
        result += word + "/" + flag + ' '
    return result


def cutstring(txt):
    # segmentation only, no POS tags
    cutstr = jieba.cut(txt)
    result = ' '.join(cutstr)
    return result


# read the test text
txtfileobject = open('txt/nltest1.txt')
textstr = ""
try:
    filestr = txtfileobject.read()
finally:
    txtfileobject.close()


# segmentation with NLPIR2016
def ChineseWordsSegmentationByNLPIR2016(text):
    txt = nlpir.seg(text)
    seg_list = []
    for t in txt:
        seg_list.append(t[0].encode('utf-8'))
    return seg_list


stopwords_path = 'stopwords\stopwords1893.txt'  # stopword list


# remove stopwords
def ClearStopWordsWithListByNLPIR2016(seg_list):
    mywordlist = []
    liststr = "/ ".join(seg_list)
    f_stop = open(stopwords_path)
    try:
        f_stop_text = f_stop.read()
        f_stop_text = unicode(f_stop_text, 'utf-8')
    finally:
        f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ''.join(mywordlist)


# print filestr
filestr2 = ClearStopWordsWithListByNLPIR2016(
    ChineseWordsSegmentationByNLPIR2016(filestr)).replace(' ', '')

posstr = cutstrpos(filestr2)

print '**** show is begin ****'
print ''
print 'This is posstr'
print posstr

strtag = [nltk.tag.str2tuple(word) for word in posstr.split()]
# for item in strtag:
#     print item

strsBySeg = nlpir.seg(filestr)
strsBySeg2 = nlpir.seg(filestr2)
strsByParagraphProcess = nlpir.ParagraphProcess(filestr, 1)
strsByParagraphProcessA = nlpir.ParagraphProcessA(
    filestr, ChineseWordsSegmentationByNLPIR2016(filestr)[0], 1)

print ' '
print ' '
print '**** strtag ****'
for word, tag in strtag:
    print word, "/", tag, "|",

print ' '
print ' '
print '**** strsBySeg ****'
for word, tag in strsBySeg:
    print word, "/", tag, "|",

print ' '
print ' '
print '**** strsBySeg2 ****'
for word, tag in strsBySeg2:
    print word, "/", tag, "|",

print ' '
print ' '
print '**** strsByParagraphProcess ****'
print strsByParagraphProcess

# print ' '
# print ' '
# print '**** strsByParagraphProcessA ****'
# for item in strsByParagraphProcessA:
#     print item,

print ' '
print ' '
print '**** show is end ****'
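
One detail worth calling out: the script round-trips through strings, using nltk.tag.str2tuple to turn each 'word/tag' token produced by cutstrpos back into a (word, tag) tuple. str2tuple splits on the last separator and upper-cases the tag; a quick illustration:

# Quick illustration of nltk.tag.str2tuple, which the script above uses
# to parse the 'word/tag' strings produced by cutstrpos.
from nltk.tag import str2tuple

print str2tuple('fly/NN')   # ('fly', 'NN')
print str2tuple('fly/nn')   # ('fly', 'NN') - the tag is upper-cased
print str2tuple('no-tag')   # ('no-tag', None) - no separator present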

Practical example

NLPIR automatically classifies and annotates people's names (the nr tag), which lets us retrieve custom new words or sentences related to certain types of people. Here is some test code I wrote a while back while working on a project demo.

# -*- coding: utf-8 -*-
# Created: '2017/7/11'
# email: [email protected]
# CSDN: http://blog.csdn.net/fontthrone
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import jieba
from nlpir import *
from wordcloud import WordCloud, ImageColorGenerator
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

d = path.dirname(__file__)

# sample sentence; reconstructed from the output below, which shows that
# it contains the name 钟世镇院士 (Academician Zhong Shizhen)
text = '做好钟世镇院士接待的准备工作'


def ShowByItem(List):
    # print a list item by item
    print '********* show ', str(List), ' end *********'
    for item in List:
        print item,
    print
    print '********* show ', str(List), ' end *********'


# use NLPIR2016 to extract person names
def FindAcademicianNameByNLPIR2016(text, isAddYuanShi):
    txt = seg(text)
    seg_list = []
    for i in range(len(txt) - 1):
        # 'nr' is NLPIR's person-name tag; keep the name only when the
        # next token is the literal 院士 ("academician")
        if txt[i][1] == 'nr' and txt[i + 1][0] == '院士':
            if isAddYuanShi == 1:
                seg_list.append(txt[i][0].encode('utf-8') + '院士')
            else:
                seg_list.append(txt[i][0].encode('utf-8'))
    return seg_list


str2 = FindAcademicianNameByNLPIR2016(text, 1)
ShowByItem(str2)

Output:

********* show ['\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab'] end *********
钟世镇院士
********* show ['\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab'] end *********
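
The heart of FindAcademicianNameByNLPIR2016 is a scan over adjacent (word, tag) pairs: a word tagged nr immediately followed by the literal token 院士. Here is a self-contained sketch of just that logic with a hand-built token list, so it can be sanity-checked without NLPIR installed (the token list stands in for nlpir.seg output and is made up for illustration):

# -*- coding: utf-8 -*-
# Self-contained sketch (Python 2) of the adjacent-pair scan inside
# FindAcademicianNameByNLPIR2016. The tagged list below is hand-built
# for illustration, standing in for the output of nlpir.seg(text).
def find_academicians(tagged, add_yuanshi=True):
    names = []
    for i in range(len(tagged) - 1):   # -1: we peek one token ahead
        word, tag = tagged[i]
        if tag == 'nr' and tagged[i + 1][0] == u'院士':
            names.append(word + u'院士' if add_yuanshi else word)
    return names

tagged = [(u'做好', 'v'), (u'钟世镇', 'nr'), (u'院士', 'n'), (u'接待', 'v')]
for name in find_academicians(tagged):
    print name.encode('utf-8')         # -> 钟世镇院士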

Used in the demo

# excerpt from the demo; assumes the imports from the scripts above plus:
import pandas as pd


# use NLPIR2016 to extract person names
def FindAcademicianNameByNLPIR2016(text, isAddYuanShi):
    txt = seg(text)
    seg_list = []
    for i in range(len(txt) - 1):
        if txt[i][1] == 'nr' and txt[i + 1][0] == '院士':
            if isAddYuanShi == 1:
                seg_list.append(txt[i][0].encode('utf-8') + '院士')
            else:
                seg_list.append(txt[i][0].encode('utf-8'))
    return seg_list


# fullContent is the corpus gathered elsewhere in the project
strAcademicianName = FindAcademicianNameByNLPIR2016(fullContent, 1)
# deduplicate
strAcademicianName = list(set(strAcademicianName))

# store with pandas
dfAcademicianName = pd.DataFrame(strAcademicianName)
dfAcademicianName.columns = ['AcademicianName']
dfAcademicianName.to_csv('csv/dfAcademicianName')

# read back with pandas
dfNewWords = pd.read_csv("csv/dfNewWords")
dfAcademicianName = pd.read_csv("csv/dfAcademicianName")

# you can also add the names to the user dictionary as new words
# add_word(dfAcademicianName['AcademicianName'])


# collect the sentences (and their timestamps) that mention an academician
def GetAcademicianCSV(df, strColumn, df1):
    dfAcademicianName = pd.read_csv("csv/dfAcademicianName")
    listAcademicianName = list(dfAcademicianName['AcademicianName'])
    print type(listAcademicianName)
    mywordlistAcademicianName = []
    mywordlisttime = []
    mywordAca = []
    df1 = df1.copy()
    numlen = len(df1.index)
    for i in range(numlen):
        for myword in df1.loc[i, strColumn].split():
            if (myword in listAcademicianName) and len(myword) > 1:
                print myword
                mywordlistAcademicianName.append(df.loc[i, strColumn])
                mywordAca.append(myword)
                mywordlisttime.append(df.loc[i, 'time'])
    return mywordlistAcademicianName, mywordlisttime, mywordAca


mywordlistAcademicianName, mywordlisttime, mywordAca = GetAcademicianCSV(df, 'content', df1)
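
As a usage note: the nested loops in GetAcademicianCSV can usually be replaced with vectorized pandas filtering via Series.str.contains. A hedged sketch, assuming the same 'AcademicianName' CSV produced above and a DataFrame with 'content' and 'time' columns; the sentence file name here is hypothetical:

# -*- coding: utf-8 -*-
# Hedged alternative sketch: vectorized row filtering with pandas in
# place of GetAcademicianCSV's nested loops. Assumes the CSV written
# above and a DataFrame with 'content' and 'time' columns; the file
# name 'csv/dfSentences' is hypothetical.
import re
import pandas as pd

dfAcademicianName = pd.read_csv('csv/dfAcademicianName')
names = [n for n in dfAcademicianName['AcademicianName'] if len(n) > 1]

# one regex matching any academician name; re.escape guards against
# regex metacharacters inside the names
pattern = '|'.join(re.escape(n) for n in names)

df = pd.read_csv('csv/dfSentences')  # hypothetical: one sentence per row
hits = df[df['content'].str.contains(pattern, na=False)]
print hits[['time', 'content']]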

The results are as follows: