Installation

Python version

Python 2.7 or 3.4+ is required.

Install using pip

pip install -U nltk

Install NLTK data

import nltk
nltk.download()

# load the Brown Corpus
from nltk.corpus import brown
brown.words()

If the data cannot be found after downloading, set the NLTK_DATA environment variable to the directory that contains the data, or add that directory to NLTK's search path.
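
A minimal sketch of the second option (the path below is only a placeholder; use wherever you actually put the data):

import nltk

# hypothetical location of the downloaded data; replace with your own directory
nltk.data.path.append('/usr/local/share/nltk_data')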

The Text object

from nltk.book import *

# print the contexts in which the given word appears in the text
text1.concordance('bomb')

# print other words that appear in the same contexts as the given word
text1.similar('bomb')

# accept a list of words and print the contexts shared by those words
text1.common_contexts(['context', 'gamesome'])

# draw a dispersion plot showing where each word occurs in the text
text4.dispersion_plot(['freedom', 'America'])

# return the number of times a word occurs in the text
text1.count('photobomb')

# print bigram collocations that occur frequently in the text
text1.collocations()

FreqDist object

import nltk
from nltk.book import *

"""
FreqDist inherits from dict; its keys are words and its values are counts.
The FreqDist constructor accepts any list.
"""
fdist1 = FreqDist(text1)

# plot the high-frequency words
fdist1.plot(10)

# print the 15 most frequent words as a table
fdist1.tabulate(15)

# return the 15 most frequent words and their counts as a list
# [(',', 18713), ('the', 13721), ('.', 6862), ('of', 6536), ('and', 6024), ...]
fdist1.most_common(15)

# return the low-frequency items, i.e. words that occur only once (hapaxes)
# ['whalin', 'luxurious', 'footmanism', 'peacefulness', 'incorruptible', ...]
fdist1.hapaxes()

# return the most frequent word
fdist1.max()

# words longer than 7 characters that occur more than 7 times
words = set(text1)
long_words = [w for w in words if len(w) > 7 and fdist1[w] > 7]
print(sorted(long_words))

Chinese word segmentation

The Stanford Chinese word segmenter also supports POS tagging, named entity recognition and parsing. Download the latest Stanford Segmenter JAR package, and also download the SLF4J JAR.

# -*- coding: utf-8 -*-
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar="stanford-segmenter-3.7.0.jar",
    path_to_slf4j="slf4j-simple-1.7.25.jar",
    path_to_sihan_corpora_dict="./data",
    path_to_model="./data/pku.gz",
    path_to_dict="./data/dict-chris6.ser.gz"
)

sentence = u"..."  # a Chinese sentence to segment
print segmenter.segment(sentence)
print segmenter.segment_file("test.simp.utf8")

Corpora

import nltk

# gutenberg, webtext and inaugural are PlaintextCorpusReader instances
from nltk.corpus import gutenberg

# return the file identifiers of the corpus
# ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', ...]
print(gutenberg.fileids())

# accept one or more file identifiers and return the text as a list of words
# ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]
emma = gutenberg.words("austen-emma.txt")

# return the raw string of the text
# '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, ...'
emma_str = gutenberg.raw("austen-emma.txt")

# accept one or more file identifiers and return the text as a list of sentences
emma_sents = gutenberg.sents("austen-emma.txt")
print(emma_sents)

# web text corpus
# ['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', ...]
from nltk.corpus import webtext
print(webtext.fileids())

# Inaugural Address corpus
from nltk.corpus import inaugural
print(inaugural.fileids())

# nps_chat is an NPSChatCorpusReader object
from nltk.corpus import nps_chat
print(nps_chat.fileids())
chat_room = nps_chat.posts('10-19-30s_705posts.xml')
print(chat_room)

# the brown corpus is a CategorizedTaggedCorpusReader instance
from nltk.corpus import brown

# return the category identifiers of the corpus
print(brown.categories())

# return the file identifiers for the given categories
print(brown.fileids(['news', 'lore']))

# return the words of the given file
ca02 = brown.words(fileids='ca02')
print('ca02: ', ca02)

# Reuters corpus
from nltk.corpus import reuters
print(reuters.categories())

Conditional frequency distribution

import nltk
from nltk.corpus import brown

# a conditional frequency distribution (ConditionalFreqDist) processes a list of
# pairs, each of the form (condition, event); here the condition is the genre
# category and the event is the word
pairs = [(genre, word)
         for genre in brown.categories()
         for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(pairs)

# print the conditions (the genre categories)
print(cfd.conditions())

# tabulate / plot the counts of the given modal words in the given genres
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
cfd.plot(conditions=genres, samples=modals)

# generate all bigrams from the given word list
sent = ['I', 'am', 'a', 'good', 'man']
# [('I', 'am'), ('am', 'a'), ('a', 'good'), ('good', 'man')]
print(list(nltk.bigrams(sent)))

# conditional frequency distribution over the bigrams of the news text:
# the condition is the first word of each bigram and the event is the second
text = brown.words(categories='news')
bigrams_words = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams_words)
fd = cfd['can']
fd.plot(10)

Part-of-speech tagging

Part-of-speech tag sets

  • Part-of-speech tag set of the PFR People's Daily annotated corpus
  • Part-of-speech tags from the "Specification for Corpus Processing of Modern Chinese — Word Segmentation and Part-of-Speech Tagging"
  • ICTCLAS 3.0 Chinese part-of-speech tag set
  • HanLP part-of-speech tag set
  • BosonNLP part-of-speech tags
  • Jieba part-of-speech tags
import nltk

# part-of-speech tagger
# tokenize the given sentence into a list of words
words = nltk.word_tokenize('And now for something completely different')
# ['And', 'now', 'for', 'something', 'completely', 'different']
print(words)

# return a list of (word, tag) pairs
word_tag = nltk.pos_tag(words)
# [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
print(word_tag)

# tagged corpora
# brown can be treated as a CategorizedTaggedCorpusReader instance
from nltk.corpus import brown
words_tag = brown.tagged_words(categories='news')
# [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
print(words_tag[:10])

# return the tagged text as a list of sentences
tagged_sents = brown.tagged_sents(categories='news')
print(tagged_sents)

# the sinica_treebank corpus is also part-of-speech tagged
# sinica_treebank can be treated as a SinicaTreebankCorpusReader instance
from nltk.corpus import sinica_treebank
# ['parsed']
print(sinica_treebank.fileids())

# return the words of the given file
words = sinica_treebank.words('parsed')
print(words[:40])

# return the tagged words of the given file
words_tag = sinica_treebank.tagged_words('parsed')
print(words_tag[:40])

# tabulate the 5 most frequent tags
tag_fd = nltk.FreqDist(tag for (word, tag) in words_tag)
tag_fd.tabulate(5)

Create a part-of-speech tagger

import nltk

raw = "You are a good man, but i don't love you!"

# default tagger: tags every word as 'NN'
default_tagger = nltk.DefaultTagger('NN')
tokens = nltk.word_tokenize(raw)
tagged_words = default_tagger.tag(tokens)
print(tagged_words)

from nltk.corpus import brown

# use tagged sentences to evaluate the tagger
tagged_sents = brown.tagged_sents(categories='news')
# 0.13089484257215028
print(default_tagger.evaluate(tagged_sents))

# frequency distribution of the news text; take the 100 most frequent words
fd = nltk.FreqDist(brown.words(categories='news'))
most_common_pairs = fd.most_common(100)
most_common_words = [i[0] for i in most_common_pairs]

# conditional frequency distribution over the (word, tag) pairs of the news text
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

# for each of the 100 most frequent words find its most likely tag, giving a
# (word -> tag) dictionary; UnigramTagger and DefaultTagger both inherit from TaggerI
likely_tags = dict((word, cfd[word].max()) for word in most_common_words)

# build a lookup tagger from the model
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
tagged_sents = brown.tagged_sents(categories='news')
# 0.45578495136941344
print(baseline_tagger.evaluate(tagged_sents))

# words that are not in the model are tagged None
raw = "You are a good man, but i don't love you!"
tokens = nltk.word_tokenize(raw)
# [('You', None), ('are', 'BER'), ('a', 'AT'), ('good', None), ...]
print(baseline_tagger.tag(tokens))

# use the default tagger as a backoff for words the lookup tagger cannot tag
baseline_tagger2 = nltk.UnigramTagger(model=likely_tags,
                                      backoff=nltk.DefaultTagger('NN'))
tagged_sents = brown.tagged_sents(categories='news')
# 0.5817769556656125
print(baseline_tagger2.evaluate(tagged_sents))

# repeat with the 500 most frequent words of the news text
fd = nltk.FreqDist(brown.words(categories='news'))
most_common_pairs = fd.most_common(500)
most_common_words = [i[0] for i in most_common_pairs]
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
likely_tags = dict((word, cfd[word].max()) for word in most_common_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))
tagged_sents = brown.tagged_sents(categories='news')
# 0.6789983491457326
print(baseline_tagger.evaluate(tagged_sents))

Unigram tagger

import nltk
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(categories='news')

# train a unigram tagger and evaluate it on the same data
unigram_tagger = nltk.UnigramTagger(tagged_sents)
print(unigram_tagger.evaluate(tagged_sents))

# separate the training set from the test set: use 90% of the data for training
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sets = tagged_sents[:size]
test_sets = tagged_sents[size:]

unigram_tagger = nltk.UnigramTagger(train_sets)
# 0.9353630649241612
print(unigram_tagger.evaluate(train_sets))
# 0.8115219774743347
print(unigram_tagger.evaluate(test_sets))

Bigram tagger

import nltk
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sets = tagged_sents[:size]
test_sets = tagged_sents[size:]

# train a bigram tagger
bigram_tagger = nltk.BigramTagger(train_sets)
# 0.7890434263872471
print(bigram_tagger.evaluate(train_sets))
# 0.10186384929731884
print(bigram_tagger.evaluate(test_sets))

The bigram tagger looks at a word together with the tag of the word before it. When it encounters a word in a context it never saw during training, it cannot tag it, and it then cannot tag any of the following words either. That is why the bigram tagger's accuracy is so low on the test set.
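
A minimal sketch of this sparse-data problem (the tiny training slice and the example sentence below are for illustration only, not from the original):

import nltk
from nltk.corpus import brown

# deliberately train on only a few sentences so that unseen contexts are common
train_sents = brown.tagged_sents(categories='news')[:100]
bigram_tagger = nltk.BigramTagger(train_sents)

# once an unseen word/context is hit, it and every following word come back as (word, None)
sent = ['The', 'jury', 'praised', 'the', 'photobombing', 'mayor', 'today']
print(bigram_tagger.tag(sent))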

Combining taggers

Combine the taggers as follows:
  • First try to tag each word with the bigram tagger.
  • If the bigram tagger cannot find a tag, fall back to the unigram tagger.
  • If the unigram tagger cannot find a tag either, use the default tagger.

import nltk
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sets = tagged_sents[:size]
test_sets = tagged_sents[size:]

# chain the taggers together through backoff
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train=train_sets, backoff=t0)
t2 = nltk.BigramTagger(train=train_sets, backoff=t1)

print(t2.evaluate(train_sets))
# 0.8459085019435861
print(t2.evaluate(test_sets))

Classify Chinese nicknames by gender

import nltk
import random
from nltk.classify import apply_features
from nltk.corpus import PlaintextCorpusReader

names_corpus = PlaintextCorpusReader('./', ['female.txt', 'male.txt'])
all_names = names_corpus.words()

# frequency distribution of characters; keep the 1000 most common as features
ch_freq = nltk.FreqDist(ch.lower() for name in all_names for ch in name)
ch_freq_most = ch_freq.most_common(1000)
ch_features = [ch for (ch, count) in ch_freq_most]
print(ch_freq_most)

def name_features(name):
    """Feature extractor: which of the common characters the name contains."""
    name_chs = set([ch.lower() for ch in name])
    features = {}
    for ch in ch_features:
        features['contain(%s)' % ch] = (ch in name_chs)
    return features

female_names = [(name, 'female') for name in names_corpus.words('female.txt')]
male_names = [(name, 'male') for name in names_corpus.words('male.txt')]
total_names = female_names + male_names
random.shuffle(total_names)

# use 60% of the data as the training set
train_set_size = int(len(total_names) * 0.6)
train_names = total_names[:train_set_size]
test_names = total_names[train_set_size:]

train_set = apply_features(name_features, train_names, True)
test_set = apply_features(name_features, test_names, True)

# train a naive Bayes classifier and evaluate it
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, train_set))
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(20)

# print the names the classifier gets wrong
for (name, tag) in test_names:
    guess = classifier.classify(name_features(name))
    if guess != tag:
        print(tag, guess, name)

Regular expression chunker

Chunking groups one or more consecutive words into a chunk. It is a basic technique for entity recognition.

Chunking noun phrases

Noun phrase chunking is also called NP-chunking. One of the most useful sources of information for NP-chunking is part-of-speech tags, so we usually perform part-of-speech tagging before chunking.

Symbol   Meaning                Example
S        sentence               the man walked
NP       noun phrase            a dog
VP       verb phrase            saw a park
PP       prepositional phrase   with a telescope
Det      determiner             the
N        noun                   dog
V        verb                   walked
P        preposition            in

Regular expression chunker

Import NLTK # 分词 text = "Lucy let down her long golden hair" sentence = nltk.word_tokenize(text) # pos sentence_tag = Print (sentence_tag) # print(sentence_tag) # print(sentence_tag) # print(sentence_tag) # print(sentence_tag) # print Then with a noun # the second rule to match one or more proper nouns # $symbol is one of the regular expression special characters, must use escape symbol \ to match PP $grammar = r "" "NP: {< DT | PRP \ $>? <JJ>*< NNP>} {<NNP>+} """ # block cp = nltk.regexpparser (grammar) tree = cp.parse(sentence_tag) tree.draw()Copy the code

Add gaps (chinking)

Import NLTK # 分词 text = "The little yellow dog barked at the cat" sentence = nltk.word_tokenize(text) # Sentence_tag = nltk.pos_tag(sentence) print(sentence_tag) # Define gap syntax # first rule matches the entire sentence # second rule matches one or more verbs or prepositions # A pair of}{represents a gap between the syntactically matched words. Grammar = r""" NP: {<. * > +}} < VBD | IN > + {" "" cp = me. RegexpParser (grammar) # block me Tree Tree = cp. Parse (sentence_tag) Tree. The draw ()Copy the code

Evaluate the chunker

The standard way to store chunks in a text file is IOB tagging: I (inside a chunk), O (outside any chunk), B (begin, the first word of a chunk).
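
As a small illustration (the sentence fragment below is an assumed example, not from the original), each token carries a word, a part-of-speech tag and an IOB chunk tag, and conlltags2tree turns such a list back into a Tree:

import nltk

# hypothetical IOB-tagged fragment: (word, POS tag, IOB chunk tag)
conll_like = [
    ('He', 'PRP', 'B-NP'),      # B: first word of an NP chunk
    ('reckons', 'VBZ', 'O'),    # O: outside any NP chunk
    ('the', 'DT', 'B-NP'),
    ('current', 'JJ', 'I-NP'),  # I: inside the chunk started by 'the'
    ('account', 'NN', 'I-NP'),
]
print(nltk.chunk.conlltags2tree(conll_like))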

The CoNLL-2000 corpus

import nltk
from nltk.corpus import conll2000

# load the NP chunks from the training text; the result is a list of Tree objects
test_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])

# tree2conlltags converts a Tree object into a list of IOB tags
tags = nltk.chunk.tree2conlltags(test_sents[0])
print(tags)

Use the CoNLL-2000 corpus to evaluate the chunker

# grammar = r"NP: {<[CDJNP].*>+}" cp = nltk.RegexpParser(Grammar) # test_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"]) print(cp.evaluate(test_sents))Copy the code

Create a chunker using a unigram tagger

import nltk
from nltk.corpus import conll2000

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        """
        Constructor.
        :param train_sents: a list of Tree objects
        """
        train_data = []
        for sent in train_sents:
            # convert each Tree into a list of (word, tag, IOB-tag) triples
            conlltags = nltk.chunk.tree2conlltags(sent)
            # keep the (POS tag, IOB tag) pair for each word
            ti_list = [(t, i) for w, t, i in conlltags]
            train_data.append(ti_list)
        # train a unigram tagger that maps a POS tag to its most likely IOB tag
        self.__tagger = nltk.UnigramTagger(train_data)

    def parse(self, tokens):
        """
        Chunk a tagged sentence.
        :param tokens: a list of (word, POS tag) pairs
        :return: a Tree object
        """
        # tag each POS tag with an IOB tag
        tags = [tag for (word, tag) in tokens]
        ti_list = self.__tagger.tag(tags)
        iob_tags = [iob_tag for (tag, iob_tag) in ti_list]
        # recombine into (word, POS tag, IOB tag) triples and convert to a Tree
        conlltags = [(word, pos, iob_tag)
                     for ((word, pos), iob_tag) in zip(tokens, iob_tags)]
        return nltk.chunk.conlltags2tree(conlltags)

test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])

unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))