# Imports. The original lines were missing the spaces between the
# `import` / `from` keywords and the module names, which is a SyntaxError.
# Grouped stdlib first, then third-party (nltk), one statement per line.
import os
import re
from collections import defaultdict

import nltk
from nltk.corpus import brown, state_union, treebank, words
# Lists vs. tuples (列表与元组)
# words = ['I', 'turned', 'off', 'the', 'spectroroute','the']
# words2=('I', 'turned', 'off', 'the', 'spectroroute','the','I')
# print (set(words))
# #print(reversed(words))
# print(sorted(words))
# print (set(words2))
# print(reversed(words2))
# print(sorted(words2))
#NOUN 名词
# brown_news_tagged=brown.tagged_words(categories='news',tagset='universal')
# word_tag_pairs=nltk.bigrams(brown_news_tagged)
# noun_proceders = [a[1]for(a,b)in word_tag_pairs if b[1]=='NOUN']
# fdist=nltk.FreqDist(noun_proceders)
# common_proceders=[tag for (tag,value) in fdist.most_common()]
# print(common_proceders)  # the high-frequency word classes that precede nouns
#Verb 动词
# Find verbs whose past-participle (VBN) and past-tense (VBD) forms are identical
# wsj=treebank.tagged_words()
# cfd1=nltk.ConditionalFreqDist(wsj)
# vl=[w for w in cfd1.conditions()if 'VBN' in cfd1[w] and 'VBD' in cfd1[w]]
# print(vl)
# Locate the position of a given past participle together with its tag
# cfd2=nltk.ConditionalFreqDist((tag,word)for (word,tag)in wsj)
# vbn_list=list(cfd2['VBN'])
# idx1=wsj.index(('kicked','VBN'))
# print(idx1)
# Get the word immediately preceding each past participle
# for v in vbn_list:
# idx=wsj.index((v, 'VBN'))
# print (wsj[idx-1:idx])
# Equivalent to:
#print([wsj[wsj.index((v, 'VBN'))-1:wsj.index((v, 'VBN'))] for v in vbn_list])
#Ajectives and Adverbs 形容词和副词
# Inverting a dictionary (tag -> words) is a common technique
# def findtags(tag_prefix, tagges_text):
# cfd=nltk.ConditionalFreqDist((tag,word) for (word,tag) in tagges_text
# if tag.startswith(tag_prefix))
#     return dict((tag, cfd[tag].most_common(5)) for tag in cfd.conditions())
#exploring tagged corpora 探索标注的数据库
# brwon_learnd_tagged=brown.tagged_words(categories='learned', tagset='universal')
# tags=[b[1]for(a,b)in nltk.bigrams(brwon_learnd_tagged)if a[0]=='often']
# #print(tags)
# fd=nltk.FreqDist(tags)
# print(fd.tabulate())
# brwon_learnd_tagged=brown.tagged_words(categories='news', tagset='universal')
# cfd=nltk.ConditionalFreqDist((word.lower(),tag)
# for (word,tag) in brwon_learnd_tagged)
# for word in sorted(cfd.conditions()):
# if len(cfd[word])>3:
# tags=[tag for (tag, _) in cfd[word].most_common()]
# #print(cfd[word])
# print(word, tags)
#dictionary 词典:默认词典
# news_words = brown.words(categories='news')
# fd=nltk.FreqDist(news_words)
# v1000=[word for (word, _) in fd.most_common(1000)]
# mapping=defaultdict(lambda: 'UNK')
# for word in v1000:
# mapping[word]=word
# new_word=[mapping[word] for word in news_words]
# print(new_word[:20])
# incrementally updating a Dictionary 词典内容递增
# words = words.words('en')
# last_letters=defaultdict(list)
# for word in words:
#     key=word[-2:]  # group by final two letters: append each word under its key
# last_letters[key].append(word)
# print(last_letters['zy'][:10])
#
# anagrams=defaultdict(list)  # find all words built from the same set of letters
# for word in words:
# key=''.join(sorted(word))
# anagrams[key].append(word)
# The simpler one-liner that NLTK provides for this (nltk.Index)
# anagrams=nltk.Index((''.join(sorted(w)),w)for w in words)
# print(anagrams['abc'])
#invert a dictionary 反置词典 便于查找
# pos={'cats':'N','name':'N','old':'ADJ','young':'ADJ','run':'V', 'sing':'V'}
# #pos2=dict((value,key)for (key,value)in pos.items())
# pos2=nltk.Index((value,key)for (key,value)in pos.items())
# print(pos2['N'])
#Automatic Tagging 自动标注: 用100个高频词汇的高频tag做tagger
#The Lookup Tagger 查找tagger
# brown_tagged_sents=brown.tagged_sents(categories='news')
# fd=nltk.FreqDist(brown.words(categories='news'))
# cfd=nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_freq_words=fd.most_common(100)
# likely_tags=dict((word, cfd[word].max())for (word,_)in most_freq_words)
# baseline_tagger=nltk.UnigramTagger(model=likely_tags)
# print(cfd['news'].max())
# print(cfd['news'].tabulate())
# print(baseline_tagger.evaluate(brown_tagged_sents))
#N-Gram Tagging 多级标注
# Train a unigram tagger on 90% of the Brown "news" sentences and hold
# out the remaining 10% for evaluation.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
# NOTE: the original line had bare (non-comment) text after the statement,
# which was a SyntaxError; the note is now a proper comment.
train_sents = brown_tagged_sents[:size]   # split the data: first 90% for training
# print(train_sents[3])
test_sents = brown_tagged_sents[size:]    # last 10% held out for testing
#
unigram_tagger = nltk.UnigramTagger(train_sents)
# ContextTagger.size() reports how many entries are in the tagger's
# context -> tag table.
print(unigram_tagger.size())
# print(unigram_tagger.tag(brown_sents[3]))
#
# print(bigram_tagger.evaluate(test_sents))
#combination
# t0=nltk.DefaultTagger('NN')
# t1=nltk.UnigramTagger(train_sents, backoff=t0)
# t2=nltk.BigramTagger(train_sents, cutoff=2, backoff=t1)
#print(t2.evaluate(test_sents))
# test_tags = [tag for sent in brown.sents(categories='editorial')
# for (word, tag) in t2.tag(sent)]
# gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
# print(nltk.ConfusionMatrix(gold_tags, test_tags))
# cfd=nltk.ConditionalFreqDist(
# ((x[1],y[0]),y[1])
# for sent in brown_tagged_sents
# for x,y in nltk.bigrams(sent))
#
# ambigous_context=[c for c in cfd.conditions() if len(cfd[c])>1]
# print(sum(cfd[c].N()for c in ambigous_context)/cfd.N())