使用tf-idf进行前20个关键词提取
import math
import operator


class homework():
    """Homework: preprocess two documents and extract top-20 keywords via TF-IDF."""

    def file(self, n):
        """Preprocess the text file at path *n*.

        Reads the file, strips punctuation/special characters, segments it
        with jieba, removes stopwords, and returns the remaining token list.
        """
        # Local import: jieba is only needed for segmentation, and keeping it
        # here lets the module be imported without the third-party package.
        import jieba
        file_path = n
        # Load the stopword list, one word per line.
        # (Original iterated over .read(), which walks characters, not lines,
        # and also leaked the file handle.)
        with open("E:/nlpproject/baidu_stopwords.txt", encoding='utf-8') as sw:
            stopwords = set(line.strip() for line in sw)
        # Punctuation / special characters to strip before segmentation.
        sign = ['!', '~', '·', '@', '¥', '……', ',', ',', '*', '“', '”', '‘', '’',
                ')', '{', '}', '【', '】', ';', ':', '「', ',', '」', '。', '-',
                '、', '?', '《', '》', ' ', '④', '', ' ', '(', ')', ':', '.']
        with open(file_path, "r", encoding="utf-8") as pf:
            f = pf.read()
        for char in sign:
            f = f.replace(char, "")  # the '' entry is a harmless no-op for str.replace
        words = jieba.lcut(f, cut_all=False, HMM=True)
        # Drop stopwords token by token.
        # (Original tested the whole list against stopwords and appended the
        # list to itself, so filtering never happened.)
        words = [w for w in words if w not in stopwords]
        return words

    def high_frequency(self, word_list1, word_list2):
        """Rank the words of *word_list1* by TF-IDF against a 2-document corpus.

        *word_list2* is the second document used only for document frequency.
        Prints up to the top 20 keywords and returns a dict mapping each word
        of word_list1 to its TF-IDF score (sorted by descending score).
        """
        counts = {}
        for word in word_list1:
            counts[word] = counts.get(word, 0) + 1
        sum_word = len(word_list1)  # total token count of the article
        doc2_words = set(word_list2)  # O(1) membership test for IDF
        TF_IDF = {}
        for word in counts:
            tf = counts[word] / sum_word
            # 2-document corpus: df = 2 if the word also occurs in doc 2
            # (IDF = log(2/2) = 0), else df = 1 (IDF = log(2/1)).
            if word in doc2_words:
                idf = math.log(2 / 2)
            else:
                idf = math.log(2 / 1)
            TF_IDF[word] = tf * idf
        ranked = sorted(TF_IDF.items(), key=operator.itemgetter(1), reverse=True)
        print('该文章的前20个关键词为:')
        # Slice guards against documents with fewer than 20 distinct words
        # (original range(20) indexing raised IndexError in that case).
        for item in ranked[:20]:
            print(item)
        return dict(ranked)


if __name__ == '__main__':
    n1 = 'E:/nlpproject/text1.txt'
    n2 = 'E:/nlpproject/text2.txt'
    b = homework()
    words1 = b.file(n1)
    words2 = b.file(n2)
    print(words1)
    print(words2)
    # Original never invoked the TF-IDF step, so no keywords were ever extracted.
    b.high_frequency(words1, words2)
以上就是本篇文章【自然语言文本处理-提取关键词】的全部内容了,欢迎阅览 ! 文章地址:http://ww.kub2b.com/news/15937.html
栏目首页
相关文章
动态
同类文章
热门文章
网站地图
返回首页 企库往资讯移动站 http://ww.kub2b.com/mobile/ , 查看更多