```python
# coding: utf-8
import pandas as pd

df = pd.read_excel('./文本.xlsx')
# print(df.head())
# df['专业关键词']

# Append every cell of the 工作要求 column to a plain-text file,
# space-separated so it can be split back into words later
with open('工作要求.txt', mode='a', encoding='utf-8') as file:
    for text in df['工作要求']:
        if pd.notna(text):  # empty Excel cells arrive as NaN, not None
            file.write(str(text) + ' ')
```
```python
from collections import Counter

import pandas as pd

df = pd.read_excel('./文本.xlsx')
# print(df.head())

# The script above wrote everything onto a single line, so line 0 holds it all
words = []
with open('工作要求.txt', 'r', encoding='utf-8') as f:
    line = f.readlines()
    for word in line[0].split(' '):
        words.append(word)

print(len(words))

counter = Counter(words)
# print(counter)

# For each keyword, print how often it appears in the corpus
for text in df['专业关键词']:
    if text in counter:
        print(text, counter[text])
```
This code works fine for English text, but it has one small issue.
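The issue isn't spelled out at this point in the post, but the most common pitfall with naive `split(' ')` counting is that punctuation and letter case cling to the tokens, so `'Python,'` and `'python'` are counted separately. A minimal normalization sketch, assuming that is the problem (the `normalize` helper is my own, not from the original):

```python
import re
from collections import Counter

def normalize(token: str) -> str:
    # Hypothetical helper: strip non-alphanumeric edges and lowercase
    return re.sub(r'^\W+|\W+$', '', token).lower()

raw = "Python, python and PYTHON."
tokens = [normalize(t) for t in raw.split(' ')]
tokens = [t for t in tokens if t]  # drop empty strings left by double spaces
print(Counter(tokens))             # Counter({'python': 3, 'and': 1})
```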
Finally, here is the Chinese word-segmentation code together with the visualization code, combined into one script; anyone interested can give it a try.
```python
from collections import Counter  # word-frequency counting

import jieba  # Chinese word segmentation
from pyecharts import options as opts
from pyecharts.charts import Bar
from snownlp import SnowNLP  # imported in the original, though unused in this snippet

with open('text_分词后_outputs.txt', 'r', encoding='utf-8') as f:
    read = f.read()

with open('stop_word.txt', 'r', encoding='utf-8') as f:
    stop_word = set(f.read().split('\n'))  # a set of whole words, not one long string

words = []
for i in jieba.cut(read):
    if i not in stop_word:
        words.append(i)

# Collect the 10 most frequent words for plotting
columns = []
data = []
for k, v in Counter(words).most_common(10):
    columns.append(k)
    data.append(v)
```
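The section cuts off before the chart is actually drawn. A minimal sketch of the bar chart the imports point to, using the standard pyecharts `Bar` API; the series name, title, and output file name (`词频.html`) are my own placeholders:

```python
bar = (
    Bar()
    .add_xaxis(columns)
    .add_yaxis('词频', data)
    .set_global_opts(title_opts=opts.TitleOpts(title='Top 10 词频'))
)
bar.render('词频.html')  # writes an interactive HTML chart next to the script
```

Opening the rendered HTML file in a browser shows the ten most frequent words on the x-axis with their counts as bars.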