# Load the stop-word list once at module level; a set gives O(1) membership tests.
# (Original had a stray ')' after the filename — a SyntaxError.)
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOP_WORDS = set(' '.join(f.readlines()).split())
在互联网上可以很容易地找到每种语言的停用词列表。
该算法的第一步是统计文本中各个单词的出现频率，建立一个词频表。
# Placeholder: replace with the document to be summarized.
text = '...............'
# Count document-wide frequencies of every token that is longer than one
# character and not a stop word.  `word_tokenize` is assumed to come from
# NLTK (imported elsewhere in the file).
word_weights = {}
for word in word_tokenize(text):
    word = word.lower()
    if len(word) > 1 and word not in STOP_WORDS:
        # dict.get replaces the original `in word_weights.keys()` double lookup
        word_weights[word] = word_weights.get(word, 0) + 1
# Score each sentence as the sum of the document-wide frequencies of its
# (lower-cased) words; words absent from `word_weights` (stop words, 1-char
# tokens) contribute nothing.
sentence_weights = {}
for sent in tokenize.sent_tokenize(text):
    for word in word_tokenize(sent):
        word = word.lower()
        # membership on the dict directly — no need for .keys()
        if word in word_weights:
            sentence_weights[sent] = sentence_weights.get(sent, 0) + word_weights[word]
# Scores of the 3 highest-weighted sentences.  This line was missing from the
# original fragment, which used `highest_weights` without defining it
# (NameError); the definition appears only in the later function version.
highest_weights = sorted(sentence_weights.values())[-3:]

# Concatenate every sentence whose score makes the cut (ties at the cut-off
# score may admit extra sentences), in original document order.
summary = ''
for sentence, strength in sentence_weights.items():
    if strength in highest_weights:
        summary += sentence + ' '
summary = summary.replace('_', ' ').strip()
# Read the stop-word file and keep its whitespace-separated tokens as a set
# so later membership checks are O(1).
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOP_WORDS = set(f.read().split())
def summarize(text, no_sentences=3):
    """Return an extractive summary of *text*.

    Scores each sentence by summing the document-wide frequencies of its
    non-stop-word tokens, then concatenates the sentences whose scores fall
    among the top ``no_sentences`` values.  Ties at the cut-off score may
    admit additional sentences.  Relies on NLTK's ``word_tokenize`` /
    ``tokenize.sent_tokenize`` and the module-level ``STOP_WORDS`` set.
    """
    # Document-wide word frequencies, ignoring stop words and 1-char tokens.
    word_weights = {}
    for word in word_tokenize(text):
        word = word.lower()
        if len(word) > 1 and word not in STOP_WORDS:
            # dict.get replaces the original `in ... .keys()` double lookup
            word_weights[word] = word_weights.get(word, 0) + 1

    # Score each sentence by the total frequency of its counted words.
    sentence_weights = {}
    for sent in tokenize.sent_tokenize(text):
        for word in word_tokenize(sent):
            word = word.lower()
            if word in word_weights:
                sentence_weights[sent] = sentence_weights.get(sent, 0) + word_weights[word]

    # The `no_sentences` highest scores; sentences tied at the cut-off also pass.
    highest_weights = sorted(sentence_weights.values())[-no_sentences:]

    # Assemble the summary in original document order.
    summary = ''
    for sentence, strength in sentence_weights.items():
        if strength in highest_weights:
            summary += sentence + ' '
    return summary.replace('_', ' ').strip()