Spaces:
Sleeping
Sleeping
import re | |
def get_sentences(txt): | |
return txt.split('.') | |
def get_words(txt): | |
only_words_text = re.compile(r'[^0-9^a-z^A-Z\s]').sub('',txt) | |
return only_words_text.split(' ') | |
def get_keywords(word_list , min_ratio=0.001, max_ratio=0.5) : | |
""" this method takes a word list and returns a set of keywords """ | |
assert (min_ratio < 1 and max_ratio < 1) | |
count_dict = {} | |
for word in word_list: | |
count_dict.setdefault(word , 0) | |
count_dict[word] +=1 | |
keywords = set() | |
for word , cnt in count_dict.items(): | |
word_percentage = count_dict[word]* 1.0 / len (word_list) | |
if word_percentage <= max_ratio and word_percentage >=min_ratio: | |
keywords.add(word) | |
return keywords | |
def get_sentence_weight (sentence , keywords): | |
""" this method takes a sentence string and a set of keywords and returns weight of the sentence """ | |
sen_list = sentence.split(' ') | |
window_start = 0; window_end = -1; | |
#calculating window start | |
for i in range(len(sen_list)): | |
if sen_list[i] in keywords: | |
window_start = i | |
break | |
#calculating window end | |
for i in range(len(sen_list) - 1 , 0 , -1) : | |
if sen_list[i] in keywords: | |
window_end = i | |
break | |
if window_start > window_end : | |
return 0 | |
window_size = window_end - window_start + 1 | |
#calculating number of keywords | |
keywords_cnt =0 | |
for w in sen_list : | |
if w in keywords: | |
keywords_cnt +=1 | |
return keywords_cnt*keywords_cnt *1.0 / window_size | |
def summarize(text): | |
txt = text.replace('\n','') | |
word_list = get_words(txt) | |
keywords = get_keywords(word_list , 0.05 , 0.5) | |
sentence_list = get_sentences(txt) | |
sentence_weight = {} | |
for sen in sentence_list: | |
sentence_weight[sen] = get_sentence_weight(sen, keywords) | |
top_sentences = list(sentence_list) # make a copy | |
top_sentences.sort(key=lambda x: sentence_weight[x], reverse=True) # sort by score | |
top_sentences = top_sentences[:int(len(sentence_weight)*0.2)] # get a part | |
top_sentences.sort(key=lambda x: sentence_list.index(x)) # sort by occurrence | |
summary = '. '.join(top_sentences) | |
return summary |