Ulpan's picture
Upload 10 files
7a96f34
import re
from collections import Counter
def get_sentences(txt):
    """Split *txt* into sentence fragments at every '.' character.

    The fragments are returned exactly as ``str.split`` produces them:
    surrounding whitespace is kept, and text that ends with a '.' yields a
    trailing empty string.
    """
    separator = '.'
    fragments = txt.split(separator)
    return fragments
# Matches every character that is not an ASCII letter, a digit or whitespace.
# Compiled once at import time instead of on every call.
_NON_WORD_CHARS = re.compile(r'[^0-9a-zA-Z\s]')


def get_words(txt):
    """Return the list of words in *txt* with punctuation removed.

    Every character other than ASCII letters, digits and whitespace is
    deleted (including '^', which the previous pattern accidentally kept
    because the extra carets inside the character class were literals).
    The cleaned text is then split on runs of whitespace, so consecutive
    spaces, tabs or leading/trailing blanks never produce empty-string
    "words".

    Returns an empty list when *txt* contains no word characters.
    """
    cleaned = _NON_WORD_CHARS.sub('', txt)
    # split() with no argument splits on any whitespace run and drops empties.
    return cleaned.split()
def get_keywords(word_list, min_ratio=0.001, max_ratio=0.5):
    """Return the set of words whose relative frequency lies in a band.

    A word's relative frequency is its number of occurrences divided by
    ``len(word_list)``. A word is a keyword when that frequency is between
    *min_ratio* and *max_ratio*, both bounds inclusive.

    Args:
        word_list: sequence of words (must support ``len``).
        min_ratio: lowest relative frequency accepted; must be below 1.
        max_ratio: highest relative frequency accepted; must be below 1.

    Returns:
        A set of the qualifying words; empty when *word_list* is empty.

    Raises:
        AssertionError: if either ratio is not below 1.
    """
    # Kept as an assert so callers see the same exception type as before;
    # note that the check is skipped when Python runs with -O.
    assert (min_ratio < 1 and max_ratio < 1)

    total = len(word_list)
    occurrences = Counter(word_list)
    # An empty word_list gives an empty Counter, so the division is never
    # evaluated with total == 0.
    return {
        word
        for word, cnt in occurrences.items()
        if min_ratio <= cnt * 1.0 / total <= max_ratio
    }
def get_sentence_weight(sentence, keywords):
    """Score a sentence by how densely its keywords are packed.

    The sentence is split on single spaces. The "window" is the span from
    the first keyword occurrence to the last one (inclusive), and the weight
    is ``keyword_count ** 2 / window_size``.

    Args:
        sentence: the sentence string to score.
        keywords: a container of keyword strings (membership-tested per word).

    Returns:
        0 when the sentence contains no keyword, otherwise a float weight.
    """
    words = sentence.split(' ')
    # Positions of every keyword occurrence, in order. The first and last
    # entries bound the window; this also covers a keyword at index 0, which
    # the previous backward scan (stopping before index 0) never looked at.
    hits = [pos for pos, word in enumerate(words) if word in keywords]
    if not hits:
        return 0

    window_size = hits[-1] - hits[0] + 1
    keywords_cnt = len(hits)
    return keywords_cnt * keywords_cnt * 1.0 / window_size
def summarize(text, summary_ratio=0.2):
    """Build an extractive summary from the highest-weighted sentences.

    Steps:
      1. Newlines are replaced by spaces (not removed, so the last word of
         one line is not glued to the first word of the next).
      2. Keywords are the words whose relative frequency is between 0.05
         and 0.5, as computed by ``get_keywords``.
      3. Every sentence from ``get_sentences`` is scored with
         ``get_sentence_weight``.
      4. The top ``int(number_of_sentences * summary_ratio)`` sentences are
         kept and emitted in their original order, joined by '. '.

    Args:
        text: the text to summarize.
        summary_ratio: fraction of sentences to keep (default 0.2, the
            previously hard-coded value).

    Returns:
        The summary string; empty when no sentence is selected.
    """
    txt = text.replace('\n', ' ')
    keywords = get_keywords(get_words(txt), 0.05, 0.5)
    sentence_list = get_sentences(txt)

    # Score by position rather than by sentence text so duplicate sentences
    # keep their own slot and no O(n) list.index() lookup is needed later.
    weights = [get_sentence_weight(sen, keywords) for sen in sentence_list]
    keep = int(len(sentence_list) * summary_ratio)

    # sorted() is stable, so equally weighted sentences stay in text order.
    ranked = sorted(
        range(len(sentence_list)),
        key=lambda idx: weights[idx],
        reverse=True,
    )
    chosen = sorted(ranked[:keep])  # restore order of occurrence

    # Fragments carry the blank that followed the previous '.'; strip it so
    # the '. ' separator does not produce double spaces.
    return '. '.join(sentence_list[idx].strip() for idx in chosen)