Spaces:

Ulpan
/

simple_streamlit_app

Sleeping

App Files Files Community

simple_streamlit_app / src /summary.py

Ulpan

Upload 10 files

7a96f34 about 2 years ago

raw

history blame contribute delete

2.27 kB

	import re

	def get_sentences(txt):
	    """Split *txt* into sentence fragments on every '.' character.

	    Fragments are returned verbatim: surrounding whitespace is kept, and a
	    trailing period produces a final empty string.
	    """
	    sentence_delimiter = '.'
	    fragments = txt.split(sentence_delimiter)
	    return fragments

	def get_words(txt):
	    """Return the words of *txt* with punctuation removed.

	    Every character that is not an ASCII letter, a digit or whitespace is
	    deleted, then the text is split on runs of whitespace.

	    Args:
	        txt: The text to tokenize.

	    Returns:
	        A list of non-empty word strings; an empty list for blank input.
	    """
	    # '^' negates a character class only in first position; repeating it
	    # inside the class would whitelist literal carets instead.
	    only_words_text = re.sub(r'[^0-9a-zA-Z\s]', '', txt)
	    # split() without a separator splits on any whitespace run, so removed
	    # punctuation or repeated spaces never produce empty "words".
	    return only_words_text.split()

	def get_keywords(word_list, min_ratio=0.001, max_ratio=0.5):
	    """Return the set of words whose relative frequency lies in a given band.

	    A word is a keyword when its share of ``word_list`` (occurrences divided
	    by the total number of words) is between ``min_ratio`` and ``max_ratio``,
	    both bounds inclusive.

	    Args:
	        word_list: Sized collection of word strings.
	        min_ratio: Lowest accepted relative frequency; must be below 1.
	        max_ratio: Highest accepted relative frequency; must be below 1.

	    Returns:
	        A set of keywords; empty when ``word_list`` is empty.

	    Raises:
	        AssertionError: If either ratio is not below 1 (only when assertions
	            are enabled).
	    """
	    # Function-scope import keeps this block self-contained.
	    from collections import Counter

	    assert min_ratio < 1 and max_ratio < 1
	    # Hoisted out of the loop; float() keeps the ratio a true division.
	    total = float(len(word_list))
	    return {
	        word
	        for word, count in Counter(word_list).items()
	        if min_ratio <= count / total <= max_ratio
	    }

	def get_sentence_weight(sentence, keywords):
	    """Score a sentence by how densely its keywords are clustered.

	    The sentence is split on single spaces. The "window" is the span from
	    the first to the last word that is a keyword, and the weight is the
	    squared number of keyword occurrences divided by the window length.

	    Args:
	        sentence: The sentence string to score.
	        keywords: Container of keyword strings, tested with ``in``.

	    Returns:
	        ``count * count / window_size`` as a float, or 0 when the sentence
	        contains no keyword.
	    """
	    words = sentence.split(' ')
	    # One pass collects every keyword position, so both window edges
	    # (including a keyword at index 0) and the count come from one list.
	    hits = [i for i, word in enumerate(words) if word in keywords]
	    if not hits:
	        return 0
	    window_size = hits[-1] - hits[0] + 1
	    keywords_cnt = len(hits)
	    return keywords_cnt * keywords_cnt * 1.0 / window_size

	def summarize(text, ratio=0.2):
	    """Build an extractive summary from the highest-weighted sentences.

	    Keywords are derived from the whole text, every sentence is scored
	    against them, and the best-scoring sentences are returned in the order
	    they appear in the text.

	    Args:
	        text: The document to summarize.
	        ratio: Fraction of the sentences to keep, rounded down.

	    Returns:
	        The selected sentences joined with '. '; an empty string when no
	        sentence is selected.
	    """
	    # Collapse every whitespace run (line breaks included) to one space so
	    # words on adjacent lines are not glued together.
	    txt = ' '.join(text.split())
	    keywords = get_keywords(get_words(txt), 0.05, 0.5)
	    # Skip blank fragments such as the one left by a trailing period.
	    sentences = [s.strip() for s in get_sentences(txt) if s.strip()]
	    weights = [get_sentence_weight(s, keywords) for s in sentences]
	    keep = max(0, int(len(sentences) * ratio))
	    # Rank positions rather than strings: avoids a list.index() lookup per
	    # comparison. The sort is stable, so ties keep document order.
	    ranked = sorted(range(len(sentences)), key=lambda i: weights[i], reverse=True)
	    # Restore document order among the selected sentences.
	    chosen = sorted(ranked[:keep])
	    return '. '.join(sentences[i] for i in chosen)