berito committed on
Commit
02b2abd
·
1 Parent(s): 0d71b77

text summarization functionality added

Browse files
Files changed (1) hide show
  1. fast_text_summarizer.py +32 -0
fast_text_summarizer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fasttext
2
+ import numpy as np
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+
5
class FastTextSummarizer:
    """Extractive summarizer that ranks sentences with fastText embeddings.

    Each sentence is embedded as the mean of its in-vocabulary word vectors.
    The document embedding is the mean of the sentence embeddings, and the
    sentences most similar (cosine) to the document are kept for the summary.
    """

    def __init__(self, model_file):
        """Load the fastText model stored at ``model_file``.

        Args:
            model_file: Path handed to ``fasttext.load_model``.
        """
        self.model = fasttext.load_model(model_file)
        # Build the vocabulary lookup once. Testing membership against
        # ``model.words`` directly scans the whole vocabulary for every token.
        self._vocabulary = frozenset(self.model.words)

    def sentence_embedding(self, sentence):
        """Return the mean word vector of the in-vocabulary words of ``sentence``.

        Args:
            sentence: Text that is tokenized on whitespace.

        Returns:
            A 1-D array of length ``model.get_dimension()``. It is all zeros
            when no token of the sentence is in the model vocabulary.
        """
        word_vectors = [
            self.model.get_word_vector(word)
            for word in sentence.split()
            if word in self._vocabulary
        ]
        if not word_vectors:
            return np.zeros(self.model.get_dimension())
        return np.mean(word_vectors, axis=0)

    def summarize(self, text, num_sentences=3):
        """Return the ``num_sentences`` sentences most representative of ``text``.

        Args:
            text: Document whose sentences are delimited by '።'.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences in their original order, joined with
            '። ' and terminated by '።'. An empty string is returned when
            ``text`` contains no sentences or ``num_sentences`` is not
            positive.
        """
        # Split on the Amharic full stop and drop blank fragments.
        fragments = (fragment.strip() for fragment in text.split('።'))
        sentences = [fragment for fragment in fragments if fragment]

        # Nothing to rank: averaging zero embeddings yields NaN and the
        # similarity call rejects an empty sample set. A negative count
        # would otherwise slice from the wrong end of the ranking.
        if not sentences or num_sentences <= 0:
            return ''

        sentence_embeddings = [
            self.sentence_embedding(sentence) for sentence in sentences
        ]
        # The document is represented by the mean of its sentence embeddings.
        document_embedding = np.mean(sentence_embeddings, axis=0)
        similarities = cosine_similarity(
            [document_embedding], sentence_embeddings
        ).flatten()

        # Highest similarity first, then keep the top ``num_sentences``.
        ranked_indices = similarities.argsort()[::-1]
        top_indices = ranked_indices[:num_sentences]

        # Emit the chosen sentences in document order, not rank order.
        return '። '.join(sentences[i] for i in sorted(top_indices)) + '።'