Spaces:
Sleeping
Sleeping
text summarization functionality added
Browse files- fast_text_summarizer.py +32 -0
fast_text_summarizer.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fasttext
|
2 |
+
import numpy as np
|
3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
|
5 |
+
class FastTextSummarizer:
    """Extractive summarizer that ranks sentences by fastText embedding similarity.

    Each sentence is embedded as the mean of its in-vocabulary word vectors.
    The document is embedded as the mean of its sentence embeddings, and the
    sentences closest to the document embedding (cosine similarity) are kept.
    """

    def __init__(self, model_file):
        """Load the fastText model stored at *model_file*.

        Args:
            model_file: Path passed to ``fasttext.load_model``.
        """
        self.model = fasttext.load_model(model_file)
        # Membership tests against the model's word list scan it linearly;
        # a set built once makes every per-token lookup constant-time.
        self._vocabulary = frozenset(self.model.words)

    def sentence_embedding(self, sentence):
        """Return the mean word vector of *sentence*.

        The sentence is split on whitespace and only tokens present in the
        model vocabulary contribute to the mean.

        Args:
            sentence: Text of a single sentence.

        Returns:
            The element-wise mean of the known tokens' vectors, or a zero
            vector of the model's dimension when no token is known.
        """
        word_vectors = [
            self.model.get_word_vector(word)
            for word in sentence.split()
            if word in self._vocabulary
        ]
        if not word_vectors:
            return np.zeros(self.model.get_dimension())
        return np.mean(word_vectors, axis=0)

    def summarize(self, text, num_sentences=3):
        """Return an extractive summary of *text*.

        Args:
            text: Document to summarize; sentences are split on the Amharic
                full stop ``'።'``.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences in their original order, joined with
            ``'። '`` and terminated by ``'።'``. An empty string is returned
            when *text* contains no sentences or *num_sentences* is not
            positive.
        """
        # Split text into sentences on the Amharic sentence delimiter.
        sentences = [part.strip() for part in text.split('።') if part.strip()]

        # Nothing to rank: bail out before averaging an empty list, which
        # would produce NaN and make the similarity computation fail.
        if not sentences or num_sentences <= 0:
            return ''

        # Compute embeddings for each sentence.
        sentence_embeddings = [
            self.sentence_embedding(sentence) for sentence in sentences
        ]
        # Document embedding is the mean of the sentence embeddings.
        document_embedding = np.mean(sentence_embeddings, axis=0)
        # Similarity between the document and each sentence.
        similarities = cosine_similarity(
            [document_embedding], sentence_embeddings
        ).flatten()

        # Rank sentences from most to least similar and keep the top ones.
        ranked_indices = similarities.argsort()[::-1]
        top_indices = ranked_indices[:num_sentences]

        # Emit the chosen sentences in document order for readability.
        return '። '.join(sentences[i] for i in sorted(top_indices)) + '።'
|