# NOTE(review): the lines "Spaces:" / "Running" below the title looked like
# UI status text (presumably a Hugging Face Spaces header) captured by the
# export, not code — preserved here as a comment.
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class FastTextSummarizer:
    """Extractive summarizer for Amharic text built on fastText word vectors.

    Each sentence is embedded as the mean vector of its in-vocabulary words;
    sentences are then scored by cosine similarity against the mean embedding
    of the whole document, and the top scorers are returned in their original
    document order.
    """

    def __init__(self, model_file):
        """Load a fastText model from *model_file* (path to a trained model)."""
        self.model = fasttext.load_model(model_file)

    def sentence_embedding(self, sentence):
        """Return the mean fastText vector of the in-vocabulary words of *sentence*.

        Falls back to a zero vector of the model's dimension when the sentence
        is empty or contains no in-vocabulary word.
        """
        # Build the vocabulary set once per call: membership tests against the
        # raw `self.model.words` list would be an O(V) scan per word.
        vocab = set(self.model.words)
        word_vectors = [
            self.model.get_word_vector(word)
            for word in sentence.split()
            if word in vocab
        ]
        if word_vectors:
            return np.mean(word_vectors, axis=0)
        return np.zeros(self.model.get_dimension())

    def summarize(self, text, num_sentences=3):
        """Return an extractive summary of *text* with at most *num_sentences* sentences.

        Sentences are split on the Amharic full stop '።'. Returns an empty
        string when *text* contains no sentences.
        """
        sentences = [s.strip() for s in text.split('።') if s.strip()]
        if not sentences:
            # Guard: with no sentences, np.mean over an empty list and the
            # similarity step below would fail / emit NaN warnings.
            return ''
        embeddings = np.asarray([self.sentence_embedding(s) for s in sentences])
        document_embedding = embeddings.mean(axis=0)
        # Cosine similarity of each sentence against the document, in plain
        # NumPy; zero-norm vectors get similarity 0 instead of NaN.
        dots = embeddings @ document_embedding
        norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(document_embedding)
        similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
        # Keep the highest-scoring sentences, restored to document order.
        top_indices = similarities.argsort()[::-1][:num_sentences]
        return '። '.join(sentences[i] for i in sorted(top_indices)) + '።'