# am_text_summary / fast_text_summarizer.py
# berito - commit 02b2abd: text summarization functionality added
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class FastTextSummarizer:
    """Extractive summarizer that ranks sentences with fastText embeddings.

    Each sentence is embedded as the mean of the vectors of its in-vocabulary
    words. The document embedding is the mean of the sentence embeddings, and
    the sentences most similar (cosine) to the document embedding are returned
    in their original order.
    """

    # Cache for the model vocabulary as a set, filled on first use by
    # ``_vocabulary``. Kept as a class-level default so the cache also works
    # on instances whose ``model`` attribute was assigned without ``__init__``.
    _vocab = None

    def __init__(self, model_file):
        """Load the fastText model stored at ``model_file``."""
        self.model = fasttext.load_model(model_file)

    def _vocabulary(self):
        """Return the model vocabulary as a frozenset, building it once."""
        if self._vocab is None:
            # ``model.words`` is a plain sequence in the fastText bindings
            # (TODO confirm for the installed version); testing membership on
            # it directly would rescan the vocabulary for every token.
            self._vocab = frozenset(self.model.words)
        return self._vocab

    def sentence_embedding(self, sentence):
        """Return the mean word vector of ``sentence``.

        The sentence is split on whitespace and only words present in the
        model vocabulary contribute. If no word qualifies, a zero vector of
        the model's dimension is returned.
        """
        vocabulary = self._vocabulary()
        word_vectors = [
            self.model.get_word_vector(word)
            for word in sentence.split()
            if word in vocabulary
        ]
        if not word_vectors:
            return np.zeros(self.model.get_dimension())
        return np.mean(word_vectors, axis=0)

    def summarize(self, text, num_sentences=3):
        """Return the ``num_sentences`` most representative sentences of ``text``.

        Sentences are delimited by the Amharic full stop '።'. The selected
        sentences keep their original order, are joined with '። ' and the
        result is terminated with '።'.

        An empty string is returned when ``text`` contains no sentences or
        when ``num_sentences`` is not positive.
        """
        # Split on the Amharic sentence delimiter and drop blank fragments.
        sentences = [part.strip() for part in text.split('።') if part.strip()]

        # Nothing to rank: bail out before the mean/similarity computations,
        # which cannot handle an empty sentence list. A non-positive count is
        # also handled here because a negative slice bound would otherwise
        # select "all but the last k" ranked sentences.
        if not sentences or num_sentences <= 0:
            return ''

        sentence_embeddings = [self.sentence_embedding(s) for s in sentences]

        # The document is represented by the mean of its sentence embeddings.
        document_embedding = np.mean(sentence_embeddings, axis=0)

        # Similarity of every sentence to the document as a whole.
        similarities = cosine_similarity(
            [document_embedding], sentence_embeddings
        ).flatten()

        # Highest similarity first, then keep the requested number.
        ranked_indices = similarities.argsort()[::-1]
        top_indices = ranked_indices[:num_sentences]

        # Re-sort the chosen indices so the summary follows document order.
        return '። '.join(sentences[i] for i in sorted(top_indices)) + '።'