# Web-page residue captured with this file (not part of the program):
# "Spaces: Runtime error"
from datetime import datetime | |
from transformers import BartTokenizer, TFBartForConditionalGeneration | |
from Utils import get_input_chunks | |
import networkx as nx | |
from nltk.tokenize import sent_tokenize | |
import nltk | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import community | |
from title_generator import T5Summarizer | |
class BARTSummarizer:
    """Summarize text with a BART model, optionally as titled "chapters".

    Three entry points are offered:

    * ``summarize`` - one generation pass over a single (truncated) text.
    * ``chunk_summarize`` - split long text into chunks, summarize each chunk
      and join the partial summaries.
    * ``auto_chapters_summarize`` - cluster the text into topics and return
      one titled summary per topic.
    """

    # Sentence grouping used by ``preprocess_for_auto_chapters``: a new group
    # starts every ``_GROUP_STRIDE`` sentences and spans ``_GROUP_SIZE``.
    _GROUP_STRIDE = 5
    _GROUP_SIZE = 6

    def __init__(self, model_name: str = 'facebook/bart-large-cnn'):
        """Load the tokenizer, the BART model and the title generator.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.model_name = model_name
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = TFBartForConditionalGeneration.from_pretrained(model_name)
        # Upper bound used both for tokenizer truncation and for chunking.
        self.max_length = self.model.config.max_position_embeddings
        self.title_model = T5Summarizer()

    def summarize(self, text: str, auto: bool = False) -> str:
        """Summarize ``text`` in a single generation pass.

        The input is truncated to ``self.max_length`` tokens before being
        passed to the model.

        Args:
            text: Text to summarize.
            auto: When True, decode with ``num_beams=1``,
                ``no_repeat_ngram_size=2`` and ``min_length=60``; otherwise
                use 4 beams with early stopping.

        Returns:
            The first generated sequence decoded with special tokens skipped.
        """
        encoded_input = self.tokenizer.encode(
            text,
            max_length=self.max_length,
            return_tensors='tf',
            truncation=True,
        )
        if auto:
            summary_ids = self.model.generate(
                encoded_input,
                max_length=300,
                num_beams=1,
                no_repeat_ngram_size=2,
                min_length=60,
            )
        else:
            summary_ids = self.model.generate(
                encoded_input,
                max_length=300,
                num_beams=4,
                early_stopping=True,
            )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def chunk_summarize(self, text: str, auto: bool = False) -> str:
        """Summarize ``text`` chunk by chunk and join the partial summaries.

        The start and end wall-clock times are printed to stdout.

        Args:
            text: Text to summarize.
            auto: Forwarded unchanged to ``summarize`` for every chunk.

        Returns:
            The chunk summaries joined with single spaces.
        """
        # Split the input into chunks bounded by the model's input limit.
        input_chunks = get_input_chunks(text, self.max_length)
        print(datetime.now().strftime("%H:%M:%S"))
        # Summarize each chunk separately.
        summaries = [self.summarize(chunk, auto) for chunk in input_chunks]
        # Combine the partial summaries into the final summary.
        final_summary = " ".join(summaries)
        print(datetime.now().strftime("%H:%M:%S"))
        return final_summary

    def preprocess_for_auto_chapters(self, text: str) -> list:
        """Split ``text`` into groups of consecutive sentences.

        Sentences that are blank or have fewer than 5 space-separated words
        are dropped; the remaining ones are joined into groups.

        Args:
            text: Raw text to split.

        Returns:
            A list of strings, each a space-joined group of sentences.
        """
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            # NLTK signals missing tokenizer data with LookupError. Fetch the
            # data once and retry; any other error is allowed to propagate.
            # Newer NLTK releases look for "punkt_tab" instead of "punkt", so
            # both are requested.
            for resource in ('punkt', 'punkt_tab'):
                nltk.download(resource)
            sentences = sent_tokenize(text)

        # Keep only non-blank sentences with at least 5 words.
        sentences = [
            sentence
            for sentence in sentences
            if sentence.strip() and len(sentence.split(" ")) > 4
        ]

        # NOTE(review): each group spans 6 sentences while a new group starts
        # every 5, so consecutive groups share one sentence. The original
        # comment said "every 5 sentences"; the overlap is preserved here as
        # it may be intentional - confirm with the author.
        return [
            ' '.join(sentences[start:start + self._GROUP_SIZE])
            for start in range(0, len(sentences), self._GROUP_STRIDE)
        ]

    @staticmethod
    def _group_by_cluster(sentences, partition):
        """Join the sentence groups of every cluster that has more than one.

        Args:
            sentences: Sentence groups, indexed by graph node id.
            partition: Mapping from node id (index into ``sentences``) to a
                cluster id.

        Returns:
            One space-joined string per cluster with at least two members,
            ordered by ascending cluster id; members keep their text order.
        """
        members = {}
        for index, sentence in enumerate(sentences):
            members.setdefault(partition[index], []).append(sentence)
        return [
            " ".join(members[cluster_id])
            for cluster_id in sorted(members)
            if len(members[cluster_id]) > 1
        ]

    def auto_chapters_summarize(self, text: str) -> str:
        """Cluster ``text`` into topics and return a titled summary per topic.

        Sentence groups are vectorized with TF-IDF, linked in a similarity
        graph and partitioned with the Louvain algorithm. Clusters containing
        a single sentence group are discarded.

        Args:
            text: Text to split into chapters.

        Returns:
            Chapters formatted as ``"#### <title>\\n<summary>"`` and joined by
            blank lines, or an empty string when there is nothing to
            summarize.
        """
        # Nothing to vectorize: TfidfVectorizer raises on an empty corpus.
        if not text or not text.strip():
            return ""
        sentences = self.preprocess_for_auto_chapters(text)
        if not sentences:
            return ""

        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf = vectorizer.fit_transform(sentences)
        # Rows are L2-normalised by TfidfVectorizer's default settings, so the
        # product with the transpose is the cosine similarity. ``@`` is always
        # a matrix product, whereas ``*`` is element-wise for sparse arrays.
        similarity_matrix = tfidf @ tfidf.T
        # Convert the similarity matrix to a weighted graph.
        graph = nx.from_scipy_sparse_array(similarity_matrix)
        # Apply the Louvain algorithm to identify communities.
        partition = community.best_partition(
            graph, resolution=0.7, random_state=42
        )
        clustered_sentences = self._group_by_cluster(sentences, partition)

        # Produce a title and a summary for each cluster.
        summaries_with_title = []
        for cluster in clustered_sentences:
            title = self.title_model.summarize(cluster)
            summary = self.chunk_summarize(cluster, auto=True)
            summaries_with_title.append("#### " + title + "\n" + summary)

        # Combine the chapters into the final output.
        return "\n\n".join(summaries_with_title)