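# Gradio app: extractive, TextRank-style summarization of English text, followed by
# English-to-Arabic translation of the summary with a MarianMT model
# (marefa-nlp/marefa-mt-en-ar) whose weights are loaded from model.zip.
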
import gradio as gr
import pickle
import zipfile
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

nltk.download('punkt')
nltk.download('stopwords')

# English-to-Arabic MarianMT checkpoint used to translate the summary
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Unpack the archived model weights
with zipfile.ZipFile("model.zip", 'r') as zip_ref:
    zip_ref.extractall("./marian_model/")

# Define the model architecture
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
# Load the weights from the extracted .h5 file
model.load_weights("./marian_model/model.weights.h5")

# Load cleaned_word_embeddings
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)
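# Assumed format, inferred from how the vectors are used below:
#   cleaned_word_embeddings = {word: 25-dimensional NumPy vector}
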
def get_clean_sentences(text):
    sentences = sent_tokenize(text)
    # Strip punctuation and special characters, keeping word characters,
    # apostrophes and spaces
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

def filter_sentences(text):
    cleaned_sentences = get_clean_sentences(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences

def get_vector_representation(text):
    filtered_sentences = filter_sentences(text)
    # Represent each sentence as the average of its word embeddings
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors

def calculate_cosine_similarity(sentence_vectors):
    # Pairwise cosine similarity between all sentence vectors
    similarity_matrix = cosine_similarity(sentence_vectors)
    return similarity_matrix

def get_scores(similarity_matrix):
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)
    # Score each sentence with PageRank (the TextRank step)
    scores = nx.pagerank(nx_graph)
    return scores

def rank_sentences(text):
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    # Pair each original sentence with its score and sort from highest to lowest
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences

def summarize(text):
    ranked_sentences = rank_sentences(text)
    # Keep the top ~10% highest-ranked sentences (at least one) as the summary
    n_keep = max(1, len(ranked_sentences) // 10)
    summary = " ".join(sentence for _, sentence in ranked_sentences[:n_keep])
    return summary

def translate(text):
    summarized = summarize(text)
    # Translate the extractive English summary into Arabic
    tokenized = tokenizer([summarized], return_tensors='np')
    out = model.generate(**tokenized, max_length=128)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

demo = gr.Interface(fn=translate, inputs="text", outputs="text")
demo.launch(share=True)
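
# Note: assumes this file is saved as app.py next to model.zip and
# cleaned_word_embeddings.pkl; running it (e.g. `python app.py`) starts the
# Gradio UI and, because share=True, also prints a temporary public link.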