Spaces:

khaled5321
/

ARtoEN

Runtime error

File size: 5,516 Bytes

import gradio as gr
import random
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import unicodedata
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer

nltk.download('punkt')


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        return outputs, hidden
    

class Attention(nn.Module):
  def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

  def forward(self, hidden, encoder_outputs):
    batch_size = encoder_outputs.shape[1]
    src_len = encoder_outputs.shape[0]

    hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
    encoder_outputs = encoder_outputs.permute(1, 0, 2)

    energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
    attention = self.v(energy).squeeze(2)

    return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        
        return prediction, hidden.squeeze(0)
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5): 
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src)

        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            outputs[t] = output
            
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1) 
            input = trg[t] if teacher_force else top1
        
        return outputs


def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def tokenize_ar(text):
    """
    Tokenizes Arabic text from a string into a list of strings (tokens) and reverses it
    """
    return [tok for tok in nltk.tokenize.wordpunct_tokenize(unicodeToAscii(text))]

src_vocab = torch.load("arabic_vocab.pth")
trg_vocab = torch.load("english_vocab.pth")

INPUT_DIM = 9790
OUTPUT_DIM = 5682
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, "cpu")
model.load_state_dict(torch.load('model.pt', map_location=torch.device('cpu')))


def infer(text, max_length=50):
    text = tokenize_ar(text)
    sequence = []
    sequence.append(src_vocab['<sos>'])
    sequence.extend([src_vocab[token] for token in text])
    sequence.append(src_vocab['<eos>'])

    sequence = torch.Tensor(sequence)
    sequence = sequence[:, None].to(torch.int64)
    target = torch.zeros(max_length, 1).to(torch.int64)

    with torch.no_grad():
        model.eval()
        output = model(sequence, target, 0)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
    
    prediction = []
    for i in output:
        prediction.append(torch.argmax(i).item())
    
    tokens = trg_vocab.lookup_tokens(prediction)
    en = TreebankWordDetokenizer().detokenize(tokens).replace('<eos>', "")
    
    return re.sub(r'[^\w\s]','',en).strip()

iface = gr.Interface(fn=infer, inputs="text", outputs="text") 

iface.launch()