---
library_name: keras
tags:
- translation
license: apache-2.0
language:
- en
- pt
---
|
# GRU EN-PT (Teeny-Tiny Castle)
|
|
|
This model is part of a tutorial tied to the [Teeny-Tiny Castle](https://github.com/Nkluge-correa/TeenyTinyCastle), an open-source repository containing educational tools for AI Ethics and Safety research.
|
|
|
## How to Use
|
|
|
```python
|
import re
import string

import numpy as np
import tensorflow as tf
from huggingface_hub import from_pretrained_keras
from huggingface_hub import hf_hub_download
|
|
|
# Characters removed during standardization: all ASCII punctuation except
# "[" and "]", which are preserved so the "[start]" and "[end]" markers used
# while decoding are not stripped.
strip_chars = string.punctuation.replace("[", "").replace("]", "")
|
|
|
def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in `strip_chars`.

    Args:
        input_string: the raw text, in any form accepted by
            `tf.strings.lower`.

    Returns:
        The lowercased text with all `strip_chars` characters removed.
    """
    lowercase = tf.strings.lower(input_string)
    # `re.escape` keeps regex metacharacters in `strip_chars` literal inside
    # the character class.
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")
|
|
|
# Load the `seq2seq_rnn` from the Hub
seq2seq_rnn = from_pretrained_keras("AiresPucrs/GRU-eng-por")

# Download the Portuguese vocabulary file into the current directory
portuguese_vocabulary_path = hf_hub_download(
    repo_id="AiresPucrs/GRU-eng-por",
    filename="portuguese_vocabulary.txt",
    repo_type="model",
    local_dir="./",
)

# Download the English vocabulary file into the current directory
english_vocabulary_path = hf_hub_download(
    repo_id="AiresPucrs/GRU-eng-por",
    filename="english_vocabulary.txt",
    repo_type="model",
    local_dir="./",
)
|
|
|
# Read one vocabulary entry per line, stripping surrounding whitespace.
# The `with` statement closes each file on exit, so no explicit `close()`
# call is needed.
with open(portuguese_vocabulary_path, encoding="utf-8", errors="backslashreplace") as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open(english_vocabulary_path, encoding="utf-8", errors="backslashreplace") as fp:
    english_vocab = [line.strip() for line in fp]
|
|
|
# Initialize the vectorizers with the learned vocabularies.
# Only the target (Portuguese) vectorizer uses `custom_standardization`;
# the source (English) vectorizer keeps the layer's default standardization.
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=21,
    standardize=custom_standardization,
    vocabulary=portuguese_vocab,
)

source_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=20,
    vocabulary=english_vocab,
)

# Create a dictionary from `int` to Portuguese words (position in the
# vocabulary list -> word)
portuguese_index_lookup = dict(enumerate(portuguese_vocab))

# Upper bound on the number of tokens generated per translation
max_decoded_sentence_length = 20
|
|
|
def decode_sequence(input_sentence):
    """
    Translates a sentence with the `seq2seq_rnn` model using greedy decoding.

    Starting from the "[start]" marker, the partial translation is fed back
    to the model once per step and the highest-scoring token at the current
    position is appended. Decoding stops as soon as "[end]" is produced, or
    after `max_decoded_sentence_length` steps.

    Args:
        input_sentence (str): the input sentence to be translated

    Returns:
        decoded_sentence (str): the sentence generated by the model,
            including the leading "[start]" marker and, when it was
            produced, the trailing "[end]" marker
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        # Re-tokenize everything generated so far and ask for predictions
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0
        )

        # Take the most likely token at step `i` of the first batch entry
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = portuguese_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break

    return decoded_sentence
|
|
|
# Example English sentences to translate
eng_sentences = [
    "What is its name?",
    "How old are you?",
    "I know you know where Mary is.",
    "We will show Tom.",
    "What do you all do?",
    "Don't do it!",
]

# Print each sentence with its translation, separated by a dashed rule
for sentence in eng_sentences:
    print(f"English sentence:\n{sentence}")
    print(f"Portuguese translation:\n{decode_sequence(sentence)}")
    print("-" * 50)
|
|
|
|
|
```
|
|