---
library_name: keras
tags:
  - translation
license: apache-2.0
language:
  - en
  - pt
---

# GRU EN-PT (Teeny-Tiny Castle)

This model is part of a tutorial tied to the [Teeny-Tiny Castle](https://github.com/Nkluge-correa/TeenyTinyCastle), an open-source repository containing educational tools for AI Ethics and Safety research.

## How to Use

```python
from huggingface_hub import from_pretrained_keras
from huggingface_hub import hf_hub_download
import tensorflow as tf
import numpy as np
import string
import re

# Select characters to strip, but preserve the "[" and "]"
# (they delimit the special "[start]" and "[end]" tokens)
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")

# Load the `seq2seq_rnn` model from the Hub
seq2seq_rnn = from_pretrained_keras("AiresPucrs/GRU-eng-por")

# Download the Portuguese vocabulary
portuguese_vocabulary_path = hf_hub_download(
    repo_id="AiresPucrs/GRU-eng-por",
    filename="portuguese_vocabulary.txt",
    repo_type="model",
    local_dir="./")

# Download the English vocabulary
english_vocabulary_path = hf_hub_download(
    repo_id="AiresPucrs/GRU-eng-por",
    filename="english_vocabulary.txt",
    repo_type="model",
    local_dir="./")

with open(portuguese_vocabulary_path, encoding="utf-8", errors="backslashreplace") as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open(english_vocabulary_path, encoding="utf-8", errors="backslashreplace") as fp:
    english_vocab = [line.strip() for line in fp]

# Initialize the vectorizers with the learned vocabularies.
# The target sequence is one token longer (21 vs. 20) to account
# for the shifted decoder input used during training.
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=21,
    standardize=custom_standardization,
    vocabulary=portuguese_vocab)

source_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=20,
    vocabulary=english_vocab)

# Create a dictionary mapping token indices (`int`) back to Portuguese words
portuguese_index_lookup = dict(zip(range(len(portuguese_vocab)), portuguese_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """
    Decodes a sequence using a trained seq2seq RNN model.

    Args:
        input_sentence (str): the input sentence to be decoded

    Returns:
        decoded_sentence (str): the decoded sentence generated by the model
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    # Greedy decoding: at each step, feed the tokens generated so far
    # and pick the most probable next token.
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = portuguese_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

eng_sentences = [
    "What is its name?",
    "How old are you?",
    "I know you know where Mary is.",
    "We will show Tom.",
    "What do you all do?",
    "Don't do it!",
]

for sentence in eng_sentences:
    print(f"English sentence:\n{sentence}")
    print(f"Portuguese translation:\n{decode_sequence(sentence)}")
    print("-" * 50)
```
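Note that `decode_sequence` returns the raw decoded string, including the special `[start]` and `[end]` markers. If you want to display only the translation itself, a small post-processing step is enough. The sketch below is not part of the original tutorial; `clean_translation` is a hypothetical helper name.

```python
# A minimal sketch (not part of the original tutorial): strip the special
# "[start]" / "[end]" tokens that decode_sequence leaves in its output.
def clean_translation(decoded_sentence):
    return decoded_sentence.replace("[start]", "").replace("[end]", "").strip()

print(clean_translation(decode_sequence("How old are you?")))
```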