File size: 3,834 Bytes
6e74375
 
 
 
9c8ef39
47d9d7a
 
 
6e74375
a6cd0f4
6e74375
a6cd0f4
6e74375
a6cd0f4
58fabde
 
9c8ef39
 
 
 
 
 
6e74375
9c8ef39
 
 
 
6e74375
9c8ef39
 
 
6e74375
9c8ef39
 
6e74375
9c8ef39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47d9d7a
 
9c8ef39
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
---
library_name: keras
tags:
- translation
license: apache-2.0
language:
- en
- pt
---
# GRU EN-PT (Teeny-Tiny Castle)

This model is part of a tutorial tied to the [Teeny-Tiny Castle](https://github.com/Nkluge-correa/TeenyTinyCastle), an open-source repository containing educational tools for AI Ethics and Safety research. 

## How to Use

```python
from huggingface_hub import from_pretrained_keras
from huggingface_hub import hf_hub_download
import tensorflow as tf
import numpy as np
import string
import re

# Characters removed during standardization: all ASCII punctuation except
# "[" and "]", which are kept so the "[start]" / "[end]" markers stay intact.
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")

def custom_standardization(input_string):
    """
    Lowercase the input and remove every character listed in `strip_chars`.

    Args:
        input_string: the string tensor to standardize

    Returns:
        the lowercased input with all `strip_chars` characters deleted
    """
    # Escape the characters so they are matched literally inside the class
    punctuation_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")

# Fetch the trained `seq2seq_rnn` model from the Hugging Face Hub
seq2seq_rnn = from_pretrained_keras("AiresPucrs/GRU-eng-por")

# Download the Portuguese vocabulary file into the current directory
portuguese_vocabulary_path = hf_hub_download(
    repo_id="AiresPucrs/GRU-eng-por",
    repo_type='model',
    filename="portuguese_vocabulary.txt",
    local_dir="./",
)

# Download the English vocabulary file into the current directory
english_vocabulary_path = hf_hub_download(
    repo_id="AiresPucrs/GRU-eng-por",
    repo_type='model',
    filename="english_vocabulary.txt",
    local_dir="./",
)

# Read each vocabulary file into a list holding one token per line, with
# surrounding whitespace stripped. The `with` statement closes the file on
# exit, so no explicit `close()` call is needed. Bytes that are not valid
# UTF-8 are kept as backslash escapes instead of raising a decoding error.
with open(portuguese_vocabulary_path, encoding='utf-8', errors='backslashreplace') as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open(english_vocabulary_path, encoding='utf-8', errors='backslashreplace') as fp:
    english_vocab = [line.strip() for line in fp]

# Build the text vectorizers from the vocabularies loaded above.

# Portuguese (target) side: sequences of 21 integer ids, standardized with
# `custom_standardization`.
target_vectorization = tf.keras.layers.TextVectorization(
    vocabulary=portuguese_vocab,
    standardize=custom_standardization,
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=21,
)

# English (source) side: sequences of 20 integer ids. No `standardize`
# argument is passed, so the layer's default standardization applies.
source_vectorization = tf.keras.layers.TextVectorization(
    vocabulary=english_vocab,
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=20,
)

# Upper bound on the number of tokens generated for one translation
max_decoded_sentence_length = 20

# Map each integer index to the Portuguese word at that position in the
# vocabulary list
portuguese_index_lookup = dict(enumerate(portuguese_vocab))

def decode_sequence(input_sentence):
    """
    Translates a sentence by greedy decoding with the seq2seq RNN model.

    The translation starts from the "[start]" marker. At every step the
    partial translation is fed back to the model and the highest-scoring
    token for that step is appended. Decoding stops when "[end]" is
    produced or after `max_decoded_sentence_length` tokens.

    Args:
        input_sentence (str): the sentence to translate

    Returns:
        decoded_sentence (str): the generated tokens joined by single
            spaces, beginning with "[start]" (and ending with "[end]"
            when the model emitted it)
    """
    encoder_tokens = source_vectorization([input_sentence])
    output_words = ["[start]"]

    for step in range(max_decoded_sentence_length):
        # Re-vectorize everything generated so far as the decoder input
        decoder_tokens = target_vectorization([" ".join(output_words)])
        predictions = seq2seq_rnn.predict(
            [encoder_tokens, decoder_tokens], verbose=0
        )

        # Keep only the most likely token at the current position
        best_index = np.argmax(predictions[0, step, :])
        next_word = portuguese_index_lookup[best_index]
        output_words.append(next_word)

        if next_word == "[end]":
            break

    return " ".join(output_words)

# Sample English inputs used to demonstrate the model
eng_sentences = [
    "What is its name?",
    "How old are you?",
    "I know you know where Mary is.",
    "We will show Tom.",
    "What do you all do?",
    "Don't do it!",
]

# Print each sample followed by its translation and a separator line
for sentence in eng_sentences:
    print(f"English sentence:\n{sentence}")
    print(f'Portuguese translation:\n{decode_sequence(sentence)}')
    print('-' * 50)


```