dieineb committed
Commit 9c8ef39
1 Parent(s): 5bedb03

Update README.md

Files changed (1): README.md (+155 -11)
README.md CHANGED
@@ -2,25 +2,169 @@
  library_name: keras
  tags:
  - translation
+ license: apache-2.0
  ---
 
- ## Model description

- More information needed

- ## Intended uses & limitations

- More information needed

- ## Training and evaluation data

- More information needed

- ## Model Plot

- <details>
- <summary>View Model Plot</summary>

- ![Model Image](./model.png)

- </details>

+ # Model Description

+ The GRU-eng-por model is a sequence-to-sequence GRU model that automatically translates English text to Portuguese.

+ The model was trained on an English-to-Portuguese translation dataset.

+ ## Details
+ - Size: 42,554,912 parameters
+ - Dataset: [`English-to-Portuguese`](https://www.kaggle.com/datasets/nageshsingh/englishportuguese-translation)
+ - Languages: English, Portuguese
+ - Number of Training Steps: 15
+ - Batch size: 32
+ - Optimizer: rmsprop
+ - Learning Rate: 0.001
+ - GPU: T4
+ - The source [code used](https://github.com/Nkluge-correa/teeny-tiny_castle/blob/master/ML%20Intro%20Course/16_sequence_to_sequence.ipynb) to train this model is available on GitHub (an illustrative sketch of the training setup follows this list).
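
As a purely illustrative sketch (not taken from this repository's training code), a comparable GRU encoder-decoder could be assembled in Keras as follows. Here `embed_dim` and `latent_dim` are assumed values, while the vocabulary size, optimizer, and learning rate follow the list above and the Usage section.

```python
import tensorflow as tf

vocab_size = 20000   # matches the max_tokens used by the vectorizers in the Usage section
embed_dim = 256      # assumed value; not stated on this card
latent_dim = 1024    # assumed value; not stated on this card

# Encoder: embed the English tokens and summarize them with a bidirectional GRU
source = tf.keras.Input(shape=(None,), dtype="int64", name="english")
x = tf.keras.layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
encoded_source = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(latent_dim), merge_mode="sum")(x)

# Decoder: predict the next Portuguese token from the target-so-far and the encoder state
past_target = tf.keras.Input(shape=(None,), dtype="int64", name="portuguese")
x = tf.keras.layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
x = tf.keras.layers.GRU(latent_dim, return_sequences=True)(x, initial_state=encoded_source)
x = tf.keras.layers.Dropout(0.5)(x)
target_next_step = tf.keras.layers.Dense(vocab_size, activation="softmax")(x)

seq2seq_rnn = tf.keras.Model([source, past_target], target_next_step)

# Compile with the optimizer and learning rate listed above
seq2seq_rnn.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])

# Training would then pair batches of 32 (english, portuguese-shifted-by-one) inputs
# with next-token targets, e.g.:
# seq2seq_rnn.fit(train_ds, validation_data=val_ds, epochs=15)  # "15" as listed above

```

For the actual architecture and training loop, see the linked notebook.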

+ ## Usage

+ ```python
+ !pip install huggingface_hub["tensorflow"] -q
+
+ from huggingface_hub import from_pretrained_keras
+ from huggingface_hub import hf_hub_download
+ import tensorflow as tf
+ import numpy as np
+ import string
+ import re
+
+ # Select characters to strip, but preserve the "[" and "]"
+ strip_chars = string.punctuation
+ strip_chars = strip_chars.replace("[", "")
+ strip_chars = strip_chars.replace("]", "")
+
+ def custom_standardization(input_string):
+     lowercase = tf.strings.lower(input_string)
+     return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")
+
+ # Load the `seq2seq_rnn` from the Hub
+ seq2seq_rnn = from_pretrained_keras("AiresPucrs/GRU-eng-por")
+
+ # Load the Portuguese vocabulary
+ portuguese_vocabulary_path = hf_hub_download(
+     repo_id="AiresPucrs/GRU-eng-por",
+     filename="portuguese_vocabulary.txt",
+     repo_type='model',
+     local_dir="./")
+
+ # Load the English vocabulary
+ english_vocabulary_path = hf_hub_download(
+     repo_id="AiresPucrs/GRU-eng-por",
+     filename="english_vocabulary.txt",
+     repo_type='model',
+     local_dir="./")
+
+ with open(portuguese_vocabulary_path, encoding='utf-8', errors='backslashreplace') as fp:
+     portuguese_vocab = [line.strip() for line in fp]
+
+ with open(english_vocabulary_path, encoding='utf-8', errors='backslashreplace') as fp:
+     english_vocab = [line.strip() for line in fp]
+
+ # Initialize the vectorizers with the learned vocabularies
+ target_vectorization = tf.keras.layers.TextVectorization(max_tokens=20000,
+                                                          output_mode="int",
+                                                          output_sequence_length=21,
+                                                          standardize=custom_standardization,
+                                                          vocabulary=portuguese_vocab)
+
+ source_vectorization = tf.keras.layers.TextVectorization(max_tokens=20000,
+                                                          output_mode="int",
+                                                          output_sequence_length=20,
+                                                          vocabulary=english_vocab)
+
+ # Create a dictionary mapping integer indices to Portuguese words
+ portuguese_index_lookup = dict(zip(range(len(portuguese_vocab)), portuguese_vocab))
+ max_decoded_sentence_length = 20
+
+ def decode_sequence(input_sentence):
+     """
+     Decodes a sequence using a trained seq2seq RNN model.
+
+     Args:
+         input_sentence (str): the input sentence to be decoded
+
+     Returns:
+         decoded_sentence (str): the decoded sentence generated by the model
+     """
+     tokenized_input_sentence = source_vectorization([input_sentence])
+     decoded_sentence = "[start]"
+
+     for i in range(max_decoded_sentence_length):
+         tokenized_target_sentence = target_vectorization([decoded_sentence])
+         next_token_predictions = seq2seq_rnn.predict([tokenized_input_sentence, tokenized_target_sentence], verbose=0)
+         sampled_token_index = np.argmax(next_token_predictions[0, i, :])
+         sampled_token = portuguese_index_lookup[sampled_token_index]
+         decoded_sentence += " " + sampled_token
+         if sampled_token == "[end]":
+             break
+     return decoded_sentence
+
+ eng_sentences = ["What is its name?",
+                  "How old are you?",
+                  "I know you know where Mary is.",
+                  "We will show Tom.",
+                  "What do you all do?",
+                  "Don't do it!"]
+
+ for sentence in eng_sentences:
+     print(f"English sentence:\n{sentence}")
+     print(f'Portuguese translation:\n{decode_sequence(sentence)}')
+     print('-' * 50)
+ ```

+ This will output the following:

+ ```
+ English sentence:
+ What is its name?
+ Portuguese translation:
+ [start] qual é o nome [end]
+ --------------------------------------------------
+ English sentence:
+ How old are you?
+ Portuguese translation:
+ [start] quantos anos você tem [end]
+ --------------------------------------------------
+ English sentence:
+ I know you know where Mary is.
+ Portuguese translation:
+ [start] eu sei que você sabe onde maria está [end]
+ --------------------------------------------------
+ English sentence:
+ We will show Tom.
+ Portuguese translation:
+ [start] nós vamos tom [end]
+ --------------------------------------------------
+ English sentence:
+ What do you all do?
+ Portuguese translation:
+ [start] o que vocês faz [end]
+ --------------------------------------------------
+ English sentence:
+ Don't do it!
+ Portuguese translation:
+ [start] não faça isso [end]
+ --------------------------------------------------
+ ```
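
The decoded strings keep the `[start]` and `[end]` markers the model uses internally. If you want a clean sentence for display, a small, purely illustrative helper (the name `clean_translation` is hypothetical, not part of this repository) can strip them:

```python
def clean_translation(decoded_sentence: str) -> str:
    # Drop the special [start]/[end] markers and trim surrounding whitespace
    return decoded_sentence.replace("[start]", "").replace("[end]", "").strip()

print(clean_translation("[start] qual é o nome [end]"))  # -> "qual é o nome"
```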

+ # Cite as 🤗

+ ```bibtex
+ @misc{teenytinycastle,
+     doi = {10.5281/zenodo.7112065},
+     url = {https://huggingface.co/AiresPucrs/GRU-eng-por},
+     author = {Nicholas Kluge Corr{\^e}a},
+     title = {Teeny-Tiny Castle},
+     year = {2023},
+     publisher = {HuggingFace},
+     journal = {HuggingFace repository},
+ }
+ ```

+ ## License
+ GRU-eng-por is licensed under the Apache License, Version 2.0. See the LICENSE file for more details.