Michal Pleban committed on
Commit
57840f4
1 Parent(s): c5099d3

Initial commit

README.md CHANGED
@@ -1,3 +1,64 @@
  ---
+ language:
+ - en
+ tags:
+ - text2text-generation
+ - paraphrase-generation
  license: apache-2.0
+ widget:
+ - text: "US to UK: My favorite color is yellow."
  ---
+
+ ### About the model
+
+ The model has been trained on a dataset containing [249525 sentences with US English spelling](https://www.englishvoice.ai/p/us-to-uk/ "249525 sentences with US English spelling"), along with their UK English equivalents.
+
+ The purpose of the model is to rewrite sentences from US English to UK English. It is capable not only of changing the spelling of words (such as "color" to "colour") but also of adapting the vocabulary where appropriate (for example, "subway" to "underground", "lawyer" to "solicitor", and so on).
+
+ ### Generation examples
+
+ | Input | Output |
+ | :------------ | :------------ |
+ | My favorite color is yellow. | My favourite colour is yellow. |
+ | I saw a guy in yellow sneakers at the subway station. | I saw a bloke in yellow trainers at the underground station. |
+ | You could have gotten hurt! | You could have got hurt! |
+
+ ### The dataset
+
+ The dataset was developed by English Voice AI Labs. You can download it from our website:
+ [https://www.EnglishVoice.ai/](https://www.EnglishVoice.ai/ "https://www.EnglishVoice.ai/")
+
+ ### Sample code
+
+ Sample Python code:
+
+ ```python
+ import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ model = T5ForConditionalGeneration.from_pretrained("EnglishVoice/t5-base-us-to-uk-english")
+ tokenizer = T5Tokenizer.from_pretrained("EnglishVoice/t5-base-us-to-uk-english")
+ model = model.to(device)
+
+ input_text = "My favorite color is yellow."
+
+ # The model expects the task prefix "US to UK: " in front of the input sentence.
+ text = "US to UK: " + input_text
+ encoding = tokenizer.encode_plus(text, return_tensors="pt")
+ input_ids = encoding["input_ids"].to(device)
+ attention_masks = encoding["attention_mask"].to(device)
+ beam_outputs = model.generate(
+     input_ids=input_ids,
+     attention_mask=attention_masks,
+     early_stopping=True,
+ )
+
+ result = tokenizer.decode(beam_outputs[0], skip_special_tokens=True)
+ print(result)
+ ```
+
+ Output:
+
+ ```My favourite colour is yellow.```
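+
+ The same steps can also be wrapped in a `text2text-generation` pipeline; the sketch below assumes the same checkpoint id as above and default generation settings:
+
+ ```python
+ from transformers import pipeline
+
+ # Loads the tokenizer and model from the checkpoint and handles encoding/decoding internally.
+ us_to_uk = pipeline("text2text-generation", model="EnglishVoice/t5-base-us-to-uk-english")
+ outputs = us_to_uk("US to UK: I saw a guy in yellow sneakers at the subway station.")
+ print(outputs[0]["generated_text"])
+ ```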
config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "_name_or_path": "upload",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 3072,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_num_buckets": 32,
+   "task_specific_params": {
+     "summarization": {
+       "early_stopping": true,
+       "length_penalty": 2.0,
+       "max_length": 200,
+       "min_length": 30,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 4,
+       "prefix": "summarize: "
+     },
+     "translation_en_to_de": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to German: "
+     },
+     "translation_en_to_fr": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to French: "
+     },
+     "translation_en_to_ro": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to Romanian: "
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.17.0",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
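
These are standard t5-base hyperparameters; a minimal sketch (assuming the repository id from the README above) to read them back through transformers:

```python
from transformers import T5Config

# Parses config.json from the checkpoint; the values correspond to the file above.
config = T5Config.from_pretrained("EnglishVoice/t5-base-us-to-uk-english")
print(config.d_model, config.num_layers, config.num_heads)  # 768 12 12
```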
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd1712d439251f347ecbb7c1545df5385c649b906d5c60a65f4729b0599bedd7
+ size 891625348
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:802a64b11423fcbf2b35a7d7294c772d03c23abdd04af5c915370c154716f023
+ size 891730879
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
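
A quick check (again assuming the same repository id) that these entries are exposed by the tokenizer:

```python
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("EnglishVoice/t5-base-us-to-uk-english")
print(tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token)  # </s> <unk> <pad>
print(len(tokenizer.additional_special_tokens))  # 100 <extra_id_*> sentinel tokens
```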
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+ size 791656
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9cc2c1aca8df08625a1c8a7dee8303cf117d5355aa613fa327144753d626551
+ size 892144912
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff