d.tsimerman committed on
Commit
cc2664b
·
1 Parent(s): e4e7e44
README.md CHANGED
@@ -1,3 +1,43 @@
1
  ---
2
  license: mit
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ pipeline_tag: text-generation
4
+ widget:
5
+ - text: "@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@"
6
+ example_title: "how r u"
7
+ - text: "@@ПЕРВЫЙ@@ что ты делал на выходных? @@ВТОРОЙ@@"
8
+ example_title: "wyd"
9
+ language:
10
+ - ru
11
+ tags:
12
+ - conversational
13
  ---
14
+ This generation model is based on [sberbank-ai/rugpt3medium_based_on_gpt2](https://huggingface.co/sberbank-ai/rugpt3medium_based_on_gpt2). It's trained on a large corpus of dialog data and can be used for building generative conversational agents.
15
+
16
+ The model was trained with context size 3
17
+
18
+
19
+ On a validation set we calculated metrics introduced in [this paper](https://arxiv.org/pdf/2001.09977.pdf):
20
+ - Sensibleness: Operators are asked whether the model's response makes sense given the context
21
+ - Specificity: Operators are asked whether the model's response is specific to the given context; in other words, we don't want our model to give general and boring responses
22
+ - SSA (Sensibleness Specificity Average): the average of the two metrics above
23
+
24
+ | | sensibleness | specificity | SSA |
25
+ |:----------------------------------------------------|---------------:|--------------:|------:|
26
+ | [tinkoff-ai/ruDialoGPT-small](https://huggingface.co/tinkoff-ai/ruDialoGPT-small) | 0.64 | 0.5 | 0.57 |
27
+ | [tinkoff-ai/ruDialoGPT-medium](https://huggingface.co/tinkoff-ai/ruDialoGPT-medium) | 0.78 | 0.69 | 0.735 |
28
+
29
+
30
+ How to use:
31
+
32
+ ```python
33
+ import torch
34
+ from transformers import AutoTokenizer, AutoModelWithLMHead
35
+
36
+ tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
37
+ model = AutoModelWithLMHead.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
38
+ inputs = tokenizer('@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@', return_tensors='pt')
39
+ with torch.inference_mode():
40
+ generated_token_ids = model.generate(**inputs)
41
+ context_with_response = tokenizer.decode(generated_token_ids[0])
42
+ context_with_response
43
+ ```
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"@@ПЕРВЫЙ@@": 50257, "@@ВТОРОЙ@@": 50258, "<FIRST_SPEAKER>": 50259, "<SECOND_SPEAKER>": 50260}
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_loss": 3.4997777938842773,
4
+ "eval_runtime": 448.829,
5
+ "eval_samples": 40311,
6
+ "eval_samples_per_second": 89.814,
7
+ "eval_steps_per_second": 7.486,
8
+ "perplexity": 33.10809432022858,
9
+ "train_loss": 3.4927627832876236,
10
+ "train_runtime": 361845.2402,
11
+ "train_samples": 8570756,
12
+ "train_samples_per_second": 23.686,
13
+ "train_steps_per_second": 1.974
14
+ }
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "attn_pdrop": 0.1,
4
+ "bos_token_id": 50256,
5
+ "embd_pdrop": 0.1,
6
+ "eos_token_id": 50256,
7
+ "id2label": {
8
+ "0": "LABEL_0"
9
+ },
10
+ "initializer_range": 0.02,
11
+ "label2id": {
12
+ "LABEL_0": 0
13
+ },
14
+ "layer_norm_epsilon": 1e-05,
15
+ "model_type": "gpt2",
16
+ "n_ctx": 2048,
17
+ "n_embd": 1024,
18
+ "n_head": 16,
19
+ "n_inner": null,
20
+ "n_layer": 24,
21
+ "n_positions": 2048,
22
+ "n_special": 0,
23
+ "output_past": true,
24
+ "predict_special_tokens": true,
25
+ "reorder_and_upcast_attn": false,
26
+ "resid_pdrop": 0.1,
27
+ "scale_attn_by_inverse_layer_idx": false,
28
+ "scale_attn_weights": true,
29
+ "summary_activation": null,
30
+ "summary_first_dropout": 0.1,
31
+ "summary_proj_to_labels": true,
32
+ "summary_type": "cls_index",
33
+ "summary_use_proj": true,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.15.0",
36
+ "use_cache": true,
37
+ "vocab_size": 50261
38
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_loss": 3.4997777938842773,
4
+ "eval_runtime": 448.829,
5
+ "eval_samples": 40311,
6
+ "eval_samples_per_second": 89.814,
7
+ "eval_steps_per_second": 7.486,
8
+ "perplexity": 33.10809432022858
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63c43b3834b279073e85862dddf924c2c961ba882a1e34fdb1008e2e353dff3b
3
+ size 1524289497
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<pad>"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"errors": "replace", "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "special_tokens_map_file": null, "use_fast": true, "tokenizer_class": "GPT2Tokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 3.4927627832876236,
4
+ "train_runtime": 361845.2402,
5
+ "train_samples": 8570756,
6
+ "train_samples_per_second": 23.686,
7
+ "train_steps_per_second": 1.974
8
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff