Kirill Gelvan committed 0b9e0db ("add emoji and some code"), parent 505f6f7

README.md
---
language: ru
tags:
- conversational
---

### 📝 Description

DialoGPT trained on the Russian language and fine-tuned on my Telegram chat.

This model was created by [sberbank-ai](https://hf.co/sberbank-ai) and trained on Russian forums (see [Grossmend's model](https://hf.co/Grossmend/rudialogpt3_medium_based_on_gpt2)). You can find info about how it was trained on [habr](https://habr.com/ru/company/icl_services/blog/548244/) (in Russian). I have created a **simple pipeline** and **fine-tuned** that model on my own **exported Telegram chat** (~30 MB of JSON). It is in fact very easy to get the data from Telegram and fine-tune a model, so I made a **colab tutorial** for it: link

⚠️ Due to the specifics of the data, the Hosted Inference API may not work properly ⚠️
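
Getting the data out of Telegram really is the easy part. As a rough illustration (a sketch, not this card's actual pipeline), here is how a Telegram Desktop JSON export (`result.json`) might be flattened into (speaker, text) turns; the field layout follows Telegram's standard export format, while the filtering choices are assumptions:

```python
import json

# Read a Telegram Desktop export (Settings -> Advanced -> Export Telegram data -> JSON).
# Assumed layout: {"messages": [{"type": "message", "from": "...", "text": "..."}, ...]}
with open("result.json", encoding="utf-8") as f:
    chat = json.load(f)

turns = []
for msg in chat["messages"]:
    # Keep only plain text messages: skip service events and media,
    # and skip formatted messages whose "text" is a list of fragments.
    if msg.get("type") != "message" or not isinstance(msg.get("text"), str):
        continue
    if msg["text"].strip():
        turns.append((msg["from"], msg["text"]))

print(f"Collected {len(turns)} text turns")
```

Turns like these are what the `get_user_param` and `get_length_param` helpers below are meant to label before fine-tuning.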

### ❓ How to use

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Download model and tokenizer
checkpoint = "Kirili4ik/ruDialoGpt3-medium-finetuned-telegram"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model.eval()


# util function to get expected len after tokenizing
def get_length_param(text: str, tokenizer) -> str:
    tokens_count = len(tokenizer.encode(text))
    if tokens_count <= 15:
        len_param = '1'
    elif tokens_count <= 50:
        len_param = '2'
    elif tokens_count <= 256:
        len_param = '3'
    else:
        len_param = '-'
    return len_param


# util function to get next person number (1/0) for Machine or Human in the dialogue
def get_user_param(text: dict, machine_name_in_chat: str) -> str:
    if text['from'] == machine_name_in_chat:
        return '1'  # machine
    else:
        return '0'  # human
```
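
The snippet above stops at the helper functions, so here is a hedged sketch of one chat turn that continues from it. The `|{user}|{len}|` control prefix follows the convention of the base Grossmend/ruDialoGpt3 model (user turn tagged `|0|...|`, bot turn primed with `|1|...|`); the sampling parameters are illustrative defaults, not tuned recommendations:

```python
# Build one exchange: the user's phrase with its control prefix, then prime the bot's turn.
user_text = "Привет! Как дела?"
user_ids = tokenizer.encode(
    f"|0|{get_length_param(user_text, tokenizer)}|" + user_text + tokenizer.eos_token,
    return_tensors="pt",
)
bot_prefix_ids = tokenizer.encode("|1|2|", return_tensors="pt")  # '2' asks for a medium-length reply
input_ids = torch.cat([user_ids, bot_prefix_ids], dim=-1)

with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        max_length=512,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 tokenizers have no pad token
    )

# Decode only the tokens generated after the prompt
reply = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(reply)
```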