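# Multilingual GPT-2 chatbot: one fine-tunable causal language model per
# language (English, Spanish, French), a shared training loop, and a
# sampling-based response generator.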
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import json
import tqdm


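# Dataset of chat pairs loaded from JSON. Judging by the fields used in
# __getitem__, the file is assumed to be a list of objects like:
#   [{"input": "Hola", "output": "¡Hola! ¿En qué puedo ayudarte?"}, ...]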
class MultilingualChatData(Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = f"<startofstring> {item['input']} <bot>: {item['output']} <endofstring>"
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()

class MultilingualChatbot:
    def __init__(self):
        self.models = {
            'en': GPT2LMHeadModel.from_pretrained("microsoft/DialoGPT-medium"),
            'es': GPT2LMHeadModel.from_pretrained("DeepESP/gpt2-spanish"),
            'fr': GPT2LMHeadModel.from_pretrained("asi/gpt-fr-cased-small")
        }
        self.tokenizers = {
            'en': GPT2Tokenizer.from_pretrained("microsoft/DialoGPT-medium"),
            'es': GPT2Tokenizer.from_pretrained("DeepESP/gpt2-spanish"),
            'fr': GPT2Tokenizer.from_pretrained("asi/gpt-fr-cased-small")
        }
        for tokenizer in self.tokenizers.values():
            # Register the chat markers as special tokens so the BPE
            # tokenizer never splits them into subwords.
            tokenizer.add_special_tokens({
                "bos_token": "<startofstring>",
                "eos_token": "<endofstring>"
            })
            tokenizer.add_tokens(["<bot>:"])
            # GPT-2 tokenizers ship without a pad token; reuse EOS for padding.
            tokenizer.pad_token = tokenizer.eos_token
        
        # Resize each model's embeddings against its own tokenizer: the
        # vocabularies differ per language, so one shared size would be wrong.
        for lang, model in self.models.items():
            model.resize_token_embeddings(len(self.tokenizers[lang]))
        
        self.device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        for model in self.models.values():
            model.to(self.device)

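    # Fine-tune the model for one language. Passing labels=input_ids trains
    # with the standard causal-LM objective: predict each next token of the
    # "<startofstring> ... <bot>: ... <endofstring>" string (Hugging Face
    # shifts the labels internally).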
    def train(self, lang, data_file, epochs=5, batch_size=32, learning_rate=1e-4):
        model = self.models[lang]
        tokenizer = self.tokenizers[lang]
        
        chat_data = MultilingualChatData(data_file, tokenizer)
        data_loader = DataLoader(chat_data, batch_size=batch_size, shuffle=True)
        
        optimizer = Adam(model.parameters(), lr=learning_rate)
        
        model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch in tqdm.tqdm(data_loader, desc=f"Epoch {epoch+1}/{epochs}"):
                input_ids, attention_mask = [b.to(self.device) for b in batch]
                
                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
            
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(data_loader):.4f}")
        
        torch.save(model.state_dict(), f"model_state_{lang}.pt")

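    # Generate a reply via sampling (top-k/top-p with temperature) rather
    # than greedy decoding; no_repeat_ngram_size and repetition_penalty curb
    # loops. "<bot>:" was added with add_tokens(), not as a special token,
    # so it survives skip_special_tokens=True and the final split on it
    # isolates the bot's reply.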
    def generate_response(self, prompt, src_lang):
        model = self.models.get(src_lang, self.models['en'])
        tokenizer = self.tokenizers.get(src_lang, self.tokenizers['en'])
        
        input_text = f"<startofstring> {prompt} <bot>: "
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(self.device)
        
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=self.device)
        
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=1000,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=1,
            length_penalty=1.0,
            repetition_penalty=1.2
        )
        
        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
        return decoded_output.split("<bot>:")[-1].strip()

def initialize_chatbot():
    return MultilingualChatbot()

def get_chatbot_response(chatbot, prompt, src_lang):
    return chatbot.generate_response(prompt, src_lang)

# Example usage
if __name__ == "__main__":
    chatbot = initialize_chatbot()
    
    # Train the Spanish model (assuming you have a Spanish data file)
    chatbot.train('es', './spanish_chat_data.json', epochs=3)
    
    # Generate responses
    print(get_chatbot_response(chatbot, "Hola, ¿cómo estás?", 'es'))
    print(get_chatbot_response(chatbot, "Hello, how are you?", 'en'))
    print(get_chatbot_response(chatbot, "Bonjour, comment allez-vous?", 'fr'))