import pandas as pd
import streamlit as st
from datasets import Dataset
from transformers import (
    MT5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)


# Load the pretrained mT5 checkpoint and its SentencePiece tokenizer.
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")

# Load the proverb/meaning pairs and display them in the app (explicit
# st.write rather than relying on Streamlit's "magic" variable display).
df = pd.read_csv("proverbs.csv")
st.write(df)
dataset = Dataset.from_pandas(df)

def preprocess_function(examples):
    """Tokenize proverbs as model inputs and their meanings as labels."""
    inputs = examples["Proverb"]
    targets = examples["Meaning"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # text_target replaces the deprecated tokenizer.as_target_tokenizer() context manager.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")
    # Replace padding token ids in the labels with -100 so the loss ignores them.
    labels["input_ids"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
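
# A quick sanity check (a sketch, assuming proverbs.csv has at least one row
# with the "Proverb" and "Meaning" columns used above): preprocess the first
# example and confirm inputs and labels were padded to max_length=128.
sample = preprocess_function({"Proverb": [df["Proverb"][0]], "Meaning": [df["Meaning"][0]]})
print(len(sample["input_ids"][0]), len(sample["labels"][0]))  # expected: 128 128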


# Tokenize the full dataset, then hold out 20% of the rows for evaluation.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)

train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]

print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)

# Initialize the Trainer on the train/test split created above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
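
# Optional variant (a sketch, not wired in): if the preprocessing switches to
# dynamic padding (dropping padding="max_length"), pass a seq2seq collator so
# batches are padded on the fly and label padding is masked automatically:
#   from transformers import DataCollatorForSeq2Seq
#   data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
#   trainer = Trainer(..., data_collator=data_collator)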

# Fine-tune the model. Streamlit re-executes the whole script on every
# interaction, so this retrains on each rerun; in practice, train once
# offline (or cache with st.cache_resource) and only load the checkpoint.
trainer.train()

model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
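
# A minimal sketch of reloading the fine-tuned checkpoint in a separate run,
# using the same directory as the save_pretrained calls above:
#   tokenizer = T5Tokenizer.from_pretrained("./fine-tuned-mt5-marathi-proverbs")
#   model = MT5ForConditionalGeneration.from_pretrained("./fine-tuned-mt5-marathi-proverbs")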

prompt = st.text_input("Enter your proverb: ")

# Only run generation once the user has typed something.
if prompt:
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Generate the output
    output_ids = model.generate(input_ids, max_length=256)

    # Decode the output to text and display it
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    st.write(output_text)
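
# Optional decoding tweak (a sketch): greedy decoding can produce repetitive
# text; beam search with an n-gram repetition penalty is a common alternative.
# num_beams and no_repeat_ngram_size are standard generate() arguments:
#   output_ids = model.generate(input_ids, max_length=256, num_beams=4,
#                               no_repeat_ngram_size=2, early_stopping=True)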