ridahabbash committed
Commit 9865ad8 · verified · 1 Parent(s): d9c82ef

Create transformers.py

Files changed (1)
transformers.py +69 -0
transformers.py ADDED
@@ -0,0 +1,69 @@
+ # Note: avoid naming this file transformers.py in a real project; it
+ # shadows the transformers package and breaks the imports below.
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     DataCollatorForSeq2Seq,
+     Seq2SeqTrainer,
+     Seq2SeqTrainingArguments,
+ )
+ from datasets import load_dataset
+
+ # Step 1: Load the pre-trained model and tokenizer
+ model_name = "google/mt5-base"  # Example: mT5, a multilingual T5 variant
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ # Step 2: Load the dataset (a .jsonl file needs the "json" loader, not "csv")
+ dataset = load_dataset("json", data_files="accounting_data.jsonl")
+ # data_files yields only a "train" split; hold out a test split for evaluation
+ dataset = dataset["train"].train_test_split(test_size=0.1)  # 10% is an arbitrary choice
+
+ # Step 3: Preprocess the dataset
+ def preprocess_function(examples):
+     inputs = [f"Generate report: {ledger}" for ledger in examples["prompt"]]
+     targets = examples["completion"]
+     model_inputs = tokenizer(inputs, max_length=512, truncation=True)
+     # text_target= tokenizes the labels with the tokenizer's target-side settings
+     labels = tokenizer(text_target=targets, max_length=128, truncation=True)
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ # Drop the raw string columns so the data collator only sees token IDs
+ tokenized_datasets = dataset.map(
+     preprocess_function, batched=True, remove_columns=dataset["train"].column_names
+ )
+
+ # Step 4: Define training arguments. predict_with_generate exists on
+ # Seq2SeqTrainingArguments, not the base TrainingArguments, so the
+ # seq2seq variants are used throughout.
+ training_args = Seq2SeqTrainingArguments(
+     output_dir="./results",
+     evaluation_strategy="epoch",  # renamed to eval_strategy in newer releases
+     learning_rate=5e-5,
+     per_device_train_batch_size=8,
+     num_train_epochs=3,
+     save_total_limit=3,
+     predict_with_generate=True,
+ )
+
+ # Step 5: Initialize the trainer with a seq2seq collator that pads
+ # inputs and labels batch by batch
+ trainer = Seq2SeqTrainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["test"],
+     data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
+ )
+
+ # Step 6: Train the model
+ trainer.train()
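+
+ # Step 7 (optional): a minimal usage sketch after fine-tuning. The prompt
+ # below is a hypothetical placeholder, assuming the same
+ # "Generate report: ..." format used during preprocessing.
+ trainer.save_model("./results/final")
+ sample = tokenizer("Generate report: <ledger entries>", return_tensors="pt").to(model.device)
+ output_ids = model.generate(**sample, max_new_tokens=128)
+ print(tokenizer.decode(output_ids[0], skip_special_tokens=True))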