from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import json
import numpy as np
import subprocess
import sys

# Read requirements.txt file
with open('requirements.txt', 'r') as req_file:
    requirements = req_file.read().splitlines()

# Install the required libraries with pip, invoked through subprocess
# (note: the top-level imports above assume these packages are already available)
for requirement in requirements:
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])

# Load the IMDB dataset from JSON (expected format: a list of
# {"text": ..., "label": ...} records)
with open('IMDB Dataset.json', 'r') as json_file:
    imdb_data = json.load(json_file)

# Truncate each review to its first 30 words
preprocessed_data = []
for entry in imdb_data:
    text = entry['text']
    words = text.split()[:30]
    preprocessed_entry = {
        'text': ' '.join(words),
        'label': entry['label']
    }
    preprocessed_data.append(preprocessed_entry)

# Convert the preprocessed records to a datasets.Dataset;
# load_dataset() reads files, so build the dataset from the in-memory list instead
dataset = Dataset.from_list(preprocessed_data)

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
def tokenize_function(examples):
    # BLOOM's tokenizer has no finite model_max_length set, so give an
    # explicit max_length (64 tokens comfortably covers 30-word inputs)
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Fine-tune BLOOM for binary sentiment classification; the sequence
# classification head is newly initialized and learned during fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloom-560m", num_labels=2)

# Default training hyperparameters; checkpoints are written to ./test_trainer
training_args = TrainingArguments(output_dir="test_trainer")

metric = evaluate.load("accuracy")
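
# The original file ends here, before the training loop is wired up. Below is
# a minimal sketch of the remaining steps, following the standard Hugging Face
# Trainer recipe; the metric wiring, the 90/10 split, and the Trainer argument
# choices are assumptions, not recovered from the original file.

def compute_metrics(eval_pred):
    # Convert raw logits to class predictions and score them with the
    # accuracy metric loaded above
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Hold out a small evaluation set (assumed split; the original is unknown)
split = tokenized_datasets.train_test_split(test_size=0.1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()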