# Spaces: Build error
import json

from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
)
# Load requirements.txt and keep its contents as a list of lines
with open('requirements.txt', 'r') as requirements_handle:
    requirements_text = requirements_handle.read()
requirements = requirements_text.splitlines()
# Load the IMDB reviews from a local JSON file. Each entry is indexed below
# with 'text' and 'label', so the file must hold an iterable of such records.
# JSON is read as UTF-8 explicitly instead of relying on the platform default.
with open('IMDB Dataset.json', 'r', encoding='utf-8') as json_file:
    imdb_data = json.load(json_file)

# Keep only the first MAX_WORDS whitespace-separated words of every review;
# the label is carried over unchanged.
MAX_WORDS = 30
preprocessed_data = [
    {
        'text': ' '.join(entry['text'].split()[:MAX_WORDS]),
        'label': entry['label'],
    }
    for entry in imdb_data
]

# Build the dataset straight from the in-memory records. load_dataset('json', ...)
# reads from files (data_files=...) and cannot be handed a Python list, so
# Dataset.from_list is used instead. The result is wrapped in a DatasetDict
# with a single "train" split, which is the shape load_dataset returns when no
# split is requested.
dataset = DatasetDict({'train': Dataset.from_list(preprocessed_data)})
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")


def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples to a fixed token length.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; its
            "text" entry is passed to the tokenizer.
        max_length: Token count every sequence is padded or truncated to.
            The default of 128 is a judgement call for reviews that were cut
            to 30 words earlier in this script -- adjust if inputs change.

    Returns:
        The tokenizer output for the batch.
    """
    # max_length is passed explicitly: without it, padding="max_length" and
    # truncation=True fall back to the tokenizer's predefined model maximum,
    # and are deactivated when the checkpoint does not define one.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Fine-tune the Bloom model
# Load the checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bigscience/bloom-560m",
    num_labels=2,
)

# Training configuration; only the output directory is set explicitly.
training_args = TrainingArguments(
    output_dir="test_trainer",
)

import numpy as np
import evaluate

# Accuracy metric, loaded through the evaluate library.
metric = evaluate.load("accuracy")