Spaces:
Build error
Build error
File size: 1,358 Bytes
690a0b1 a3ff196 690a0b1 a3ff196 690a0b1 a3ff196 690a0b1 e39127e 690a0b1 a3ff196 690a0b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from datasets import load_dataset
import json
# Read requirements.txt and split it into a list of lines.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale encoding (the default when `encoding` is omitted).
# NOTE(review): `requirements` is not used by any code visible in this
# script -- confirm whether this read is still needed.
with open('requirements.txt', 'r', encoding='utf-8') as req_file:
    requirements = req_file.read().splitlines()
# Load the IMDB dataset from its JSON file.
# The file is decoded explicitly as UTF-8: without `encoding`, open() falls
# back to the locale encoding, which can mis-decode or reject non-ASCII
# text on platforms whose default is not UTF-8.
# The preprocessing step iterates over the parsed value and reads the
# 'text' and 'label' keys of each entry.
with open('IMDB Dataset.json', 'r', encoding='utf-8') as json_file:
    imdb_data = json.load(json_file)
# Shorten every entry to the first 30 whitespace-separated words of its
# text (re-joined with single spaces); the label is carried over unchanged.
preprocessed_data = [
    {
        'text': ' '.join(record['text'].split()[:30]),
        'label': record['label'],
    }
    for record in imdb_data
]
# Convert the preprocessed records (a list of dicts) into a dataset.
# `load_dataset('json', ...)` reads JSON *files* through `data_files`; it
# has no `data` argument for in-memory objects, so the list is wrapped
# directly with `Dataset.from_list` instead.
from datasets import Dataset  # only needed for this conversion step

dataset = Dataset.from_list(preprocessed_data)
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")


def tokenize_function(examples, max_length=64):
    """Tokenize the "text" column of a batch into fixed-length sequences.

    Args:
        examples: Batch handed over by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.
        max_length: Number of tokens each sequence is padded or truncated
            to. NOTE(review): 64 is a guess sized for the 30-word inputs
            this script builds -- confirm it suits the data.

    Returns:
        The tokenizer's output for the batch.
    """
    # `padding="max_length"` needs an explicit length: without one the
    # tokenizer falls back to its own `model_max_length`, which appears to
    # be unset for this checkpoint (TODO confirm), so padding/truncation
    # would be skipped or sized by an effectively unbounded default.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Prepare the pieces for fine-tuning: the BLOOM checkpoint loaded for
# sequence classification with two labels, and training arguments whose
# output directory is "test_trainer".
model = AutoModelForSequenceClassification.from_pretrained(
    "bigscience/bloom-560m",
    num_labels=2,
)
training_args = TrainingArguments(
    output_dir="test_trainer",
)

import numpy as np
import evaluate

# Accuracy metric, loaded through the `evaluate` library.
metric = evaluate.load("accuracy")
|