import csv
import subprocess
import sys

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments

# Read requirements.txt and install each dependency into the current environment
with open('requirements.txt', 'r') as req_file:
    requirements = req_file.read().splitlines()
for requirement in requirements:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])

# Load and preprocess the IMDB dataset from CSV
preprocessed_data = []
with open('IMDB Dataset.csv', 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        preprocessed_entry = {
            'text': row['review'],
            # Map the string sentiment to an integer class id, as required by
            # the sequence-classification head (assumed mapping: positive -> 1)
            'label': 1 if row['sentiment'] == 'positive' else 0,
        }
        preprocessed_data.append(preprocessed_entry)

# Convert the preprocessed data to a pandas DataFrame
df = pd.DataFrame(preprocessed_data)

# Convert the DataFrame to a datasets Dataset
dataset = Dataset.from_pandas(df)

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Fine-tune the BLOOM model as a two-class sentiment classifier
model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloom-560m", num_labels=2)
training_args = TrainingArguments(output_dir="test_trainer")

metric = evaluate.load("accuracy")
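# A minimal sketch of how the loaded accuracy metric and model would typically
# be wired into a Trainer; the compute_metrics helper and the 10% hold-out
# split below are illustrative assumptions, not part of the original script.
from transformers import Trainer

def compute_metrics(eval_pred):
    # The accuracy metric expects discrete class predictions, so take the
    # argmax over the logits before comparing against the reference labels.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Hypothetical split: hold out 10% of the tokenized examples for evaluation.
split = tokenized_datasets.train_test_split(test_size=0.1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    compute_metrics=compute_metrics,
)
trainer.train()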