text_tagging / app.py
seemapatil's picture
Update app.py
bf65949
raw
history blame
1.31 kB
import csv

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
# Read requirements.txt and keep its contents as a list, one entry per line.
with open('requirements.txt', 'r') as requirements_handle:
    requirements_text = requirements_handle.read()
requirements = requirements_text.splitlines()
# Load and preprocess the IMDB dataset from CSV.
# Each row's 'review' becomes 'text' and its 'sentiment' becomes an integer
# 'label', because a sequence-classification model expects class ids rather
# than label strings.
# NOTE(review): assumes the 'sentiment' column only holds "positive" /
# "negative" — confirm against the actual CSV.
label_to_id = {'negative': 0, 'positive': 1}
preprocessed_data = []
# newline='' lets the csv module handle line breaks inside quoted reviews;
# the explicit encoding avoids depending on the platform default.
with open('IMDB Dataset.csv', 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        sentiment = row['sentiment'].strip().lower()
        if sentiment not in label_to_id:
            # Fail loudly instead of training on a silently mislabeled row.
            raise ValueError(
                f"Unexpected sentiment label: {row['sentiment']!r}"
            )
        preprocessed_data.append({
            'text': row['review'],
            'label': label_to_id[sentiment],
        })

# Convert the preprocessed data to a dataset.
# load_dataset('csv', ...) only reads from files (via data_files=...) and
# cannot take in-memory rows, so the Dataset is built directly from columns.
dataset = Dataset.from_dict({
    'text': [entry['text'] for entry in preprocessed_data],
    'label': [entry['label'] for entry in preprocessed_data],
})
# Tokenize the dataset: start by loading the tokenizer for the Bloom checkpoint.
tokenizer_checkpoint = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
def tokenize_function(examples, max_length=512):
    """Tokenize the "text" entry of *examples* to a fixed token length.

    Args:
        examples: Mapping with a "text" entry holding the string(s) to
            encode (a batch when used with ``map(..., batched=True)``).
        max_length: Number of tokens every sequence is padded or truncated
            to. Defaults to 512.

    Returns:
        The result of calling the module-level ``tokenizer`` on the text.
    """
    # Without an explicit max_length, padding="max_length" and truncation=True
    # fall back to the tokenizer's own model_max_length.
    # NOTE(review): the Bloom tokenizer does not appear to define one, in
    # which case sequences would be left unpadded/untruncated — confirm.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
# Apply tokenize_function across the dataset, one batch of rows at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
# Fine-tune the Bloom model: load the checkpoint with a two-label
# sequence-classification head, then create the training arguments.
classifier_checkpoint = "bigscience/bloom-560m"
model = AutoModelForSequenceClassification.from_pretrained(
    classifier_checkpoint,
    num_labels=2,
)
training_args = TrainingArguments(output_dir="test_trainer")
import numpy as np
import evaluate
# Load the "accuracy" metric through the evaluate library.
metric_name = "accuracy"
metric = evaluate.load(metric_name)