eerrffuunn committed
Commit 46e8b69 · verified · 1 Parent(s): 227ce73

Update app.py

Files changed (1)
  1. app.py +51 -25
app.py CHANGED
@@ -1,29 +1,55 @@
-import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
+from datasets import Dataset
+import pandas as pd
 import torch
 
-# Load the tokenizer and model
-model_name = "roberta-large"  # Replace with your trained model if uploaded
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-# Define the prediction function
-def classify_text(text):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-    outputs = model(**inputs)
-    probabilities = torch.softmax(outputs.logits, dim=-1).tolist()[0]
-    labels = ["Speculating War Outcomes", "Discrediting Ukraine", "Praise of Russia"]  # Replace with your actual labels
-    predictions = {label: prob for label, prob in zip(labels, probabilities)}
-    return predictions
-
-# Create the Gradio interface
-demo = gr.Interface(
-    fn=classify_text,
-    inputs=gr.Textbox(lines=3, placeholder="Enter text to classify..."),
-    outputs=gr.Label(num_top_classes=3),
-    title="Narrative Classification",
-    description="Classify text into predefined narrative categories."
+# Load the dataset
+df = pd.read_csv("processed_step3.csv")
+
+# Prepare the dataset for Hugging Face
+def preprocess_data(row):
+    return {"text": row["full_text"], "labels": row["narratives"]}
+
+# Create a Dataset object
+hf_dataset = Dataset.from_pandas(df).map(preprocess_data)
+
+# Load pre-trained tokenizer and model
+tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+model = RobertaForSequenceClassification.from_pretrained(
+    "roberta-base", num_labels=len(set(df["narratives"])))
+
+# Tokenize the data
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+hf_dataset = hf_dataset.map(tokenize_function, batched=True)
+
+# Set Hugging Face TrainingArguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    per_device_train_batch_size=8,
+    num_train_epochs=3,
+    load_best_model_at_end=True,
+    logging_dir="./logs",
+    logging_steps=10,
+    push_to_hub=True,  # Push to Hugging Face Model Hub
+    hub_model_id="eerrffuunn/semeval-task"
+)
+
+# Trainer for training the model
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=hf_dataset["train"],
+    eval_dataset=hf_dataset["validation"],
+    tokenizer=tokenizer
 )
 
-# Launch the app
-demo.launch()
+# Train the model
+trainer.train()
+
+# Save the model and tokenizer
+trainer.save_model("semeval_model")
+tokenizer.save_pretrained("semeval_model")
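
A note on the new training code: as committed, it would likely fail at runtime for two reasons. Dataset.from_pandas(df) yields a single unsplit Dataset, so the hf_dataset["train"] and hf_dataset["validation"] lookups in the Trainer call would raise, and the labels column is filled with raw narrative strings, while RobertaForSequenceClassification expects integer class ids. A minimal sketch of one way to address both, assuming the narratives column holds exactly one label string per row (the 80/20 split and the seed are illustrative choices, not taken from the commit):

from datasets import Dataset
import pandas as pd

df = pd.read_csv("processed_step3.csv")

# Trainer expects integer class ids, so map each narrative string to an id.
label_list = sorted(set(df["narratives"]))
label2id = {label: i for i, label in enumerate(label_list)}

def preprocess_data(row):
    return {"text": row["full_text"], "labels": label2id[row["narratives"]]}

hf_dataset = Dataset.from_pandas(df).map(preprocess_data)

# Dataset.from_pandas() returns one unsplit Dataset; create an explicit
# train/eval split (the ratio and seed here are arbitrary).
splits = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]

The Trainer would then be wired with train_dataset=train_ds and eval_dataset=eval_ds instead of indexing hf_dataset directly. Separately, push_to_hub=True with hub_model_id="eerrffuunn/semeval-task" needs a write-scoped Hugging Face token available to the Space (for example via the HF_TOKEN secret), or the push step will fail.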
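
For completeness: the removed Gradio app loaded stock roberta-large, whose classification head is randomly initialized, so its scores over the three narrative labels were not meaningful. Below is a sketch of how the app could instead serve the checkpoint this script saves under semeval_model; the label order is an assumption and must match the integer ids used at training time (for example the sorted label2id mapping in the sketch above):

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# "semeval_model" is the directory the training script saves to.
tokenizer = AutoTokenizer.from_pretrained("semeval_model")
model = AutoModelForSequenceClassification.from_pretrained("semeval_model")
model.eval()

# Assumed to match the sorted label2id mapping used during training.
labels = ["Discrediting Ukraine", "Praise of Russia", "Speculating War Outcomes"]

def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1)[0].tolist()
    return dict(zip(labels, probabilities))

demo = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=3, placeholder="Enter text to classify..."),
    outputs=gr.Label(num_top_classes=3),
    title="Narrative Classification",
    description="Classify text into predefined narrative categories.",
)
demo.launch()

Setting id2label and label2id on the model config before saving would remove the need to hard-code the label list on the serving side.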