eerrffuunn committed
Commit 46e8b69 · verified · 1 Parent(s): 227ce73

Update app.py

Files changed (1)
  1. app.py +51 -25
app.py CHANGED
@@ -1,29 +1,55 @@
-import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
+from datasets import Dataset
+import pandas as pd
 import torch
 
-# Load the tokenizer and model
-model_name = "roberta-large"  # Replace with your trained model if uploaded
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-# Define the prediction function
-def classify_text(text):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-    outputs = model(**inputs)
-    probabilities = torch.softmax(outputs.logits, dim=-1).tolist()[0]
-    labels = ["Speculating War Outcomes", "Discrediting Ukraine", "Praise of Russia"]  # Replace with your actual labels
-    predictions = {label: prob for label, prob in zip(labels, probabilities)}
-    return predictions
-
-# Create the Gradio interface
-demo = gr.Interface(
-    fn=classify_text,
-    inputs=gr.Textbox(lines=3, placeholder="Enter text to classify..."),
-    outputs=gr.Label(num_top_classes=3),
-    title="Narrative Classification",
-    description="Classify text into predefined narrative categories."
+# Load the dataset
+df = pd.read_csv("processed_step3.csv")
+
+# Prepare the dataset for Hugging Face
+def preprocess_data(row):
+    return {"text": row["full_text"], "labels": row["narratives"]}
+
+# Create a Dataset object
+hf_dataset = Dataset.from_pandas(df).map(preprocess_data)
+
+# Load pre-trained tokenizer and model
+tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+model = RobertaForSequenceClassification.from_pretrained(
+    "roberta-base", num_labels=len(set(df["narratives"])))
+
+# Tokenize the data
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+hf_dataset = hf_dataset.map(tokenize_function, batched=True)
+
+# Set Hugging Face TrainingArguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    per_device_train_batch_size=8,
+    num_train_epochs=3,
+    load_best_model_at_end=True,
+    logging_dir="./logs",
+    logging_steps=10,
+    push_to_hub=True,  # Push to Hugging Face Model Hub
+    hub_model_id="eerrffuunn/semeval-task"
+)
+
+# Trainer for training the model
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=hf_dataset["train"],
+    eval_dataset=hf_dataset["validation"],
+    tokenizer=tokenizer
 )
 
-# Launch the app
-demo.launch()
+# Train the model
+trainer.train()
+
+# Save the model and tokenizer
+trainer.save_model("semeval_model")
+tokenizer.save_pretrained("semeval_model")
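
A note on the new training code: as committed, it would likely fail at runtime for two reasons. Dataset.from_pandas(df) yields a single unsplit Dataset, so the hf_dataset["train"] and hf_dataset["validation"] lookups in the Trainer call would raise, and the labels column is filled with raw narrative strings, while RobertaForSequenceClassification expects integer class ids. A minimal sketch of one way to address both, assuming the narratives column holds exactly one label string per row (the 80/20 split and the seed are illustrative choices, not taken from the commit):

from datasets import Dataset
import pandas as pd

df = pd.read_csv("processed_step3.csv")

# Trainer expects integer class ids, so map each narrative string to an id.
label_list = sorted(set(df["narratives"]))
label2id = {label: i for i, label in enumerate(label_list)}

def preprocess_data(row):
    return {"text": row["full_text"], "labels": label2id[row["narratives"]]}

hf_dataset = Dataset.from_pandas(df).map(preprocess_data)

# Dataset.from_pandas() returns one unsplit Dataset; create an explicit
# train/eval split (the ratio and seed here are arbitrary).
splits = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]

The Trainer would then be wired with train_dataset=train_ds and eval_dataset=eval_ds instead of indexing hf_dataset directly. Separately, push_to_hub=True with hub_model_id="eerrffuunn/semeval-task" needs a write-scoped Hugging Face token available to the Space (for example via the HF_TOKEN secret), or the push step will fail.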
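
For completeness: the removed Gradio app loaded stock roberta-large, whose classification head is randomly initialized, so its scores over the three narrative labels were not meaningful. Below is a sketch of how the app could instead serve the checkpoint this script saves under semeval_model; the label order is an assumption and must match the integer ids used at training time (for example the sorted label2id mapping in the sketch above):

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# "semeval_model" is the directory the training script saves to.
tokenizer = AutoTokenizer.from_pretrained("semeval_model")
model = AutoModelForSequenceClassification.from_pretrained("semeval_model")
model.eval()

# Assumed to match the sorted label2id mapping used during training.
labels = ["Discrediting Ukraine", "Praise of Russia", "Speculating War Outcomes"]

def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1)[0].tolist()
    return dict(zip(labels, probabilities))

demo = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=3, placeholder="Enter text to classify..."),
    outputs=gr.Label(num_top_classes=3),
    title="Narrative Classification",
    description="Classify text into predefined narrative categories.",
)
demo.launch()

Setting id2label and label2id on the model config before saving would remove the need to hard-code the label list on the serving side.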