eerrffuunn committed on
Commit
1749233
·
verified ·
1 Parent(s): 1020b4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -6
app.py CHANGED
@@ -1,7 +1,7 @@
1
  from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
2
- from datasets import Dataset
3
  import pandas as pd
4
- import torch
5
 
6
  # Load the dataset
7
  df = pd.read_csv("processed_step3.csv")
@@ -10,13 +10,20 @@ df = pd.read_csv("processed_step3.csv")
10
  def preprocess_data(row):
11
  return {"text": row["full_text"], "labels": row["narratives"]}
12
 
 
 
 
 
13
  # Create a Dataset object
14
- hf_dataset = Dataset.from_pandas(df).map(preprocess_data)
 
 
 
15
 
16
  # Load pre-trained tokenizer and model
17
  tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
18
  model = RobertaForSequenceClassification.from_pretrained(
19
- "roberta-base", num_labels=len(set(df["narratives"])))
20
 
21
  # Tokenize the data
22
  def tokenize_function(examples):
@@ -42,8 +49,8 @@ training_args = TrainingArguments(
42
  trainer = Trainer(
43
  model=model,
44
  args=training_args,
45
- train_dataset=hf_dataset["train"],
46
- eval_dataset=hf_dataset["validation"],
47
  tokenizer=tokenizer
48
  )
49
 
 
1
  from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
2
+ from datasets import Dataset, DatasetDict
3
  import pandas as pd
4
+ from sklearn.preprocessing import LabelEncoder
5
 
6
  # Load the dataset
7
  df = pd.read_csv("processed_step3.csv")
 
10
  def preprocess_data(row):
11
  return {"text": row["full_text"], "labels": row["narratives"]}
12
 
13
+ # Apply label encoding to narratives to turn them into numeric labels
14
+ label_encoder = LabelEncoder()
15
+ df["labels"] = label_encoder.fit_transform(df["narratives"])
16
+
17
  # Create a Dataset object
18
+ hf_dataset = Dataset.from_pandas(df)
19
+
20
+ # Split the dataset into train and validation sets (80/20 split)
21
+ hf_dataset = hf_dataset.train_test_split(test_size=0.2)
22
 
23
  # Load pre-trained tokenizer and model
24
  tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
25
  model = RobertaForSequenceClassification.from_pretrained(
26
+ "roberta-base", num_labels=len(label_encoder.classes_)) # Use the number of unique labels
27
 
28
  # Tokenize the data
29
  def tokenize_function(examples):
 
49
  trainer = Trainer(
50
  model=model,
51
  args=training_args,
52
+ train_dataset=hf_dataset["train"], # Train set
53
+ eval_dataset=hf_dataset["test"], # Held-out 20% split (train_test_split names it "test"), used for validation
54
  tokenizer=tokenizer
55
  )
56