rviana committed on
Commit
68ecee6
·
1 Parent(s): 556d74a

Add project files for IMDb sentiment analysis

Browse files
Files changed (3) hide show
  1. README.md +12 -0
  2. main.py +44 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -11,3 +11,15 @@ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+
16
+ # IMDb Sentiment Analysis
17
+
18
+ This project demonstrates a sentiment analysis model trained on the IMDb dataset using the Hugging Face Transformers library.
19
+
20
+ ## Installation
21
+
22
+ To install the necessary dependencies, run:
23
+
24
+ ```bash
25
+ pip install -r requirements.txt
main.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune DistilBERT for binary sentiment classification on an IMDb subset.

Running this file as a script loads the ``imdb`` dataset and the
``distilbert-base-uncased`` checkpoint, trains for one epoch on 1,000
shuffled training reviews, evaluates on 1,000 shuffled test reviews, and
prints the evaluation metrics.

NOTE(review): ``Trainer`` on PyTorch may additionally need the ``accelerate``
package installed -- confirm against requirements.txt.
"""
import dataclasses

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

MODEL_NAME = "distilbert-base-uncased"
SUBSET_SIZE = 1000  # Use a subset for quicker runs
SHUFFLE_SEED = 42


def _eval_strategy_kwarg():
    """Return the ``TrainingArguments`` keyword that sets the evaluation schedule.

    Transformers renamed ``evaluation_strategy`` to ``eval_strategy``; which
    spelling is accepted depends on the installed release, so the name is
    looked up on the dataclass instead of being hard-coded.
    """
    field_names = {field.name for field in dataclasses.fields(TrainingArguments)}
    if "eval_strategy" in field_names:
        return "eval_strategy"
    return "evaluation_strategy"


def main():
    """Train and evaluate the sentiment classifier, printing the metrics."""
    print(torch.cuda.is_available())

    # Load the IMDb dataset
    dataset = load_dataset("imdb")

    # Initialize the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    )

    def tokenize_function(examples):
        """Tokenize a batch of reviews, padding/truncating to the model maximum."""
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    def prepare_split(split_name):
        """Return a shuffled, tokenized ``SUBSET_SIZE``-row sample of one split."""
        # Subsample first so only the rows that are actually used get tokenized.
        sample = (
            dataset[split_name]
            .shuffle(seed=SHUFFLE_SEED)
            .select(range(SUBSET_SIZE))
        )
        return sample.map(tokenize_function, batched=True)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,  # Start with fewer epochs for quicker runs
        weight_decay=0.01,
        **{_eval_strategy_kwarg(): "epoch"},
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=prepare_split("train"),
        eval_dataset=prepare_split("test"),
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    results = trainer.evaluate()
    print(results)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ datasets
2
+ transformers
3
+ torch