seemapatil committed
Commit 1f75f58 · 1 Parent(s): 8fdf77c

Update app.py

Files changed (1)
  1. app.py +17 -15
app.py CHANGED
@@ -1,34 +1,36 @@
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
 from datasets import load_dataset
-import json
+import csv
 
 # Read requirements.txt file
 with open('requirements.txt', 'r') as req_file:
-requirements = req_file.read().splitlines()
+    requirements = req_file.read().splitlines()
 
+# Install the required libraries
+for requirement in requirements:
+    # Use your preferred method to install the libraries
+    # e.g., subprocess, pip, etc.
 
-# Load and preprocess the IMDB dataset in JSON format
-with open('IMDB Dataset.json', 'r') as json_file:
-    imdb_data = json.load(json_file)
-
-# Select only 30 words from the dataset
+# Load and preprocess the IMDB dataset from CSV
 preprocessed_data = []
-for entry in imdb_data:
-    text = entry['text']
-    words = text.split()[:30]
+with open('IMDB Dataset.csv', 'r') as csv_file:
+    csv_reader = csv.DictReader(csv_file)
+    for row in csv_reader:
+        text = row['review']
+        label = row['sentiment']
         preprocessed_entry = {
-            'text': ' '.join(words),
-            'label': entry['label']
-        }
+            'text': text,
+            'label': label
+        }
         preprocessed_data.append(preprocessed_entry)
 
 # Convert the preprocessed data to a dataset
-dataset = load_dataset('json', data=preprocessed_data)
+dataset = load_dataset('csv', data=preprocessed_data, delimiter=',')
 
 # Tokenize the dataset
 tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
 def tokenize_function(examples):
-return tokenizer(examples["text"], padding="max_length", truncation=True)
+    return tokenizer(examples["text"], padding="max_length", truncation=True)
 
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
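
As committed, the dependency-install loop in the new version has only comments in its body, so running app.py would raise an IndentationError before anything else happens. If the intent is to install each entry of requirements.txt at runtime (the commit's own comment suggests subprocess or pip), a minimal sketch of one way to fill in that body is below; it is illustrative, not part of the commit.

import subprocess
import sys

# Read requirements.txt, ignoring blank lines
with open('requirements.txt', 'r') as req_file:
    requirements = [line.strip() for line in req_file if line.strip()]

# Install each requirement with the pip belonging to the running interpreter
for requirement in requirements:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])

On Hugging Face Spaces the packages listed in requirements.txt are normally installed when the Space is built, so this runtime loop can usually be dropped entirely.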
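
The dataset-construction line is also unlikely to run as written: datasets.load_dataset reads from files passed via data_files (or from a dataset name on the Hub) and has no data= argument for an in-memory list, so load_dataset('csv', data=preprocessed_data, delimiter=',') would fail. Below is a minimal sketch of two working alternatives, assuming the CSV really has review and sentiment columns as the loop in the diff implies.

import csv
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

# Option 1: build a Dataset from the in-memory rows produced by the csv loop
preprocessed_data = []
with open('IMDB Dataset.csv', 'r') as csv_file:
    for row in csv.DictReader(csv_file):
        preprocessed_data.append({'text': row['review'], 'label': row['sentiment']})
dataset = Dataset.from_list(preprocessed_data)  # available in recent datasets releases

# Option 2: let datasets read the CSV directly and rename the columns to match
# dataset = load_dataset('csv', data_files='IMDB Dataset.csv')['train']
# dataset = dataset.rename_columns({'review': 'text', 'sentiment': 'label'})

# Tokenize as in the commit
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Note that the sentiment column holds strings ('positive'/'negative'); before fine-tuning AutoModelForSequenceClassification they would still need to be mapped to integer class ids.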