seemapatil committed on
Commit a3ff196 · 1 Parent(s): 6e1398d

Update app.py

Files changed (1)
  1. app.py +6 -10
app.py CHANGED
@@ -4,23 +4,19 @@ import json
 
 # Read requirements.txt file
 with open('requirements.txt', 'r') as req_file:
-requirements = req_file.read().splitlines()
+    requirements = req_file.read().splitlines()
 
-# Install the required libraries
-for requirement in requirements:
-    # Use your preferred method to install the libraries
-    # e.g., subprocess, pip, etc.
 
 # Load and preprocess the IMDB dataset in JSON format
 with open('IMDB Dataset.json', 'r') as json_file:
-imdb_data = json.load(json_file)
+    imdb_data = json.load(json_file)
 
 # Select only 30 words from the dataset
 preprocessed_data = []
 for entry in imdb_data:
-text = entry['text']
-words = text.split()[:30]
-preprocessed_entry = {
+    text = entry['text']
+    words = text.split()[:30]
+    preprocessed_entry = {
         'text': ' '.join(words),
         'label': entry['label']
     }
@@ -32,7 +28,7 @@ dataset = load_dataset('json', data=preprocessed_data)
 # Tokenize the dataset
 tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
 def tokenize_function(examples):
-return tokenizer(examples["text"], padding="max_length", truncation=True)
+    return tokenizer(examples["text"], padding="max_length", truncation=True)
 
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
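Note: the deleted loop was only a stub; it iterated over `requirements` without actually installing anything. A minimal sketch of what a runtime install step could look like, assuming pip is invoked through `subprocess` (this code is not part of the commit):

# Hypothetical sketch, not in the committed file: install each
# requirement listed in requirements.txt by invoking pip through
# the current Python interpreter.
import subprocess
import sys

with open('requirements.txt', 'r') as req_file:
    requirements = req_file.read().splitlines()

for requirement in requirements:
    if requirement.strip():  # skip blank lines
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])

On Hugging Face Spaces a step like this is normally unnecessary, since requirements.txt is installed automatically at build time, which may be why the stub was removed rather than implemented.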
 
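A further caveat, outside the scope of this commit: `load_dataset('json', data=preprocessed_data)` does not appear to match the documented `datasets` API, where the 'json' loader takes file paths through `data_files`. If the intent is to wrap the in-memory list built above, one hedged alternative (assuming the `datasets` library is available) is:

# Hypothetical alternative, not in the committed file: build a Dataset
# directly from the in-memory list of {'text', 'label'} dicts.
from datasets import Dataset

dataset = Dataset.from_list(preprocessed_data)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Built this way, `tokenized_datasets` is a single `Dataset` rather than the `DatasetDict` of splits that `load_dataset` returns.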