File size: 1,854 Bytes
e2bac03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw samples from disk; the 'text' and 'label' columns are used below.
dataFromCsv = pd.read_csv('test.csv')

# Copy both columns into plain Python lists.
text, label = dataFromCsv['text'].tolist(), dataFromCsv['label'].tolist()

# Echo the two lists, one per line, as a quick sanity check of the load.
print(text, label, sep="\n")

# Distinct label values in ascending order; a label's position becomes its id.
sorted_lable = list(dataFromCsv['label'].unique())
sorted_lable.sort()
print(sorted_lable)

# Label value -> integer id (index of the label in the sorted list).
lableList = dict(zip(sorted_lable, range(len(sorted_lable))))

# Overwrite the 'label' column with the integer ids.
encoded_column = dataFromCsv['label'].map(lableList)
dataFromCsv['label'] = encoded_column

print("label list = ", lableList)
print("from csv file label = ", dataFromCsv['label'])

 

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
train_df, test_df = train_test_split(
    dataFromCsv,
    test_size=0.2,
    random_state=42,
)

# Print each partition under its heading.
print("Training Set:", train_df, sep="\n")
print("\nTesting Set:", test_df, sep="\n")

# Rebind lableList to a hard-coded integer id -> label name lookup
# (ids follow the position of each name in the tuple below).
lableList = dict(
    enumerate(("lastmonth", "nextweek", "sevendays", "today", "yesterday"))
)

# Show a single lookup and then every (id, name) pair.
print("After = ", lableList[4])
print("label items = ", lableList.items())

# Test of the Dataset wrapper class (disabled: everything below is commented out).
# from torch.utils.data import Dataset
# import torch

# class IntentDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#             item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#             label = self.labels[idx]
#             item['labels'] = torch.tensor(self.labels[idx])
   
#             return item

#     def __len__(self):
#         return len(self.labels)

# # Sample data 
# encodings = { 
#     'input_ids': [[101, 102, 103], [104, 105, 106], [107, 108, 109]], 
#     'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1]] 
# } 
# labels = [0, 1, 0]

# dataset = IntentDataset(encodings, labels)
# dataset_length = len(dataset) 
# print(f"The dataset contains {dataset_length} items. {dataset.labels}")

# dataset_show = dataset[2]
# print(dataset_show)