import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled intent examples from CSV.
dataFromCsv = pd.read_csv('test.csv')
text = dataFromCsv['text'].tolist()
label = dataFromCsv['label'].tolist()
print(text)
print(label)

# Build a label -> integer id mapping from the sorted unique label names.
sorted_label = sorted(dataFromCsv['label'].unique())
print(sorted_label)
labelList = {label: i for i, label in enumerate(sorted_label)}
dataFromCsv['label'] = dataFromCsv['label'].map(labelList)
print("label list = ", labelList)
print("from csv file label = ", dataFromCsv['label'])

# Split the dataset into training and testing portions (80/20).
train_df, test_df = train_test_split(dataFromCsv, test_size=0.2, random_state=42)
print("Training Set:")
print(train_df)
print("\nTesting Set:")
print(test_df)

# Inverse mapping (id -> label name) for the five intents expected in test.csv.
labelList = {
    0: "lastmonth",
    1: "nextweek",
    2: "sevendays",
    3: "today",
    4: "yesterday"
}
print("After = ", labelList[4])
print("label items = ", labelList.items())

# Test of the IntentDataset wrapper
# from torch.utils.data import Dataset
# import torch
#
# class IntentDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels
#
#     def __getitem__(self, idx):
#         # Wrap each tokenizer field and the label as tensors for one sample.
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])
#         return item
#
#     def __len__(self):
#         return len(self.labels)
#
# # Sample data
# encodings = {
#     'input_ids': [[101, 102, 103], [104, 105, 106], [107, 108, 109]],
#     'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
# }
# labels = [0, 1, 0]
#
# dataset = IntentDataset(encodings, labels)
# dataset_length = len(dataset)
# print(f"The dataset contains {dataset_length} items. {dataset.labels}")
# dataset_show = dataset[2]
# print(dataset_show)
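
# Optional sketch: feeding IntentDataset with real tokenizer output instead of
# the hard-coded sample encodings above. This assumes the IntentDataset block
# is uncommented, the Hugging Face `transformers` package is installed, and
# "bert-base-uncased" is a suitable checkpoint; swap in whichever tokenizer
# matches the model you actually fine-tune.
#
# from transformers import AutoTokenizer
#
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# # Padding/truncation produce equal-length sequences for the whole split.
# train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
# train_dataset = IntentDataset(
#     {'input_ids': train_encodings['input_ids'],
#      'attention_mask': train_encodings['attention_mask']},
#     train_df['label'].tolist()
# )
# print(len(train_dataset), train_dataset[0])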