import pandas as pd
import torch
import transformers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.nn import ConstantPad1d
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)


def preprocess(data: pd.DataFrame, tokenizer: transformers.BertTokenizer):
    """
    Preprocesses the data.
    ======================
    Parameters:
        data (pd.DataFrame): The data to be preprocessed. Must contain
            `sentence` and `disease` columns.
        tokenizer (transformers.BertTokenizer): The tokenizer used to encode
            the sentences.
    ----------------------
    Returns:
        trainDataloader (DataLoader): The training dataloader.
        validationDataloader (DataLoader): The validation dataloader.
        testDataloader (DataLoader): The testing dataloader.
        labelEncoder (LabelEncoder): The fitted label encoder, mapping
            disease names to integer class IDs.
    """
sentences = data.sentence.values
labels = data.disease.values
encodedSentences = []
for sentence in sentences:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encSentence = tokenizer.encode(
sentence, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
            # This function also supports truncation and conversion to
            # PyTorch tensors, but we handle padding ourselves below, so
            # we don't use those options here.
            # max_length = 128,       # Truncate all sentences.
            # return_tensors = 'pt',  # Return PyTorch tensors.
)
# Add the encoded sentence to the list.
encodedSentences.append(encSentence)
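    # With bert-base-uncased, for example, every encoded sentence starts
    # with ID 101 ([CLS]) and ends with ID 102 ([SEP]); the IDs in between
    # are vocabulary indices specific to the tokenizer.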
    # Pad every sentence to a common length (the longest sentence plus a
    # little headroom) with token ID 0, which BERT treats as [PAD].
    MAX_LEN = max(len(sen) for sen in encodedSentences) + 10
    seq = [torch.tensor(sen) for sen in encodedSentences]
    padSequences = [ConstantPad1d((0, MAX_LEN - len(sen)), 0)(sen) for sen in seq]
    # Every padded sequence is already MAX_LEN long, so pad_sequence simply
    # stacks them into a single (num_sentences, MAX_LEN) tensor.
    encodedSentences = pad_sequence(padSequences, batch_first=True)
attentionMasks = []
for sentence in encodedSentences:
# Create the attention mask.
# - If a token ID is 0, then it's padding, set the mask to 0.
# - If a token ID is > 0, then it's a real token, set the mask to 1.
attMask = [int(token_id > 0) for token_id in sentence]
# Store the attention mask for this sentence.
attentionMasks.append(attMask)
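    # For instance, padded IDs [101, 1045, 2031, 102, 0, 0] produce the
    # mask [1, 1, 1, 1, 0, 0].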
    # Encode the string disease labels as integer class IDs.
    labelEncoder = preprocessing.LabelEncoder()
    labels = labelEncoder.fit_transform(labels)  # type: ignore
    # Hold out 30% of the data for testing.
    trainingSentences, testingSentences, trainingLabels, testingLabels = train_test_split(
        encodedSentences, labels, test_size=0.3, random_state=2018)
    # Use 90% of the remainder for training and 10% for validation.
    trainInputs, validationInputs, trainLabels, validationLabels = train_test_split(
        trainingSentences, trainingLabels, random_state=2018, test_size=0.1)
    # Split the masks the same way; reusing the same random_state and
    # test_size keeps each mask aligned with its sentence.
    trainingMasks, testingMasks, _, _ = train_test_split(
        attentionMasks, labels, random_state=2018, test_size=0.3)
    trainMasks, validationMasks, _, _ = train_test_split(
        trainingMasks, trainingLabels, random_state=2018, test_size=0.1)
    # Convert all inputs and labels into torch tensors, the required datatype
    # for our model. The input splits are already tensors (slices of the
    # padded batch), so clone/detach them rather than re-wrapping them with
    # torch.tensor, which warns about copy-construction.
    trainInputs = trainInputs.clone().detach()
    validationInputs = validationInputs.clone().detach()
    testInputs = testingSentences.clone().detach()
    trainLabels = torch.tensor(trainLabels)
    validationLabels = torch.tensor(validationLabels)
    testLabels = torch.tensor(testingLabels)
    trainMasks = torch.tensor(trainMasks)
    validationMasks = torch.tensor(validationMasks)
    testMasks = torch.tensor(testingMasks)
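    # The BERT authors recommend a fine-tuning batch size of 16 or 32.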
batchSize = 32
    trainData = TensorDataset(trainInputs, trainMasks, trainLabels)
trainSampler = RandomSampler(trainData)
trainDataloader = DataLoader(trainData, sampler=trainSampler, batch_size=batchSize)
validationData = TensorDataset(validationInputs, validationMasks, validationLabels)
validationSampler = SequentialSampler(validationData)
validationDataloader = DataLoader(validationData, sampler=validationSampler, batch_size=batchSize)
testData = TensorDataset(testInputs, testMasks, testLabels)
testSampler = SequentialSampler(testData)
testDataloader = DataLoader(testData, sampler=testSampler, batch_size=batchSize)
return trainDataloader, validationDataloader, testDataloader, labelEncoder
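

if __name__ == "__main__":
    # Minimal usage sketch, assuming a CSV file with `sentence` and `disease`
    # columns. The file name `diseases.csv` is a placeholder, and
    # `bert-base-uncased` is just one possible pretrained tokenizer.
    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
    data = pd.read_csv("diseases.csv")
    trainDl, validationDl, testDl, labelEncoder = preprocess(data, tokenizer)
    print(f"Train batches: {len(trainDl)}, classes: {len(labelEncoder.classes_)}")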