Spaces:

greco
/

survey_analytics_spaces

Runtime error

App Files Files Community

survey_analytics_spaces / survey_analytics_library.py

greco

update library

0c0c0b8 almost 3 years ago

raw

history blame contribute delete

5.25 kB


	# imports
	import pandas as pd
	import re



	# replace text with multiple replacements
	def replace_text(string, dict_of_replacements):
	    '''
	    apply a fixed set of literal substring substitutions to a string

	    every (old, new) pair is applied with str.replace() in the dictionary's iteration order,
	    so a later pair also acts on text produced by an earlier pair
	    meant for plain substitutions that need no regex; for regex based cleaning use clean_text()

	    arguments:
	    string (str): text to run the substitutions on
	    dict_of_replacements (dict): maps each substring to look for to its replacement
	        e.g. {'to replace this':'with this',...}

	    returns:
	    the text with all substitutions applied
	    '''
	    result = string
	    # each pass works on the output of the previous pass
	    for old_substring, new_substring in dict_of_replacements.items():
	        result = result.replace(old_substring, new_substring)
	    return result



	# clean text string
	def clean_text(text_string, list_of_replacements, lowercase=True, ignorecase=False):
	    r'''
	    clean a text string with user defined regex substitutions

	    the string is optionally lower cased first, then every (pattern, replacement) pair is
	    applied in list order with re.sub(), each one working on the result of the previous one
	    note that lower casing happens before the substitutions, so with lowercase=True the
	    patterns are matched against the lower cased text

	    arguments:
	    text_string (str): text string to clean
	    list_of_replacements (list): a list of tuples consisting of regex pattern and replacement value
	        e.g. [(r'[^a-z\s]+', ''), ...]
	    lowercase (bool): default to True, if True, convert text to lowercase
	    ignorecase (bool): default to False, if True, apply re.sub() with re.IGNORECASE

	    returns:
	    a cleaned text string
	    '''
	    clean_string = text_string.lower() if lowercase else text_string
	    # flags=0 is re.sub()'s own default, so a single loop covers both modes
	    flags = re.IGNORECASE if ignorecase else 0
	    for pattern, replacement in list_of_replacements:
	        clean_string = re.sub(pattern, replacement, clean_string, flags=flags)
	    return clean_string




	# convert transformer model zero shot classification prediction into dataframe
	def convert_zero_shot_classification_output_to_dataframe(model_output):
	    '''
	    convert zero shot classification output to dataframe

	    the model's prediction is a list of dictionaries, each holding the sequence being
	    predicted, the user defined labels, and the respective scores
	    [
	        {'sequence': 'the organisation is generally...',
	         'labels': ['rewards', 'resourcing', 'leadership'],
	         'scores': [0.905086100101471, 0.06712279468774796, 0.027791114524006844]},
	        ...
	    ]
	    the function pairs the labels and scores, spreads them into one column per label,
	    and identifies the label with the highest score

	    arguments:
	    model_output (list): output from transformer.pipeline(task='zero-shot-classification')
	        a single prediction given as a bare dict is treated as a list of one

	    returns:
	    a dataframe with one row per prediction containing the original keys other than
	    'labels' and 'scores', a 'labels_scores' dict column, one score column per label,
	    and 'label' / 'score' columns for the best label and its score
	    an empty dataframe with columns sequence, labels_scores, label, score if there are no predictions
	    '''
	    # a bare dict would otherwise be expanded into one row per label by pd.DataFrame
	    if isinstance(model_output, dict):
	        model_output = [model_output]
	    # store results as dataframe
	    results = pd.DataFrame(model_output)
	    if results.empty:
	        return pd.DataFrame(columns=['sequence', 'labels_scores', 'label', 'score'])
	    # pair each prediction's labels with their scores
	    label_score_maps = [
	        dict(zip(labels, scores))
	        for labels, scores in zip(results['labels'], results['scores'])
	    ]
	    results['labels_scores'] = pd.Series(label_score_maps, index=results.index, dtype=object)
	    # one numeric column per label
	    labels_scores = pd.DataFrame(label_score_maps, index=results.index)
	    # compute both aggregates while the frame still holds only numeric score columns,
	    # taking the row maximum after adding the string 'label' column fails on pandas >= 2.0
	    best_label = labels_scores.idxmax(axis=1)
	    best_score = labels_scores.max(axis=1)
	    labels_scores['label'] = best_label
	    labels_scores['score'] = best_score
	    # drop the raw list columns and append the per label scores
	    results = pd.concat([results.drop(columns=['labels', 'scores']), labels_scores], axis=1)
	    return results


	# convert transformer model sentiment classification prediction into dataframe
	def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
	    '''
	    convert sentiment classification output into a dataframe

	    the model used distilbert-base-uncased-finetuned-sst-2-english outputs a list of lists with two dictionaries,
	    within each dictionary is a label negative or positive and the respective score
	    [
	        [
	            {'label': 'NEGATIVE', 'score': 0.18449656665325165},
	            {'label': 'POSITIVE', 'score': 0.8155034780502319}
	        ],
	        ...
	    ]
	    only the positive score of each prediction is extracted and paired with the model's input
	    the positive entry is located by its label (case insensitive) rather than by position,
	    so the result does not depend on the order in which the pipeline lists the labels
	    if no entry is labelled positive, the second entry is used

	    arguments:
	    text_input (list): a list of sequences that is input for the model
	    model_output (list): a list of labels and scores, one inner list per sequence

	    return:
	    a dataframe of sequences and sentiment score

	    raises:
	    KeyError if a prediction has no positive label and fewer than two entries
	    '''
	    positive_scores = []
	    for prediction in model_output:
	        positive_entry = None
	        for entry in prediction:
	            if str(entry.get('label')).upper() == 'POSITIVE':
	                positive_entry = entry
	                break
	        if positive_entry is None:
	            # no explicit positive label: fall back to the second entry
	            if len(prediction) < 2:
	                raise KeyError('no POSITIVE label and no second entry in prediction')
	            positive_entry = prediction[1]
	        positive_scores.append(positive_entry.get('score'))
	    # plain lists are paired by position, so no index alignment is involved
	    return pd.DataFrame({'sequence': text_input, 'score': positive_scores})