import streamlit as st

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import matplotlib.pyplot as plt
import torch


@st.cache_resource
def load_model():
    # Cache the model and tokenizer so they are loaded once per process
    # rather than on every Streamlit rerun.
    model = AutoModel.from_pretrained("aarnow/distilbert-base-uncased-1212-test")
    tokenizer = AutoTokenizer.from_pretrained("aarnow/distilbert-base-uncased-1212-test")
    return model, tokenizer


model, tokenizer = load_model()

with st.sidebar:
    st.title('Technical Demonstration')
    st.header('powered by rascal')
    st.markdown('''
## About
This tool demonstrates the text classification and PII redaction capabilities of the auditory skills model.
PII redaction is powered by Microsoft's Presidio, and the text classification model is trained on a
combination of synthetic and human-annotated data from the HATCH (Helping Adults Talk to Children) Lab
at Idaho State University. Erber's Hierarchy is used to benchmark the text classification model.
''')

def main():
    st.subheader("Enter Text for Evaluation")

    sentence = st.text_input('Type text to classify below')
    if sentence != "":
        # Detect PII entities in the input text with Presidio, then redact them.
        analyzer = AnalyzerEngine()
        results = analyzer.analyze(text=sentence, language='en')

        anonymizer = AnonymizerEngine()
        anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=results)
        st.markdown("**Your text with PII redacted:** " + anonymized_text.text)
        # Show the raw analyzer results (entity type, character span, confidence score).
        st.text(results)

        st.subheader("Classification")

        # Candidate labels: the four levels of Erber's Hierarchy of auditory skills.
        labels = ['DETECTION', 'DISCRIMINATION', 'IDENTIFICATION', 'COMPREHENSION']

        # Encode the sentence together with the candidate labels in one padded batch.
        inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                             return_tensors='pt',
                                             padding=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Mean-pool the token embeddings to get one vector for the sentence
        # and one vector for each label.
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)[0]
        sentence_rep = output[:1].mean(dim=1)
        label_reps = output[1:].mean(dim=1)

        # Rank the labels by cosine similarity to the sentence embedding.
        similarities = F.cosine_similarity(sentence_rep, label_reps)
        closest = similarities.argsort(descending=True)
        st.markdown("The classification that best fits your entry is: " + labels[closest[0]])

        # Scatter plot of the first two embedding dimensions of the label vectors
        # and the sentence vector (a rough 2D view, not a formal projection).
        tensor_datalbl = label_reps.detach()
        x_values = tensor_datalbl[:, 0].numpy()
        y_values = tensor_datalbl[:, 1].numpy()

        fig, ax = plt.subplots()
        ax.scatter(x_values, y_values)
        for i in range(len(tensor_datalbl)):
            ax.text(x_values[i], y_values[i], labels[i], fontsize=8, ha='right', va='bottom')

        tensor_datasen = sentence_rep.detach()
        x_values = tensor_datasen[:, 0].numpy()
        y_values = tensor_datasen[:, 1].numpy()
        ax.scatter(x_values, y_values)

        ax.set_title('2D Representation of Sentence and Label Embeddings')
        ax.set_xlabel('Embedding dimension 0')
        ax.set_ylabel('Embedding dimension 1')

        st.pyplot(fig)

        st.subheader("Classification Details")
        # List every label with its cosine similarity, best match first.
        for ind in closest:
            st.write(f'label: {labels[ind]} \t similarity: {similarities[ind].item():.4f}')


if __name__ == '__main__':
    main()
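
# How to run this demo locally: a sketch, assuming the script is saved as app.py
# (the file name is an assumption) and that the standard PyPI packages below are used.
# Presidio's default NLP engine relies on a spaCy model, hence the extra download step.
#
#   pip install streamlit presidio-analyzer presidio-anonymizer transformers torch matplotlib
#   python -m spacy download en_core_web_lg
#   streamlit run app.py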