aftersix committed on
Commit
88cf11f
·
1 Parent(s): be29538

initial commit

Browse files
Files changed (1) hide show
  1. app.py +98 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from presidio_analyzer import AnalyzerEngine
3
+ from presidio_anonymizer import AnonymizerEngine
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from torch.nn import functional as F
6
+ import matplotlib.pyplot as plt
7
+ import torch
8
# Hugging Face Hub id shared by the encoder and its tokenizer.
MODEL_NAME = "aarnow/distilbert-base-uncased-1212-test"


@st.cache_resource
def _load_model_and_tokenizer(name=MODEL_NAME):
    """Load the encoder and its tokenizer once and reuse them.

    Streamlit re-executes this script on every user interaction; without
    caching, both ``from_pretrained`` calls would run again on each rerun.

    Args:
        name: Hub id (or local path) of the checkpoint to load.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return AutoModel.from_pretrained(name), AutoTokenizer.from_pretrained(name)


# Module-level names are kept so the rest of the file can keep using them.
model, tokenizer = _load_model_and_tokenizer()
10
+
11
+
12
# Sidebar: app title, attribution, and a short "About" description.
# Uses object notation (st.sidebar.<element>) to place elements in the sidebar.
st.sidebar.title('Technical Demonstration')
st.sidebar.header('powered by rascal')
st.sidebar.markdown('''
## About
This is a tool that shows the classification and PII redaction capabilities of the auditory skills model. PII redaction is powered by Microsoft's presidio tool and the text classification model is trained on a combination of synthetic and human annotated data from the HATCH (Helping Adults Talk to Children) Lab at Idaho State University. Erber's Hierarchy is used to benchmark the text classification model.

''')
20
+
21
+
22
+
23
@st.cache_resource
def _pii_engines():
    """Build the Presidio analyzer and anonymizer once and reuse them across reruns."""
    return AnalyzerEngine(), AnonymizerEngine()


def _masked_mean(hidden_states, attention_mask):
    """Mean-pool token embeddings over the sequence axis, ignoring padding.

    Args:
        hidden_states: Tensor indexed as (batch, sequence, hidden).
        attention_mask: Tensor indexed as (batch, sequence); 1 for real
            tokens and 0 for padding.

    Returns:
        Tensor indexed as (batch, hidden) with one pooled vector per input.
    """
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden_states * mask).sum(dim=1) / token_counts


def main():
    """Render the page: redact PII from the entered text, then classify it.

    The text is classified by comparing its pooled embedding with the pooled
    embeddings of the four label names using cosine similarity. The best
    label, a scatter plot, and the per-label similarities are written to the
    Streamlit page. Nothing is rendered below the input box while the input
    is empty or whitespace-only.
    """
    import io

    st.subheader("Enter Text for Evaluation")

    sentence = st.text_input('Type text to classify below')
    if not sentence.strip():
        return

    # --- PII redaction ---
    analyzer, anonymizer = _pii_engines()
    results = analyzer.analyze(text=sentence, language='en')
    # Analyzer results are passed to the anonymizer for redaction.
    anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=results)
    st.markdown("**Your text with PII redacted:** " + anonymized_text.text)
    st.text(results)

    # --- Classification ---
    st.subheader("Classification Details")
    labels = ['DETECTION', 'DISCRIMINATION', 'IDENTIFICATION', 'COMPREHENSION']

    # Encode the sentence together with the label names; padding=True pads
    # to the longest item in the batch (replaces deprecated pad_to_max_length).
    inputs = tokenizer([sentence] + labels, return_tensors='pt', padding=True)
    attention_mask = inputs['attention_mask']
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(inputs['input_ids'], attention_mask=attention_mask)[0]

    # Pool over real tokens only, so padding added to the short label
    # texts does not distort their representations.
    pooled = _masked_mean(output, attention_mask)
    sentence_rep = pooled[:1]
    label_reps = pooled[1:]

    # Rank the labels by cosine similarity to the sentence.
    similarities = F.cosine_similarity(sentence_rep, label_reps)
    closest = similarities.argsort(descending=True)
    st.markdown("The classification that best fits your entry is: " + labels[int(closest[0])])

    # --- Plot ---
    # Use an explicit figure: the implicit pyplot figure persists between
    # Streamlit reruns and would accumulate points from earlier inputs.
    fig, ax = plt.subplots()

    # Only the first two embedding dimensions are plotted.
    label_xy = label_reps[:, :2].numpy()
    ax.scatter(label_xy[:, 0], label_xy[:, 1])
    for (x_pos, y_pos), label in zip(label_xy, labels):
        ax.text(x_pos, y_pos, label, fontsize=8, ha='right', va='bottom')

    sentence_xy = sentence_rep[:, :2].numpy()
    ax.scatter(sentence_xy[:, 0], sentence_xy[:, 1])

    ax.set_title('2D Representation of Similarity Estimates (2D)')
    ax.set_xlabel('X-axis')
    ax.set_ylabel('Y-axis')

    # Render to memory instead of a shared 'foo.png' on disk, which
    # concurrent sessions would overwrite for each other.
    buffer = io.BytesIO()
    fig.savefig(buffer, format='png', bbox_inches='tight')
    plt.close(fig)
    buffer.seek(0)
    st.image(buffer)

    st.subheader("Classification Details")
    for ind in closest.tolist():
        st.write(f'label: {labels[ind]} \t similarity: {similarities[ind]}')
94
+
95
+
96
# Entry point: build the page when this file is executed as a script.
if __name__ == "__main__":
    main()