import streamlit as st

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import matplotlib.pyplot as plt
import torch


@st.cache_resource
def load_model():
    # Cache the model and tokenizer so they are loaded once per process
    # rather than on every Streamlit rerun.
    model = AutoModel.from_pretrained("aarnow/distilbert-base-uncased-1212-test")
    tokenizer = AutoTokenizer.from_pretrained("aarnow/distilbert-base-uncased-1212-test")
    return model, tokenizer


model, tokenizer = load_model()

with st.sidebar:
    st.title('Technical Demonstration')
    st.header('powered by rascal')
    st.markdown('''
## About
This tool demonstrates the text classification and PII redaction capabilities of the auditory skills model.
PII redaction is powered by Microsoft's Presidio, and the text classification model is trained on a
combination of synthetic and human-annotated data from the HATCH (Helping Adults Talk to Children) Lab
at Idaho State University. Erber's Hierarchy is used to benchmark the text classification model.
''')

def main():
    st.subheader("Enter Text for Evaluation")

    sentence = st.text_input('Type text to classify below')
    if sentence != "":
        # Detect PII entities in the input text with Presidio, then redact them.
        analyzer = AnalyzerEngine()
        results = analyzer.analyze(text=sentence, language='en')

        anonymizer = AnonymizerEngine()
        anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=results)
        st.markdown("**Your text with PII redacted:** " + anonymized_text.text)
        # Show the raw analyzer results (entity type, character span, confidence score).
        st.text(results)

        st.subheader("Classification")

        # Candidate labels: the four levels of Erber's Hierarchy of auditory skills.
        labels = ['DETECTION', 'DISCRIMINATION', 'IDENTIFICATION', 'COMPREHENSION']

        # Encode the sentence together with the candidate labels in one padded batch.
        inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                             return_tensors='pt',
                                             padding=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Mean-pool the token embeddings to get one vector for the sentence
        # and one vector for each label.
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)[0]
        sentence_rep = output[:1].mean(dim=1)
        label_reps = output[1:].mean(dim=1)

        # Rank the labels by cosine similarity to the sentence embedding.
        similarities = F.cosine_similarity(sentence_rep, label_reps)
        closest = similarities.argsort(descending=True)
        st.markdown("The classification that best fits your entry is: " + labels[closest[0]])

        # Scatter plot of the first two embedding dimensions of the label vectors
        # and the sentence vector (a rough 2D view, not a formal projection).
        tensor_datalbl = label_reps.detach()
        x_values = tensor_datalbl[:, 0].numpy()
        y_values = tensor_datalbl[:, 1].numpy()

        fig, ax = plt.subplots()
        ax.scatter(x_values, y_values)
        for i in range(len(tensor_datalbl)):
            ax.text(x_values[i], y_values[i], labels[i], fontsize=8, ha='right', va='bottom')

        tensor_datasen = sentence_rep.detach()
        x_values = tensor_datasen[:, 0].numpy()
        y_values = tensor_datasen[:, 1].numpy()
        ax.scatter(x_values, y_values)

        ax.set_title('2D Representation of Sentence and Label Embeddings')
        ax.set_xlabel('Embedding dimension 0')
        ax.set_ylabel('Embedding dimension 1')

        st.pyplot(fig)

        st.subheader("Classification Details")
        # List every label with its cosine similarity, best match first.
        for ind in closest:
            st.write(f'label: {labels[ind]} \t similarity: {similarities[ind].item():.4f}')


if __name__ == '__main__':
    main()
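
# How to run this demo locally: a sketch, assuming the script is saved as app.py
# (the file name is an assumption) and that the standard PyPI packages below are used.
# Presidio's default NLP engine relies on a spaCy model, hence the extra download step.
#
#   pip install streamlit presidio-analyzer presidio-anonymizer transformers torch matplotlib
#   python -m spacy download en_core_web_lg
#   streamlit run app.py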