Ryan Kim
adding these files as a backup of an older project that got mangled by Git LFS's size limit
6410115
import os | |
import json | |
import random | |
import streamlit as st | |
from transformers import TextClassificationPipeline, pipeline | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizerFast, DistilBertForSequenceClassification | |
emotion_model_names = ( | |
"cardiffnlp/twitter-roberta-base-sentiment", | |
"finiteautomata/beto-sentiment-analysis", | |
"bhadresh-savani/distilbert-base-uncased-emotion", | |
"siebert/sentiment-roberta-large-english" | |
) | |
class ModelImplementation(object): | |
def __init__( | |
self, | |
transformer_model_name, | |
model_transformer, | |
tokenizer_model_name, | |
tokenizer_func, | |
pipeline_func, | |
parser_func, | |
classifier_args={}, | |
placeholders=[""] | |
): | |
self.transformer_model_name = transformer_model_name | |
self.tokenizer_model_name = tokenizer_model_name | |
self.placeholders = placeholders | |
self.model = model_transformer.from_pretrained(self.transformer_model_name) | |
self.tokenizer = tokenizer_func.from_pretrained(self.tokenizer_model_name) | |
self.classifier = pipeline_func(model=self.model, tokenizer=self.tokenizer, padding=True, truncation=True, **classifier_args) | |
self.parser = parser_func | |
def predict(self, val): | |
result = self.classifier(val) | |
return self.parser(self, result) | |
def ParseEmotionOutput(self, result): | |
label = result[0]['label'] | |
score = result[0]['score'] | |
output_func = st.info | |
if self.transformer_model_name == "cardiffnlp/twitter-roberta-base-sentiment": | |
if label == "LABEL_0": | |
label = "NEGATIVE" | |
output_func = st.error | |
elif label == "LABEL_2": | |
label = "POSITIVE" | |
output_func = st.success | |
else: | |
label = "NEUTRAL" | |
elif self.transformer_model_name == "finiteautomata/beto-sentiment-analysis": | |
if label == "NEG": | |
label = "NEGATIVE" | |
output_func = st.error | |
elif label == "POS": | |
label = "POSITIVE" | |
output_func = st.success | |
else: | |
label = "NEUTRAL" | |
elif self.transformer_model_name == "bhadresh-savani/distilbert-base-uncased-emotion": | |
if label == "sadness": | |
output_func = st.info | |
elif label == "joy": | |
output_func = st.success | |
elif label == "love": | |
output_func = st.success | |
elif label == "anger": | |
output_func = st.error | |
elif label == "fear": | |
output_func = st.info | |
elif label == "surprise": | |
output_func = st.error | |
label = label.upper() | |
elif self.transformer_model_name == "siebert/sentiment-roberta-large-english": | |
if label == "NEGATIVE": | |
output_func = st.error | |
elif label == "POSITIVE": | |
output_func = st.success | |
return label, score, output_func | |
def ParsePatentOutput(self, result): | |
return result | |
def emotion_model_change(): | |
st.session_state.emotion_model = ModelImplementation( | |
st.session_state.emotion_model_name, | |
AutoModelForSequenceClassification, | |
st.session_state.emotion_model_name, | |
AutoTokenizer, | |
pipeline, | |
ParseEmotionOutput, | |
classifier_args={ "task" : "sentiment-analysis" }, | |
placeholders=["@AmericanAir just landed - 3hours Late Flight - and now we need to wait TWENTY MORE MINUTES for a gate! I have patience but none for incompetence."] | |
) | |
if "emotion_model_name" not in st.session_state: | |
st.session_state.emotion_model_name = "cardiffnlp/twitter-roberta-base-sentiment" | |
emotion_model_change() | |
if "patent_data" not in st.session_state: | |
f = open('./data/val.json') | |
valData = json.load(f) | |
f.close() | |
patent_data = {} | |
for num, label, abstract, claim in zip(valData["patent_numbers"],valData["labels"], valData["abstracts"], valData["claims"]): | |
patent_data[num] = {"patent_number":num,"label":label,"abstract":abstract,"claim":claim} | |
st.session_state.patent_data = patent_data | |
st.session_state.patent_num = list(patent_data.keys())[0] | |
st.session_state.weight = 0.5 | |
st.session_state.patent_abstract_model = ModelImplementation( | |
'rk2546/uspto-patents-abstracts', | |
DistilBertForSequenceClassification, | |
'distilbert-base-uncased', | |
DistilBertTokenizerFast, | |
TextClassificationPipeline, | |
ParsePatentOutput, | |
classifier_args={"return_all_scores":True}, | |
) | |
print("Patent abstracts model initialized") | |
st.session_state.patent_claim_model = ModelImplementation( | |
'rk2546/uspto-patents-claims', | |
DistilBertForSequenceClassification, | |
'distilbert-base-uncased', | |
DistilBertTokenizerFast, | |
TextClassificationPipeline, | |
ParsePatentOutput, | |
classifier_args={"return_all_scores":True}, | |
) | |
print("Patent claims model initialized") | |
# Title | |
st.title("CSGY-6613 Project") | |
# Subtitle | |
st.markdown("_**Ryan Kim (rk2546)**_") | |
sentimentTab, patentTab = st.tabs([ | |
"Emotion Analysis [Milestone #2]", | |
"Patent Prediction [Milestone #3]" | |
]) | |
with sentimentTab: | |
st.subheader("Sentiment Analysis") | |
if "emotion_model" not in st.session_state: | |
st.write("Loading model...") | |
else: | |
model_option = st.selectbox( | |
"What sentiment analysis model do you want to use? NOTE: Lag may occur when loading a new model!", | |
emotion_model_names, | |
on_change=emotion_model_change, | |
key="emotion_model_name" | |
) | |
form = st.form(key='sentiment-analysis-form') | |
text_input = form.text_area( | |
"Enter some text for sentiment analysis! If you just want to test it out without entering anything, just press the \"Submit\" button and the model will look at the placeholder.", | |
placeholder=st.session_state.emotion_model.placeholders[0] | |
) | |
submit = form.form_submit_button('Submit') | |
if submit: | |
if text_input is None or len(text_input.strip()) == 0: | |
to_eval = st.session_state.emotion_model.placeholders[0] | |
else: | |
to_eval = text_input.strip() | |
label, score, output_func = st.session_state.emotion_model.predict(to_eval) | |
output_func("**{}**: {}".format(label,score)) | |
with patentTab: | |
st.subheader("USPTO Patent Evaluation") | |
st.markdown("Below are two inputs - one for an **ABSTRACT** and another for a list of **CLAIMS**. Enter both and select the \"Submit\" button to evaluate the patenteability of your idea.") | |
patent_select_list = list(st.session_state.patent_data.keys()) | |
patent_index_option = st.selectbox( | |
"Want to pre-populate with an existing patent? Select the index number of below.", | |
patent_select_list, | |
key="patent_num", | |
) | |
if "patent_abstract_model" not in st.session_state or "patent_claim_model" not in st.session_state: | |
st.write("Loading models...") | |
else: | |
with st.form(key='patent-form'): | |
col1, col2 = st.columns(2) | |
with col1: | |
abstract_input = st.text_area( | |
"Enter the abstract of the patent below", | |
placeholder=st.session_state.patent_data[st.session_state.patent_num]["abstract"], | |
height=200 | |
) | |
with col2: | |
claim_input = st.text_area( | |
"Enter the claims of the patent below", | |
placeholder=st.session_state.patent_data[st.session_state.patent_num]["claim"], | |
height=200 | |
) | |
weight_val = st.slider( | |
"How much do the abstract and claims weight when aggregating a total softmax score?", | |
min_value=-1.0, | |
max_value=1.0, | |
value=0.5, | |
) | |
submit = st.form_submit_button('Submit') | |
if submit: | |
is_custom = False | |
if abstract_input is None or len(abstract_input.strip()) == 0: | |
abstract_to_eval = st.session_state.patent_data[st.session_state.patent_num]["abstract"].strip() | |
else: | |
abstract_to_eval = abstract_input.strip() | |
is_custom = True | |
if claim_input is None or len(claim_input.strip()) == 0: | |
claim_to_eval = st.session_state.patent_data[st.session_state.patent_num]["claim"].strip() | |
else: | |
claim_to_eval = claim_input.strip() | |
is_custom = True | |
abstract_response = st.session_state.patent_abstract_model.predict(abstract_to_eval) | |
claim_response = st.session_state.patent_claim_model.predict(claim_to_eval) | |
claim_weight = (1+weight_val)/2 | |
abstract_weight = 1-claim_weight | |
aggregate_score = [ | |
{'label':'REJECTED','score':abstract_response[0][0]['score']*abstract_weight + claim_response[0][0]['score']*claim_weight}, | |
{'label':'ACCEPTED','score':abstract_response[0][1]['score']*abstract_weight + claim_response[0][1]['score']*claim_weight} | |
] | |
aggregate_score_sorted = sorted(aggregate_score, key=lambda d: d['score'], reverse=True) | |
answerCol1, answerCol2, answerCol3 = st.columns(3) | |
with answerCol1: | |
st.slider( | |
"Abstract Acceptance Likelihood", | |
min_value=0.0, | |
max_value=100.0, | |
value=abstract_response[0][1]["score"]*100.0, | |
disabled=True | |
) | |
with answerCol2: | |
output_func = st.info | |
if aggregate_score_sorted[0]["label"] == "REJECTED": | |
output_func = st.error | |
else: | |
output_func = st.success | |
output_func(""" | |
**Final Rating: {}** | |
{}% | |
""".format(aggregate_score_sorted[0]["label"],aggregate_score_sorted[0]["score"]*100.0)) | |
with answerCol3: | |
st.slider( | |
"Claim Acceptance Likelihood", | |
min_value=0.0, | |
max_value=100.0, | |
value=claim_response[0][1]["score"]*100.0, | |
disabled=True | |
) | |
#if not is_custom: | |
# st.markdown('**Original Score:**') | |
# st.markdown(st.session_state.patent_data[st.session_state.patent_num]["label"]) | |
st.write("") |