# Nihal D'Souza
# Final app release
# e41b03f
# raw
# history blame
# 7.65 kB
import os
import nltk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import streamlit as st
from src.doc2vec import inference
from src.abstractive_sum import summarize_text_with_model
from src.textrank import custom_textrank_summarizer, get_labels_for_license
from src.clean import clean_license_text
from src.read_data import read_license_text_data
from src.diff import strikethrough_diff
from src.parameters import help_messages, captions, options
# Make sure the NLTK "punkt" resource is available. This call runs every time
# the script is executed.
nltk.download('punkt')

# Streamlit entry point: renders a license-summarization UI.
#
# Flow:
#   1. Load the seq2seq model and tokenizer.
#   2. Build the sidebar (summarization type and, for extractive mode, the
#      summary length / summary view / cleaned-license view controls).
#   3. Once license text is entered: summarize it, show the similarity table,
#      optionally show the cleaned text or a diff against the closest known
#      license, and expose the optional properties / definitions / exceptions
#      sections through sidebar checkboxes.
if __name__ == "__main__":
    # Hugging Face model id used for abstractive summarization.
    CUSTOM_MODEL_NAME = "utkarshsaboo45/ClearlyDefinedLicenseSummarizer"
    # The top-1 similarity score must exceed this value before the matched
    # license is used for the diff view and the properties table.
    SIMILARITY_THRESHOLD = 0.8

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(review): the model and tokenizer are loaded on every execution of
    # this script, whatever summarization type is selected below. Consider
    # Streamlit's resource caching if reload time becomes a problem.
    with st.spinner(captions.LOADING):
        model = AutoModelForSeq2SeqLM.from_pretrained(CUSTOM_MODEL_NAME).to(device)
        tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_NAME)

    # ------------------------------------------------------------------
    # Sidebar controls
    # ------------------------------------------------------------------
    summarization_type = st.sidebar.selectbox(
        captions.SELECT_SUMMARIZATION_TYPE,
        (options.EXTRACTIVE, options.ABSTRACTIVE, options.BOTH),
        help=help_messages.SUMMARIZATION_TYPE,
    )

    # Defaults for values that only some summarization modes produce.
    cleaned_view = None
    exceptions = ""
    definitions = ""

    if summarization_type == options.ABSTRACTIVE:
        st.sidebar.caption(captions.SUMMARY_BY_T5)
        st.sidebar.caption(captions.WARNING_ABSTRACTIVE)

    elif summarization_type == options.EXTRACTIVE:
        st.sidebar.caption(captions.SUMMARY_BY_TEXTRANK)

        # Percentage (1-100, default 30); converted to a fraction when used.
        summary_len = st.sidebar.slider(
            captions.SUMMARY_LENGTH_PERCENTAGE,
            1,
            100,
            30,
            help=help_messages.SLIDER,
        )

        summary_view = st.sidebar.selectbox(
            captions.SUMMARY_VIEW,
            (
                options.DISPLAY_SUMMARY_ONLY,
                options.DISPLAY_HIGHLIGHTED_SUMMARY,
            ),
            help=help_messages.SUMMARY_VIEW,
        )
        if summary_view == options.DISPLAY_SUMMARY_ONLY:
            st.sidebar.caption(captions.DISPLAY_SUMMARY_ONLY_DESC)
        elif summary_view == options.DISPLAY_HIGHLIGHTED_SUMMARY:
            st.sidebar.caption(captions.DISPLAY_HIGHLIGHTED_SUMMARY_DESC)

        cleaned_view = st.sidebar.selectbox(
            captions.CLEANED_LICENSE_VIEW,
            (
                options.HIDE_CLEANED_LICENSE,
                options.DISPLAY_CLEANED_LICENSE,
                options.DISPLAY_CLEANED_DIFF,
            ),
            help=help_messages.CLEANED_LICENSE_VIEW,
        )
        if cleaned_view == options.DISPLAY_CLEANED_LICENSE:
            st.sidebar.caption(captions.CLEANED_LICENSE_ONLY)
        elif cleaned_view == options.DISPLAY_CLEANED_DIFF:
            st.sidebar.caption(captions.CLEANED_LICENSE_WITH_DIFF)
        elif cleaned_view == options.HIDE_CLEANED_LICENSE:
            st.sidebar.caption(captions.HIDE_CLEANED_LICENSE)

    elif summarization_type == options.BOTH:
        st.sidebar.caption(captions.SUMMARY_BY_BOTH)
        st.sidebar.caption(captions.WARNING_BOTH)

    # ------------------------------------------------------------------
    # Main page
    # ------------------------------------------------------------------
    st.title(captions.APP_TITLE)
    st.caption(captions.APP_DISCLAIMER)

    license_input = st.text_area(
        captions.LICENSE_TEXT,
        placeholder=captions.ENTER_LICENSE_CONTENT,
    )

    # Whitespace-only input is treated the same as no input at all.
    if license_input.strip():
        cleaned_modified_license_text = clean_license_text(license_input)[0]

        with st.spinner(captions.LOADING):
            if summarization_type == options.ABSTRACTIVE:
                summary, definitions = summarize_text_with_model(
                    license_input,
                    model,
                    tokenizer,
                )

            elif summarization_type == options.EXTRACTIVE:
                textrank_kwargs = {"summary_len": summary_len / 100}
                if summary_view == options.DISPLAY_HIGHLIGHTED_SUMMARY:
                    # Only the highlighted view overrides the summarizer's
                    # default for return_summary_only.
                    textrank_kwargs["return_summary_only"] = False
                summary, definitions, exceptions = custom_textrank_summarizer(
                    license_input,
                    **textrank_kwargs,
                )

            elif summarization_type == options.BOTH:
                summary, definitions = summarize_text_with_model(
                    license_input,
                    model,
                    tokenizer,
                )
                # NOTE(review): the definitions returned by the abstractive
                # step are overwritten by this second pass over the
                # abstractive summary - confirm that this is intended.
                summary, definitions, exceptions = custom_textrank_summarizer(
                    summary,
                    summary_len=1,
                )

        st.header(captions.SUMMARY)
        st.markdown(summary, unsafe_allow_html=True)

        # Row 0 is treated as the best match (presumably the scores come back
        # sorted in descending order - confirm in src.doc2vec).
        prediction_scores = inference(license_input)
        top1_result = prediction_scores.loc[0, :]
        top_license = top1_result["License"]
        is_similar = top1_result["Similarity Scores"] > SIMILARITY_THRESHOLD

        st.header(captions.SIMILARITY_INDEX)
        st.caption(captions.SIMILARITY_INDEX_DISCLAIMER)
        st.dataframe(prediction_scores)

        if cleaned_view == options.DISPLAY_CLEANED_DIFF:
            st.header(captions.CLEANED_LICENSE_DIFF)
            if is_similar:
                readable_name = " ".join(top_license.split("-"))
                st.caption(f"Comparing against the official {readable_name} license")

                original_license_text = read_license_text_data(top_license.lower())
                cleaned_original_license_text = clean_license_text(
                    original_license_text
                )[0]

                st.markdown(
                    strikethrough_diff(
                        cleaned_original_license_text,
                        cleaned_modified_license_text,
                    ),
                    unsafe_allow_html=True,
                )
            else:
                st.caption(captions.NO_SIMILAR_LICENSE_FOUND)
        elif cleaned_view == options.DISPLAY_CLEANED_LICENSE:
            st.header(captions.CLEANED_LICENSE_TEXT)
            st.write(cleaned_modified_license_text)

        # --------------------------------------------------------------
        # Optional sections. Each checkbox is disabled when it has nothing
        # to show; the enabling condition is checked again before rendering
        # so a section never appears while its checkbox is disabled.
        # --------------------------------------------------------------
        show_properties = st.sidebar.checkbox(
            options.SHOW_LICENSE_PROPERTIES,
            disabled=not is_similar,
            value=False,
            help=help_messages.PROPERTIES_CHECKBOX,
        )
        if show_properties and is_similar:
            license_properties = get_labels_for_license(top_license.lower())
            st.header(captions.PROPERTIES)
            st.caption(captions.PROPERTIES_DISCLAIMER)
            st.dataframe(license_properties)

        # Text of 10 characters or fewer (after stripping) counts as empty.
        has_definitions = len(definitions.strip()) > 10
        show_definitions = st.sidebar.checkbox(
            options.SHOW_LICENSE_DEFINITIONS,
            disabled=not has_definitions,
            value=False,
            help=help_messages.DEFINITIONS_CHECKBOX,
        )
        if show_definitions and has_definitions:
            st.header(captions.DEFINITIONS)
            st.write(definitions)

        has_exceptions = len(exceptions.strip()) > 10
        show_exceptions = st.sidebar.checkbox(
            options.SHOW_LICENSE_EXCEPTIONS,
            disabled=not has_exceptions,
            value=False,
            help=help_messages.EXCEPTIONS_CHECKBOX,
        )
        if show_exceptions and has_exceptions:
            st.header(captions.EXCEPTIONS)
            st.write(exceptions)