# Hugging Face Space (status when this page was captured: Sleeping)
import joblib
import numpy as np
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
# Hugging Face Hub id of the fine-tuned sentence-embedding model.
model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"
# Remote location of the pickled XGBoost classifier.
url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"


@st.cache_resource
def _load_sentence_model(name):
    """Instantiate the SentenceTransformer identified by ``name``.

    Cached with ``st.cache_resource`` because Streamlit re-executes the
    script on every widget interaction; without the cache the model would
    be rebuilt on each rerun.
    """
    return SentenceTransformer(name)


@st.cache_resource
def _load_xgb_model(url, local_path="xgboost_best_model.pkl"):
    """Download the pickled classifier from ``url`` and load it with joblib.

    The payload is written to ``local_path`` before loading, as the original
    script did. Cached so the download happens once per server process
    rather than once per rerun.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request failed or timed out.
    """
    # A timeout keeps a stalled connection from hanging the app forever.
    response = requests.get(url, timeout=60)
    # Fail loudly here instead of writing an error page to disk and handing
    # it to joblib.load below.
    response.raise_for_status()
    with open(local_path, "wb") as f:
        f.write(response.content)
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. This is only acceptable while the URL above points to a trusted
    # source.
    return joblib.load(local_path)


model = _load_sentence_model(model_name)
xgb_model = _load_xgb_model(url_xgb_model)
# Page title shown at the top of the app.
st.title("Paraphrase Detection with SentenceTransformer and XGBoost for Indonesian Sentences")

# Introductory markdown: what the app does, how to use it, reported scores.
# The text is kept flush-left so the markdown is passed through as written.
st.write(
    """
This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text,
followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs
and aims to identify if two sentences convey the same meaning.
### How to Use the Application
- Enter two sentences in the input fields provided in Bahasa Indonesia.
- Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
- The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.
### F1-Macro Scores
- **Validation F1-Macro Score**: 79.1%
- **Test F1-Macro Score**: 72.5%
"""
)
# Interactive section: two sentence inputs and a button that scores the pair.
st.header("Try It Out!")
sentence1 = st.text_input("Enter the first sentence:")
sentence2 = st.text_input("Enter the second sentence:")

if st.button("Check Paraphrase"):
    # Strip first so whitespace-only input is rejected like empty input.
    first_sentence = sentence1.strip()
    second_sentence = sentence2.strip()

    if first_sentence and second_sentence:
        with st.spinner("Processing..."):
            embedding1 = model.encode(first_sentence)
            embedding2 = model.encode(second_sentence)

            # cosine_similarity takes 2-D inputs and returns a matrix; with
            # one row on each side the score is the single entry [0][0].
            similarity = cosine_similarity([embedding1], [embedding2])[0][0]
            st.write(f"Cosine Similarity: {similarity:.4f}")

            # The classifier is fed the similarity score as its only feature.
            prediction = xgb_model.predict(np.array([[similarity]]))

            # One row was passed in, so read that row's label explicitly
            # rather than truth-testing the whole result array.
            if prediction[0] == 1:
                st.success("The sentences are likely paraphrases of each other.")
            else:
                st.warning("The sentences are not likely to be paraphrases.")
    else:
        st.error("Please enter both sentences to proceed.")
# Sidebar: a short note describing the model behind the demo.
_about_model_text = (
    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
    "The training process focused on maximizing F1-macro scores for both validation and test sets."
)
_sidebar = st.sidebar
_sidebar.header("About the Model")
_sidebar.write(_about_model_text)