# Similarity Score
This notebook imports the questions and responses produced by QuestionGeneration.ipynb and scores the semantic similarity between each company response and the corresponding gold response.

Heavy inspiration taken from:

https://github.com/karndeepsingh/sentence_similarity/blob/main/Finding_Similar_Sentence.ipynb

## Import packages

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Load the model
Uncomment the lines below to store the model locally for easy retrieval, but delete the stored model before uploading to GitHub, as it is too large.

In [2]:
# Load the pretrained sentence-embedding model by its published name.
model = SentenceTransformer(model_name_or_path='nli-distilroberta-base-v2')

# Optional local caching -- uncomment the first line to write a local copy,
# and the second line to load from that copy instead of the named model.
# model.save("./Model/model")
# model = SentenceTransformer("./Model/model")

## Load in the questions and responses

In [3]:
# Read the comparison table; the first CSV column becomes the row index.
Data = pd.read_csv("./Results/Compare.csv", index_col=0)

# The index values serve as the question labels for the final score table.
questions = Data.index.values

# Extract the two response columns as arrays.
company, gold = (Data[column].values for column in ('Company', 'Gold'))

# Lay out all company responses first, then all gold responses, so that both
# sets can be passed to the encoder together.
sentences = np.hstack((company, gold))

## Create the sentence embeddings and obtain scores

In [4]:
# Encode every sentence in one call. `sentences` holds the company responses
# followed by the gold responses, so the first len(company) embeddings belong
# to the company responses and the next len(company) to the gold responses.
sentence_embeddings = model.encode(sentences)
n_pairs = len(company)

if n_pairs:
    # assumes model.encode returned a 2-D array of shape (2 * n_pairs, dim)
    # -- TODO confirm for non-default encode settings
    company_embeddings = sentence_embeddings[:n_pairs]
    gold_embeddings = sentence_embeddings[n_pairs:2 * n_pairs]

    # Row-wise cosine similarity, dot(a, b) / (|a| * |b|), computed for all
    # pairs at once instead of one library call per pair.
    dot_products = np.einsum('ij,ij->i', company_embeddings, gold_embeddings)
    norm_products = (
        np.linalg.norm(company_embeddings, axis=1)
        * np.linalg.norm(gold_embeddings, axis=1)
    )
    # A zero-length embedding has no direction: its dot product is already 0,
    # so dividing by 1 scores the pair 0 instead of producing NaN.
    norm_products[norm_products == 0] = 1
    # Keep `similarity_score` a plain list of scalars, one per question.
    similarity_score = list(dot_products / norm_products)
else:
    # No response pairs: produce an empty score table.
    similarity_score = []

# One score per question, labelled by the question text.
Similarity = pd.DataFrame({'Score': similarity_score}, index=questions)

## Print a few of the scores

In [5]:
# Preview the first five rows of the score table as plain text.
print(Similarity.head(n=5))

                                                       Score
Legal and regulatory requirements involving AI ...  0.820508
The characteristics of trustworthy AI are integ...  0.847748
Processes, procedures, and practices are in pla...  0.793549
The risk management process and its outcomes ar...  0.837404
Ongoing monitoring and periodic review of the r...  0.810052


## Save the scores to a .csv file

In [6]:
# Write the score table (question index plus the 'Score' column) to disk.
Similarity.to_csv(path_or_buf='./Results/Similarity_Scores.csv')