# NOTE: page residue captured together with this file (not program text):
#   Spaces: Runtime error / Runtime error
import json

import gensim
import nltk
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Path of the trained Doc2Vec model, relative to the working directory.
MODEL_PATH = 'models/d2v.model'
# Path of the JSON lookup table that similarity_ranking indexes with the
# document tags returned by the model.
LICENSE_INDEX_PATH = 'data/index_license_map.json'

# Load the lookup table once at import time. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection,
# and the explicit encoding avoids depending on the platform default.
with open(LICENSE_INDEX_PATH, encoding='utf-8') as index_file:
    license_index_name_map = json.load(index_file)
def load_model():
    '''
    Load the trained model parameters from MODEL_PATH.

    Args:
        None
    Returns: Doc2Vec
        Model object restored by Doc2Vec.load
    '''
    return Doc2Vec.load(MODEL_PATH)
def preprocess(input):
    '''
    Tokenize the input from the textbox and wrap it in a TaggedDocument.

    Args:
        input: str
            Input string containing contents of license text
    Return: TaggedDocument
        Document whose words are the tokens produced by
        gensim.utils.simple_preprocess and whose tags are [1]
    '''
    # NOTE: the parameter name shadows the builtin input(); it is kept
    # unchanged because callers may pass it by keyword.
    tokens = gensim.utils.simple_preprocess(input)
    # The tag is a fixed placeholder: inference_vector reads only the words.
    return TaggedDocument(words=tokens, tags=[1])
def inference_vector(model, tagged_doc):
    '''
    Infer a vector for a preprocessed document.

    Args:
        model: Doc2Vec
            Doc2Vec Model object
        tagged_doc: TaggedDocument
            Input processed by 'preprocess' and converted to TaggedDocument
    Return:
        Inference vector returned by model.infer_vector for the
        document's words
    '''
    # Only the token list is passed on; the document's tags are not used.
    return model.infer_vector(tagged_doc.words)
def similarity_ranking(model, infer_vector):
    '''
    Rank every document vector stored in the model against a vector.

    Args:
        model: Doc2Vec
        infer_vector: vector returned by Doc2Vec.infer_vector
    Returns: list
        list of tuples containing predictions (license names) and
        confidence scores, in the order returned by model.dv.most_similar
    '''
    # topn=len(model.dv) asks for a score for every stored document vector
    # instead of only the closest few.
    similar_docs = model.dv.most_similar([infer_vector], topn=len(model.dv))
    # NOTE(review): license_index_name_map is loaded from JSON, so if it is a
    # JSON object its keys are strings -- confirm the model's document tags
    # are strings as well, otherwise this lookup raises KeyError.
    return [
        (license_index_name_map[tag], score)
        for tag, score in similar_docs
    ]
def scores_to_df(scores):
    '''
    Convert list of tuples containing predictions and confidence values to a df

    Args:
        scores: list
            list of tuples containing predictions and confidence
    Return: DataFrame
        Dataframe with a 'License' column (first tuple element) and a
        'Scores' column (second tuple element), one row per input tuple
    '''
    # Walk the input exactly once so that any iterable of pairs is handled,
    # then split the pairs into the two columns.
    pairs = [(entry[0], entry[1]) for entry in scores]
    data = {
        'License': [name for name, _ in pairs],
        'Scores': [score for _, score in pairs],
    }
    return pd.DataFrame.from_dict(data)
def inference(input):
    '''
    Given text input, returns a dataframe of predictions and confidence scores

    Args:
        input: str
            the input from the textbox
    Returns: DataFrame
        Dataframe built by scores_to_df, with 'License' and 'Scores' columns
    '''
    # The model is loaded from disk on every call.
    model = load_model()
    tagged_doc = preprocess(input)
    vector = inference_vector(model, tagged_doc)
    ranking = similarity_ranking(model, vector)
    return scores_to_df(ranking)