File size: 3,462 Bytes
ac750db
fc6772f
 
 
 
e41b03f
fc6772f
19dab1b
 
 
 
fc6772f
e41b03f
fc6772f
 
 
ac750db
 
 
 
 
 
fc6772f
 
 
 
 
 
 
 
 
 
 
ac750db
 
 
 
 
 
 
 
fc6772f
 
 
 
 
 
 
 
 
 
 
 
 
 
ac750db
 
fc6772f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e41b03f
fc6772f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac750db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import json
import streamlit as st

# The cleaning helpers live in `src/clean.py`. Try the package-qualified
# import first and fall back to the flat module name. Only ImportError is
# caught so that genuine errors raised while importing the module
# (e.g. a SyntaxError inside it) are not silently masked.
try:
    from src.clean import preprocess_text, script_cleaner
except ImportError:
    from clean import preprocess_text, script_cleaner


MODEL_PATH = 'models/d2v.model'
LICENSE_INDEX_PATH = 'data/index_license_map.json'

# Mapping used by `similarity_ranking` to turn a document tag into a license
# name. It defaults to an empty dict so the name is always defined, even when
# the index file cannot be found.
license_index_name_map = {}

# Look for the index relative to the current directory first, then one
# directory up. The file is opened in a `with` block so the handle is closed.
if os.path.exists(LICENSE_INDEX_PATH):
    with open(LICENSE_INDEX_PATH, encoding="utf-8") as _index_file:
        license_index_name_map = json.load(_index_file)
elif os.path.exists("../" + LICENSE_INDEX_PATH):
    with open("../" + LICENSE_INDEX_PATH, encoding="utf-8") as _index_file:
        license_index_name_map = json.load(_index_file)
else:
    print("index_license_map Not Found!")


def load_model():
    '''
    Locate the saved Doc2Vec model on disk and load it

    MODEL_PATH is tried first, then the same path one directory up.

    Args:

    Returns: Doc2Vec
        The loaded model, or None (after printing a message) when the
        model file exists at neither location
    '''
    for candidate in (MODEL_PATH, "../" + MODEL_PATH):
        if os.path.exists(candidate):
            return Doc2Vec.load(candidate)

    print("d2v.model Not Found!")
    return None


def preprocess(input):
    '''
    Clean the textbox input and wrap its tokens in a TaggedDocument

    The text is passed through 'script_cleaner', then 'preprocess_text',
    and finally tokenized with gensim's simple_preprocess.

    Args:
        input: str
            Input string containing contents of license text

    Return: TaggedDocument
        TaggedDocument holding the tokens, tagged with the single tag 1
    '''
    stripped = script_cleaner(input)
    normalized = preprocess_text(stripped)
    words = gensim.utils.simple_preprocess(normalized)
    return TaggedDocument(words=words, tags=[1])


def inference_vector(model, tagged_doc):
    '''
    Infer a vector for a document using the given model

    Args:
        model: Doc2Vec
            Doc2Vec Model object
        tagged_doc: TaggedDocument
            Document produced by 'preprocess'; only its words are used

    Return:
        Result of model.infer_vector for the document's words
    '''
    words = tagged_doc.words
    vector = model.infer_vector(words)
    return vector


def similarity_ranking(model, infer_vector):
    '''
    Rank all document vectors of the model against an inferred vector

    Args:
        model: Doc2Vec
            Model whose `dv` vectors are searched
        infer_vector: Doc2Vec.infer_vector
            Vector to compare against

    Returns: list
        (license name, similarity score) tuples, in the order returned by
        model.dv.most_similar; names come from license_index_name_map

    '''
    # Ask for as many neighbours as there are document vectors so that
    # every known license appears in the ranking.
    total_docs = len(model.dv)
    neighbours = model.dv.most_similar([infer_vector], topn=total_docs)
    return [(license_index_name_map[hit[0]], hit[1]) for hit in neighbours]

def scores_to_df(scores):
    '''
    Convert a list of (prediction, confidence) tuples to a DataFrame

    Args:
        scores: list
            list of tuples; item 0 is the license name, item 1 its score

    Return: DataFrame
        Dataframe with a 'License' column and a 'Similarity Scores' column,
        one row per input tuple in the original order
    '''
    names = [entry[0] for entry in scores]
    values = [entry[1] for entry in scores]
    columns = {'License': names, 'Similarity Scores': values}
    return pd.DataFrame.from_dict(columns)

def inference(input):
    '''
    Given text input, returns a DataFrame of predictions and similarity scores

    Runs the full pipeline: load the model, preprocess the text, infer its
    vector, rank the model's documents against it and tabulate the result.

    Args:
        input: str
            the input from the textbox

    Returns: DataFrame
        Output of 'scores_to_df' for the ranked predictions

    Raises:
        FileNotFoundError: if 'load_model' could not find the model file
    '''
    model = load_model()
    if model is None:
        # load_model signals a missing file by returning None; fail with a
        # clear message instead of an AttributeError further down.
        raise FileNotFoundError(
            f"Doc2Vec model not found at '{MODEL_PATH}' or '../{MODEL_PATH}'"
        )

    processed_text = preprocess(input)
    infer_vec = inference_vector(model, processed_text)
    results = similarity_ranking(model, infer_vec)
    return scores_to_df(results)