File size: 2,825 Bytes
fc6772f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import nltk
import json


# Locations of the trained Doc2Vec model and of the JSON file that maps
# model document tags to license names. Both are relative paths, so they
# are resolved against the working directory of the running process.
MODEL_PATH = 'models/d2v.model'
LICENSE_INDEX_PATH = 'data/index_license_map.json'

# Loaded once at import time and used by similarity_ranking to turn a
# document tag into a license name. The context manager guarantees the
# file handle is closed; JSON is read as UTF-8 rather than with the
# platform's default encoding.
with open(LICENSE_INDEX_PATH, encoding='utf-8') as _license_index_file:
    license_index_name_map = json.load(_license_index_file)


def load_model():
    '''
    Read the trained Doc2Vec model stored at MODEL_PATH

    Args:

    Returns: Doc2Vec
        Model object restored by Doc2Vec.load
    '''
    return Doc2Vec.load(MODEL_PATH)


def preprocess(input):
    '''
    Turn raw license text into a TaggedDocument

    The text is tokenized with gensim.utils.simple_preprocess. The tag is
    the fixed placeholder 1; inference_vector only reads the words of the
    resulting document.

    Args:
        input: str
            Input string containing contents of license text

    Return: TaggedDocument
        TaggedDocument holding the tokens of the input
    '''
    return TaggedDocument(
        words=gensim.utils.simple_preprocess(input),
        tags=[1],
    )


def inference_vector(model, tagged_doc):
    '''
    Infer a vector for a preprocessed document

    Args:
        model: Doc2Vec
            Doc2Vec Model object
        tagged_doc: TaggedDocument
            Document produced by 'preprocess'; only its words are used

    Return:
        Result of model.infer_vector for the document's words
    '''
    words = tagged_doc.words
    vector = model.infer_vector(words)
    return vector


def similarity_ranking(model, infer_vector):
    '''
    Rank every document known to the model against an inferred vector

    Asks model.dv.most_similar for as many results as there are entries in
    model.dv, then replaces each returned tag with the license name found
    in license_index_name_map.

    Args:
        model: Doc2Vec
        infer_vector: Doc2Vec.infer_vector

    Returns: list
        list of (license name, score) tuples, in the order returned by
        most_similar

    '''
    neighbours = model.dv.most_similar([infer_vector], topn=len(model.dv))
    return [(license_index_name_map[hit[0]], hit[1]) for hit in neighbours]

def scores_to_df(scores):
    '''
    Convert list of tuples containing predictions and scores to a df

    Args:
        scores: list
            list of tuples; element 0 is the license name and element 1
            is its score

    Return: DataFrame
        Dataframe with a 'License' column and a 'Scores' column, one row
        per input tuple and in the same order
    '''
    columns = {'License': [], 'Scores': []}
    for entry in scores:
        columns['License'].append(entry[0])
        columns['Scores'].append(entry[1])
    return pd.DataFrame.from_dict(columns)


def inference(input):
    '''
    Given text input, returns a DataFrame of license predictions and scores

    Runs the whole pipeline: load_model, preprocess, inference_vector,
    similarity_ranking and scores_to_df. The model is obtained through
    load_model() on every call.

    Args:
        input: str
            the input from the textbox

    Returns: DataFrame
        Dataframe containing license names and scores, as built by
        scores_to_df
    '''
    model = load_model()
    query_vector = inference_vector(model, preprocess(input))
    ranking = similarity_ranking(model, query_vector)
    return scores_to_df(ranking)