Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

File size: 2,825 Bytes

fc6772f

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import nltk
import json


# Path handed to Doc2Vec.load() by load_model().
MODEL_PATH = 'models/d2v.model'
# Path of the JSON file that is parsed into license_index_name_map below.
LICENSE_INDEX_PATH = 'data/index_license_map.json'

# Parsed once at import time. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage
# collection. JSON text is UTF-8 by specification, so decode it explicitly
# rather than relying on the platform's default encoding.
with open(LICENSE_INDEX_PATH, encoding='utf-8') as _index_file:
    license_index_name_map = json.load(_index_file)


def load_model():
    '''
    Read the trained Doc2Vec model stored at MODEL_PATH

    Args:

    Returns: Doc2Vec
        Model object produced by Doc2Vec.load
    '''
    return Doc2Vec.load(MODEL_PATH)


def preprocess(input):
    '''
    Tokenize raw license text and wrap the tokens in a TaggedDocument

    Args:
        input: str
            Input string containing contents of license text

    Return: TaggedDocument
        TaggedDocument whose words are the tokens produced by
        gensim.utils.simple_preprocess and whose tags are the fixed list [1]
    '''
    return TaggedDocument(
        words=gensim.utils.simple_preprocess(input),
        # Every input gets the same constant tag.
        tags=[1],
    )


def inference_vector(model, tagged_doc):
    '''
    Infer a vector for a preprocessed document

    Args:
        model: Doc2Vec
            Doc2Vec Model object; its infer_vector method is called
        tagged_doc: TaggedDocument
            Input processed by 'preprocess'; only its words are used

    Return:
        model.infer_vector object
            Whatever model.infer_vector returns for the document's words
    '''
    infer = model.infer_vector
    doc_words = tagged_doc.words
    return infer(doc_words)


def similarity_ranking(model, infer_vector):
    '''
    Returns a list of tuples containing predictions and confidence scores

    Args:
        model: Doc2Vec
            Model whose document vectors (model.dv) are searched
        infer_vector: Doc2Vec.infer_vector
            Vector to compare against the stored document vectors

    Returns: list
        list of (license name, score) tuples, one per entry returned by
        model.dv.most_similar and in the same order

    Raises:
        KeyError
            If a tag returned by the model has no entry in
            license_index_name_map, either as-is or as a string
    '''
    def _license_name(tag):
        # Try the tag exactly as the model returned it first, so any lookup
        # that already worked keeps working unchanged.
        try:
            return license_index_name_map[tag]
        except KeyError:
            # license_index_name_map is parsed from JSON, where object keys
            # are always strings. If the model's tags are integers
            # (NOTE(review): depends on how the model was trained - confirm),
            # the direct lookup misses, so retry with the string form.
            return license_index_name_map[str(tag)]

    # topn=len(model.dv) requests a score for every stored document vector
    # instead of only the closest few.
    similar_doc = model.dv.most_similar([infer_vector], topn=len(model.dv))
    return [(_license_name(tag), score) for tag, score in similar_doc]

def scores_to_df(scores):
    '''
    Convert list of tuples containing predictions and confidence values to a df

    Args:
        scores: list
            list of tuples containing predictions and confidence; element 0
            of each tuple is the license name and element 1 is its score

    Return: DataFrame
        Dataframe with a 'License' column and a 'Scores' column, one row per
        input tuple in input order (no rows when scores is empty)
    '''
    # Materialize once so a one-shot iterable can still feed both columns.
    rows = list(scores)
    data = {
        'License': [row[0] for row in rows],
        'Scores': [row[1] for row in rows],
    }
    return pd.DataFrame.from_dict(data)


def inference(input):
    '''
    Run the full pipeline on raw text: load the model, preprocess the text,
    infer its vector, rank the stored documents and tabulate the result

    Args:
        input: str
            the input from the textbox

    Returns: DataFrame
        The output of scores_to_df for the ranked predictions
    '''
    # The model is loaded via load_model() on every call.
    d2v_model = load_model()
    tagged = preprocess(input)
    ranked = similarity_ranking(d2v_model, inference_vector(d2v_model, tagged))
    return scores_to_df(ranked)