File size: 4,308 Bytes
e1b1d60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import pickle
from typing import List, Text
import logging
from summa import keywords

try:
    import streamlit as st    
except ImportError:
    logging.info("Streamlit not installed")


def sort_coo(coo_matrix):
    """
    Pair each stored value of a COO (coordinate format) scipy sparse matrix
    with its column index and rank the pairs from highest to lowest value.
    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb

    Params
    ---------
    coo_matrix: object exposing parallel `col` and `data` sequences, e.g. the
        result of calling `.tocoo()` on a sparse matrix

    Return
    ----------
    ranked: list of (column index, value) tuples in descending order of
        value; equal values are ordered by descending column index
    """
    ranked = list(zip(coo_matrix.col, coo_matrix.data))
    # Sort on (value, column) so that ties on the value are broken
    # deterministically by the column index.
    ranked.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return ranked

def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
    """get the feature names and tf-idf score of the top n distinct features

    Params
    ---------
    feature_names: sequence of words from the vectorizer, indexed by column
    sorted_items: iterable of (column index, score) pairs ordered from the
        highest to the lowest score, as returned by sort_coo
    top_n: maximum number of distinct features to return; None means no
        limit, zero or a negative value yields an empty result

    Return
    ----------
    results: dict mapping feature name -> score rounded to 3 decimals, in
        descending score order (insertion order of the dict)

    """
    results = {}

    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        if top_n is not None and len(results) >= top_n:
            break

        feature = feature_names[idx]
        # The same column can appear more than once when the matrix holds
        # several documents. The items are sorted by descending score, so the
        # first occurrence carries the best score: keep it and do not let the
        # repeats use up one of the top_n slots.
        if feature not in results:
            results[feature] = round(score, 3)

    return results


def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
    """
    TFIDF based keywords extraction

    Params
    ---------
    textdata: text data which needs keyword extraction; either a single
        string or an iterable of strings (one entry per document)
    vectorizer: trained count vectorizer model
    tfidfmodel: TFIDF Transformer model
    top_n: Top N keywords to be extracted

    Return
    ----------
    keywords: list of the top extracted keywords, highest score first

    """
    # The vectorizer expects an iterable of documents. scikit-learn
    # vectorizers reject a bare string, so wrap a single text into a
    # one-element list.
    documents = [textdata] if isinstance(textdata, str) else textdata

    features = vectorizer.get_feature_names_out()
    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(documents))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    results = extract_topn_from_vector(features, sorted_items, top_n)

    # results is keyed by keyword in descending score order; only the keys
    # are returned.
    return list(results)

def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
    """
    TFIDF based keywords extraction using the models stored for one SDG

    Loads `vectorizer.pkl` and `tfidfmodel.pkl` from `docStore/sdg<sdg>/`
    (relative to the current working directory) and runs tfidf_keyword
    with them.

    Params
    ---------
    sdg: which sdg tfidf model to be used
    sdgdata: text data which needs keyword extraction
    top_n: Top N keywords to be extracted

    Return
    ----------
    keywords: top extracted keywords

    """
    model_path = "docStore/sdg{}/".format(sdg)

    # NOTE(security): pickle.load can execute arbitrary code; these files
    # must only ever come from a trusted source.
    # Context managers make sure the file handles are closed again.
    with open(model_path+'vectorizer.pkl', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open(model_path+'tfidfmodel.pkl', 'rb') as tfidfmodel_file:
        tfidfmodel = pickle.load(tfidfmodel_file)

    # Same pipeline as tfidf_keyword, so delegate instead of duplicating it.
    return tfidf_keyword(sdgdata, vectorizer, tfidfmodel, top_n)

# NOTE(review): `st` is only bound when the guarded streamlit import at the
# top of this file succeeds; without streamlit this decorator raises
# NameError at import time. `st.cache` is also reported as deprecated in
# newer Streamlit releases - confirm against the pinned version.
@st.cache(allow_output_mutation=True)
def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
    """
    wrapper function to perform textrank, uses either ratio or wordcount to
    extract top keywords limited by words or ratio.
    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py

    Params
    --------
    textdata: text data to perform the textrank.
    ratio: float to limit the number of keywords as proportion of total
        tokens in textdata
    words: number of keywords to be extracted. Takes priority over ratio if
        non zero. However, in case the extraction with a fixed word count
        fails (e.g. the text yields fewer keywords than requested), the
        ratio is used instead.

    Return
    --------
    results: extracted keywords, obtained by splitting the newline separated
        output of summa; an empty output therefore yields [""]
    """
    if words == 0:
        logging.info(
            "Textrank using ratio value = %s, as no words limit given", ratio)
        return keywords.keywords(textdata, ratio=ratio).split("\n")

    try:
        return keywords.keywords(textdata, words=words).split("\n")
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate; record why the fallback happened.
        logging.warning(
            "Textrank with words = %s failed, falling back to ratio = %s",
            words, ratio, exc_info=True)
        return keywords.keywords(textdata, ratio=ratio).split("\n")