Spaces: Runtime error

Nihal D'Souza committed on
Commit ac750db · 1 Parent(s): cf72a29
Pushing latest development branch

Files changed:
- src/doc2vec.py +19 -6
- src/textrank.py +203 -35
src/doc2vec.py CHANGED

@@ -1,14 +1,20 @@
+import os
 import gensim
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 import pandas as pd
-import nltk
 import json
 
+from clean import preprocess_text, script_cleaner
 
 MODEL_PATH = 'models/d2v.model'
 LICENSE_INDEX_PATH = 'data/index_license_map.json'
 
-license_index_name_map = json.load(open(LICENSE_INDEX_PATH))
+if os.path.exists(LICENSE_INDEX_PATH):
+    license_index_name_map = json.load(open(LICENSE_INDEX_PATH))
+elif os.path.exists("../" + LICENSE_INDEX_PATH):
+    license_index_name_map = json.load(open("../" + LICENSE_INDEX_PATH))
+else:
+    print("index_license_map Not Found!")
 
 
 def load_model():

@@ -20,7 +26,14 @@ def load_model():
     Returns: Doc2Vec
         Model object
     '''
-    model = Doc2Vec.load(MODEL_PATH)
+    if os.path.exists(MODEL_PATH):
+        model = Doc2Vec.load(MODEL_PATH)
+    elif os.path.exists("../" + MODEL_PATH):
+        model = Doc2Vec.load("../" + MODEL_PATH)
+    else:
+        print("d2v.model Not Found!")
+        return None
+
     return model
 
 

@@ -35,7 +48,8 @@ def preprocess(input):
     Return: TaggedDocument
         TaggedDocument Object
     '''
-    tokens = gensim.utils.simple_preprocess(input)
+    clean_input = preprocess_text(script_cleaner(input))
+    tokens = gensim.utils.simple_preprocess(clean_input)
     tagged_doc = TaggedDocument(words=tokens, tags=[1])
     return tagged_doc
 

@@ -112,5 +126,4 @@ def inference(input):
     infer_vec = inference_vector(model, processed_text)
     results = similarity_ranking(model, infer_vec)
     results_df = scores_to_df(results)
-    return results_df
-
+    return results_df
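With these changes the module resolves its model and index files from either the repository root or src/, and load_model() reports a missing model instead of raising. A minimal usage sketch follows; it is not part of the commit, and the doc2vec import path, the None guard, and the sample text are assumptions for illustration only.

    # Hypothetical driver, assuming models/ and data/ are reachable from ./ or ../
    from doc2vec import load_model, inference

    model = load_model()  # returns None when d2v.model is absent from both paths
    if model is not None:
        # inference() cleans the text (script_cleaner + preprocess_text),
        # tokenizes it, infers a vector, and ranks licenses by similarity,
        # returning the scores as a DataFrame per the hunk above.
        results_df = inference("Permission is hereby granted, free of charge, ...")
        print(results_df.head())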
src/textrank.py CHANGED

@@ -1,3 +1,4 @@
+import re
 import nltk
 import numpy as np
 import gensim

@@ -5,68 +6,235 @@ import spacy
 import math
 from collections import Counter
 
-from src.clean import clean_license_text
-from src.read_data import read_file
+try:
+    from src.clean import clean_license_text
+    from src.read_data import read_file
+except:
+    from clean import clean_license_text
+    from read_data import read_file
+
+
+NEGATION_WEIGHT = 0.2
+
+nlp = spacy.load("en_core_web_sm")
+
+modal_verbs = {
+    "can",
+    "may",
+    "must",
+    "shall",
+    "will",
+    # "could",
+    # "might",
+    "should",
+    "would"
+}
+
+neg_modal = {
+    "cannot",
+    "may not",
+    "must not",
+    "shall not",
+    "will not",
+    # "could not",
+    # "might not",
+    "should not",
+    "would not"
+}
+
+# TODO Move these structures to another file
+license_stopwords = {
+    ",",
+    "(",
+    ")",
+    "software",
+    "license",
+    "work",
+    # "copyright",
+    "program",
+    # "use",
+    # "copy",
+    "source",
+    # "may",
+    # "terms",
+    "code",
+    # "without",
+    # "free",
+    # "distribute",
+    # "rights",
+    # "notice",
+    # "shall",
+    "provided",
+    # "permission",
+    # "including",
+    "version",
+    "library",
+    # "condition",
+    "covered",
+    # "must",
+    "public",
+    # "modify",
+    # "distribution",
+    # "warranty",
+}.union(nlp.Defaults.stop_words) - modal_verbs
+
+negation_words = {
+    "no",
+    "not",
+    "non"
+}
+
+# TODO: Consider adding these words to the vocab:
+# no-charge
+#
+#
+#
+#
+
+verbs = [
+    "permit", "copy", "modify", "change", "sell", "reproduce",
+    "transfer", "rent", "lease", "assign", "sublet", "distribute",
+    "redistribute", "allow", "require", "merge", "publish", "use",
+    "include", "grant", "run", "affirm", "propagate", "acknowledge"
+]
+
+neg_verbs = [f"not-{verb}" for verb in verbs]
 
 properties_dict = {
+    "0.1": [
+    ],
+    "0.2": ["everyone"],
+    "0.3": ["irrevocable"],
+    "0.4": [],
+    "0.5": [],
+    "0.6": [
+        "distribution", "redistribution",
+        "permission", "modification",
+        "copyright",
+        "permission",
+        "limitation",
+        "free", "charge",
+        "warranty",
+        "term", "terms", "condition",
+        "right",
+        "sublicense",
+        "commercial", "non-commercial",
+        "exception"
+    ],
+    "0.7": verbs + [
+
+    ],
+    "0.8": [],
+    "0.9": neg_verbs + [],
+    "1.0": [],
+    "3.0": modal_verbs
 }
 
+
 properties_scores = {
+    "0.1": 0.1,
+    "0.2": 0.2,
+    "0.3": 0.3,
+    "0.4": 0.4,
+    "0.5": 0.5,
+    "0.6": 0.6,
+    "0.7": 0.7,
+    "0.8": 0.8,
+    "0.9": 0.9,
+    "1.0": 1.0,
+    "3.0": 3.0
 }
 
-nlp = spacy.load('en_core_web_sm')
 
 def lemmatize_tokens(sent):
-    #TODO: Docstrings
+    # TODO: Docstrings
+    """Each word in input sentence is converted to lemma"""
+    lemmas = list()
+
+    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]
 
+    for tok_i, token in enumerate(nlp_sent):
+        if (token
+                and token not in license_stopwords
+                and token not in negation_words):
+            if tok_i > 0 and nlp_sent[tok_i-1] in negation_words:
+                lemmas.append(f"{nlp_sent[tok_i-1]}-{token}")
+            elif tok_i > 1 and nlp_sent[tok_i-1] in " -" and nlp_sent[tok_i-2] in negation_words:
+                lemmas.append(f"{nlp_sent[tok_i-2]}-{token}")
+            else:
+                lemmas.append(token)
 
+    return lemmas
+
+
+def custom_textrank_summarizer(license_text,
+                               min_sent_len=3,
+                               summary_len=0.3,
+                               debug=False):
+    """
     TODO: Doctrings
+    """
+    sent_scores = Counter()
+
     cleaned_license_text, definitions = clean_license_text(license_text)
+
+    cleaned_license_sentences = re.split('(\n{2,}|\.)', cleaned_license_text)
+    cleaned_license_sentences = [
+        text.strip() for text in cleaned_license_sentences
+        if text.strip() not in ["", ".", "\n", "\n\n"]
+    ]
+
     summary_len = math.ceil(summary_len*len(cleaned_license_sentences))
+
     if debug:
+        print(f"summary length:{summary_len}")
-    if debug:
         print(cleaned_license_sentences)
+
+    for sent_i, sent in enumerate(cleaned_license_sentences):
+
+        if sent_i < 0:
             continue
+
+        if len(sent.split()) < min_sent_len:
+            continue
+
         score = 0
+
+        lemmatized_tokens = lemmatize_tokens(sent)
+
+        if debug:
+            print("-"*50)
+            print(f"\nOriginal Sentence = {sent}")
+            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
+
+        word_count = Counter([tok for tok in lemmatized_tokens])
+
         for prop, prop_words in properties_dict.items():
             prop_score = 0
+
+            imp_words = list()
+
+            for prop_i, prop_word in enumerate(prop_words):
                 if prop_word in word_count.keys():
                     prop_score += properties_scores[prop]
+                    imp_words.append(prop_word)
+
             if debug:
-                print(prop, "=", prop_score)
+                print(prop, "=", imp_words, "=", prop_score)
+
             score += prop_score
+
+        sent_scores[sent] = score / len(lemmatized_tokens)
+
         if debug:
+            print(f"Sentence score: {sent_scores[sent]}")
             print()
+
     if debug:
         print(sent_scores)
-    sorted_sent_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))
-    summary = '.\n'.join(list(sorted_sent_scores.keys())[:summary_len])
-    return summary, definitions
 
+    sorted_sent_scores = sent_scores.most_common()
+
+    summary = ".\n".join(sent for sent, score in sorted_sent_scores[:summary_len])
 
+    return summary, definitions
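For context on the scoring introduced above: each sentence is weighted by its property-word hits (properties_scores), normalized by sentence length, and lemmatize_tokens folds a preceding negation word into the following lemma, so "not distribute" becomes "not-distribute", which is exactly what the neg_verbs entries under "0.9" match. A small driver sketch follows; it is not part of the commit, and it assumes the module is run from src/ with clean.py present and the en_core_web_sm spaCy model installed. The sample sentence is invented.

    # Hypothetical usage, not part of the commit.
    from textrank import custom_textrank_summarizer, lemmatize_tokens

    print(lemmatize_tokens("You may not distribute the covered work."))
    # negation folding should yield a token like "not-distribute"

    text = open("LICENSE").read()
    # summary_len=0.3 keeps roughly the top 30% of sentences by normalized score
    summary, definitions = custom_textrank_summarizer(text, summary_len=0.3)
    print(summary)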