Nihal D'Souza committed
Commit ac750db · 1 Parent(s): cf72a29

Pushing latest development branch

Files changed (2)
  1. src/doc2vec.py +19 -6
  2. src/textrank.py +203 -35
src/doc2vec.py CHANGED
@@ -1,14 +1,20 @@
+import os
 import gensim
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 import pandas as pd
-import nltk
 import json

+from clean import preprocess_text, script_cleaner

 MODEL_PATH = 'models/d2v.model'
 LICENSE_INDEX_PATH = 'data/index_license_map.json'

-license_index_name_map = json.load(open(LICENSE_INDEX_PATH))
+if os.path.exists(LICENSE_INDEX_PATH):
+    license_index_name_map = json.load(open(LICENSE_INDEX_PATH))
+elif os.path.exists("../" + LICENSE_INDEX_PATH):
+    license_index_name_map = json.load(open("../" + LICENSE_INDEX_PATH))
+else:
+    print("index_license_map Not Found!")


 def load_model():
@@ -20,7 +26,14 @@ def load_model():
     Returns: Doc2Vec
         Model object
     '''
-    model = Doc2Vec.load(MODEL_PATH)
+    if os.path.exists(MODEL_PATH):
+        model = Doc2Vec.load(MODEL_PATH)
+    elif os.path.exists("../" + MODEL_PATH):
+        model = Doc2Vec.load("../" + MODEL_PATH)
+    else:
+        print("d2v.model Not Found!")
+        return None
+
     return model


@@ -35,7 +48,8 @@ def preprocess(input):
     Return: TaggedDocument
         TaggedDocument Object
     '''
-    tokens = gensim.utils.simple_preprocess(input)
+    clean_input = preprocess_text(script_cleaner(input))
+    tokens = gensim.utils.simple_preprocess(clean_input)
     tagged_doc = TaggedDocument(words=tokens, tags=[1])
     return tagged_doc

@@ -112,5 +126,4 @@ def inference(input):
     infer_vec = inference_vector(model, processed_text)
     results = similarity_ranking(model, infer_vec)
     results_df = scores_to_df(results)
-    return results_df
-
+    return results_df
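A minimal usage sketch of the updated inference path (not part of the commit): it assumes the module is imported as doc2vec with src/ on the import path, so the relative models/ and data/ fallbacks above resolve; LICENSE.txt is a placeholder input.

# Hypothetical usage sketch; assumes models/d2v.model and
# data/index_license_map.json exist relative to the working directory
# (or one level up), per the new fallback logic.
from doc2vec import inference

with open("LICENSE.txt") as f:      # any raw license text
    raw_text = f.read()

results_df = inference(raw_text)    # DataFrame of per-license similarity scores
print(results_df.head())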
src/textrank.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import nltk
 import numpy as np
 import gensim
@@ -5,68 +6,235 @@ import spacy
 import math
 from collections import Counter

-from src.clean import clean_license_text
-from src.read_data import read_file
+try:
+    from src.clean import clean_license_text
+    from src.read_data import read_file
+except:
+    from clean import clean_license_text
+    from read_data import read_file
+
+
+NEGATION_WEIGHT = 0.2
+
+nlp = spacy.load("en_core_web_sm")
+
+modal_verbs = {
+    "can",
+    "may",
+    "must",
+    "shall",
+    "will",
+    # "could",
+    # "might",
+    "should",
+    "would"
+}
+
+neg_modal = {
+    "cannot",
+    "may not",
+    "must not",
+    "shall not",
+    "will not",
+    # "could not",
+    # "might not",
+    "should not",
+    "would not"
+}
+
+# TODO Move these structures to another file
+license_stopwords = {
+    ",",
+    "(",
+    ")",
+    "software",
+    "license",
+    "work",
+    # "copyright",
+    "program",
+    # "use",
+    # "copy",
+    "source",
+    # "may",
+    # "terms",
+    "code",
+    # "without",
+    # "free",
+    # "distribute",
+    # "rights",
+    # "notice",
+    # "shall",
+    "provided",
+    # "permission",
+    # "including",
+    "version",
+    "library",
+    # "condition",
+    "covered",
+    # "must",
+    "public",
+    # "modify",
+    # "distribution",
+    # "warranty",
+}.union(nlp.Defaults.stop_words) - modal_verbs
+
+negation_words = {
+    "no",
+    "not",
+    "non"
+}
+
+# TODO: Consider adding these words to the vocab:
+# no-charge
+#
+#
+#
+#
+
+verbs = [
+    "permit", "copy", "modify", "change", "sell", "reproduce",
+    "transfer", "rent", "lease", "assign", "sublet", "distribute",
+    "redistribute", "allow", "require", "merge", "publish", "use",
+    "include", "grant", "run", "affirm", "propagate", "acknowledge"
+]
+
+neg_verbs = [f"not-{verb}" for verb in verbs]

 properties_dict = {
-    "modify":['modify', 'modification', 'change'],
-    "distribute":['distribute', 'distribution'],
-    "copy":['copy'],
-    "copyright": ['copyright']
-    # "exception"
+    "0.1": [
+    ],
+    "0.2": ["everyone"],
+    "0.3": ["irrevocable"],
+    "0.4": [],
+    "0.5": [],
+    "0.6": [
+        "distribution", "redistribution",
+        "permission", "modification",
+        "copyright",
+        "permission",
+        "limitation",
+        "free", "charge",
+        "warranty",
+        "term", "terms", "condition",
+        "right",
+        "sublicense",
+        "commercial", "non-commercial",
+        "exception"
+    ],
+    "0.7": verbs + [
+
+    ],
+    "0.8": [],
+    "0.9": neg_verbs + [],
+    "1.0": [],
+    "3.0": modal_verbs
 }

+
 properties_scores = {
-    "modify": 0.8,
-    "distribute": 0.8,
-    "copy": 0.8,
-    "copyright": 0.9
+    "0.1": 0.1,
+    "0.2": 0.2,
+    "0.3": 0.3,
+    "0.4": 0.4,
+    "0.5": 0.5,
+    "0.6": 0.6,
+    "0.7": 0.7,
+    "0.8": 0.8,
+    "0.9": 0.9,
+    "1.0": 1.0,
+    "3.0": 3.0
 }

-nlp = spacy.load('en_core_web_sm')

 def lemmatize_tokens(sent):
-    #TODO: Docstrings
-    '''each word in input sentence is converted to lemma'''
-    return [token.lemma_.lower() for token in nlp(sent)]
+    # TODO: Docstrings
+    """Each word in input sentence is converted to lemma"""
+    lemmas = list()
+
+    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]

+    for tok_i, token in enumerate(nlp_sent):
+        if (token
+                and token not in license_stopwords
+                and token not in negation_words):
+            if tok_i > 0 and nlp_sent[tok_i-1] in negation_words:
+                lemmas.append(f"{nlp_sent[tok_i-1]}-{token}")
+            elif tok_i > 1 and nlp_sent[tok_i-1] in " -" and nlp_sent[tok_i-2] in negation_words:
+                lemmas.append(f"{nlp_sent[tok_i-2]}-{token}")
+            else:
+                lemmas.append(token)

-def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
-    '''
+    return lemmas
+
+
+def custom_textrank_summarizer(license_text,
+                               min_sent_len=3,
+                               summary_len=0.3,
+                               debug=False):
+    """
     TODO: Doctrings
-    '''
-    sent_scores = {}
+    """
+    sent_scores = Counter()
+
     cleaned_license_text, definitions = clean_license_text(license_text)
-    cleaned_license_sentences = cleaned_license_text.split('.')
+
+    cleaned_license_sentences = re.split('(\n{2,}|\.)', cleaned_license_text)
+    cleaned_license_sentences = [
+        text.strip() for text in cleaned_license_sentences
+        if text.strip() not in ["", ".", "\n", "\n\n"]
+    ]
+
     summary_len = math.ceil(summary_len*len(cleaned_license_sentences))
+
     if debug:
-        print(f'summary length:{summary_len}')
-    if debug:
+        print(f"summary length:{summary_len}")
         print(cleaned_license_sentences)
-    for i in cleaned_license_sentences:
-        if debug:
-            print(i.split())
-        if len(i.split()) < min_sent_len:
+
+    for sent_i, sent in enumerate(cleaned_license_sentences):
+
+        if sent_i < 0:
             continue
+
+        if len(sent.split()) < min_sent_len:
+            continue
+
         score = 0
+
+        lemmatized_tokens = lemmatize_tokens(sent)
+
+        if debug:
+            print("-"*50)
+            print(f"\nOriginal Sentence = {sent}")
+            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
+
+        word_count = Counter([tok for tok in lemmatized_tokens])
+
         for prop, prop_words in properties_dict.items():
             prop_score = 0
-            lemmatized_tokens = lemmatize_tokens(i)
-            word_count = Counter([tok for tok in lemmatized_tokens])
-            for prop_word in prop_words:
+
+            imp_words = list()
+
+            for prop_i, prop_word in enumerate(prop_words):
                 if prop_word in word_count.keys():
                     prop_score += properties_scores[prop]
+                    imp_words.append(prop_word)
+
             if debug:
-                print(prop, "=", prop_score)
+                print(prop, "=", imp_words, "=", prop_score)
+
             score += prop_score
-        sent_scores[i] = score/len(lemmatized_tokens)
+
+        sent_scores[sent] = score / len(lemmatized_tokens)
+
         if debug:
-            print(f'Sentence score: {sent_scores[i]}')
+            print(f"Sentence score: {sent_scores[sent]}")
             print()
+
     if debug:
         print(sent_scores)
-    sorted_sent_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))
-    summary = '.\n'.join(list(sorted_sent_scores.keys())[:summary_len])
-    return summary, definitions

+    sorted_sent_scores = sent_scores.most_common()
+
+    summary = ".\n".join(sent for sent, score in sorted_sent_scores[:summary_len])

+    return summary, definitions
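
Likewise, a minimal sketch of calling the rewritten summarizer (not part of the commit): it assumes src/ is on the import path, and the license file path below is a placeholder.

# Hypothetical usage sketch; the input path is a placeholder.
from textrank import custom_textrank_summarizer

with open("data/licenses/sample_license.txt") as f:
    license_text = f.read()

# summary_len is the fraction of cleaned sentences kept (rounded up);
# debug=True prints per-sentence lemmas and property scores.
summary, definitions = custom_textrank_summarizer(license_text,
                                                  min_sent_len=3,
                                                  summary_len=0.3,
                                                  debug=False)
print(summary)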