Nihal D'Souza committed on
Commit
19dab1b
·
1 Parent(s): ac750db

updating src module imports with try-except clause

Browse files
Files changed (2) hide show
  1. src/clean.py +643 -48
  2. src/doc2vec.py +4 -1
src/clean.py CHANGED
@@ -2,12 +2,62 @@ import re
2
  import json
3
  from bs4 import BeautifulSoup
4
  from striprtf.striprtf import rtf_to_text
 
5
 
6
 
7
  PARA_BREAK = "para___break"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def php_cleaner(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  try:
12
  return re.findall("\/\*[\S\s]*?\*\/", text)[0]
13
  except:
@@ -16,6 +66,20 @@ def php_cleaner(text):
16
 
17
 
18
  def html_cleaner(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  soup = BeautifulSoup(text)
20
  text = soup.body.text
21
  if not text:
@@ -24,6 +88,20 @@ def html_cleaner(text):
24
 
25
 
26
  def json_cleaner(text_dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  out = ""
28
  for key in text_dict.keys():
29
  if key in ("description", "license"):
@@ -34,60 +112,101 @@ def json_cleaner(text_dict):
34
  return out
35
 
36
 
37
- def discard_text_after_tnc(text):
38
- return text.split("END OF TERMS AND CONDITIONS")[0]
 
39
 
 
 
 
 
40
 
41
- def gnu_cleaner(text):
42
- t = text.split('END OF TERMS AND CONDITIONS')[0]
43
- definitions = ""
44
- if 'Preamble' in text:
45
- if len(t.split('Preamble')[0])>100:
46
- t0 = t.split('Preamble')[0]
47
- try:
48
- t1 = t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
49
- except:
50
- try:
51
- t1 = t.split('Preamble')[1].split('distribution and\n\nmodification follow')[1]
52
- except:
53
- t1 = t.split('Preamble')[1].split('distribution and modification follow')[1]
54
- t = t0+t1
55
- else:
56
- t = t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
57
- if 'Definitions' in text:
58
- try:
59
- def_pos = re.search(r"[0-9]\.? (Additional )?Definitions",t).span()
60
- other_start_pos = re.search(r"[0-9]\.? [A-Z][a-z]+",t[def_pos[1]:]).span()[0]
61
- definitions = t[def_pos[0]: def_pos[1] + other_start_pos]
62
- t = t[:def_pos[0]] + t[def_pos[1]+other_start_pos:]
63
- except:
64
- t = t
65
- return t, definitions
66
-
67
 
68
- def rtf_cleaner(text):
69
  return rtf_to_text(text)
70
 
71
 
72
  def url_cleaner(text):
73
- return re.sub(r"http\S+", "", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
 
76
  def email_cleaner(text):
77
- return re.sub(r"\S*@\S*", "", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
 
80
  def var_cleaner(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  text = re.sub(r"\$\w+", "", text)
82
  text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
 
83
  return text
84
 
85
 
86
  def character_cleaner(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  text = url_cleaner(text)
88
  text = email_cleaner(text)
89
  text = var_cleaner(text)
90
 
 
91
  text = re.sub("[\n]{2,}", ". ", text)
92
  text = re.sub("[:%#<>=*\-/·\s{}]+", " ", text)
93
  text = re.sub("[\. ]{2,}", ". ", text)
@@ -95,6 +214,20 @@ def character_cleaner(text):
95
 
96
 
97
  def isEnglish(s):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  try:
99
  s.encode(encoding="utf-8").decode("ascii")
100
  except UnicodeDecodeError:
@@ -103,15 +236,227 @@ def isEnglish(s):
103
  return True
104
 
105
 
106
- def preprocess_text(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  definitions = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  if "GNU" in text or "Apache" in text:
109
- text, definitions = gnu_cleaner(text)
110
- definitions = definitions.strip()
111
- return text, definitions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
 
114
  def script_cleaner(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  if "<?php" in text:
116
  text = php_cleaner(text)
117
  elif "</html>" in text:
@@ -122,50 +467,300 @@ def script_cleaner(text):
122
  text = rtf_cleaner(text)
123
  if not text:
124
  return ""
 
 
125
  return text
126
 
127
 
128
- def split_paras(text):
129
- if "\n\n\n\n" in text:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  paras = text.split("\n\n\n\n")
131
- elif "\n\n\n" in text:
132
- paras = text.split("\n\n\n")
133
- elif "\n\n" in text:
134
  paras = text.split("\n\n")
 
 
 
135
  else:
136
  paras = [text]
 
 
 
 
 
 
 
 
 
 
137
  return paras
138
 
139
 
140
- def clean_paras(paras):
141
- return paras
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- def clean_license_text(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  if len(text) == 0:
147
  return text
148
 
 
 
 
149
  text = script_cleaner(text)
150
- text, definitions = preprocess_text(text)
151
- paras = clean_paras(split_paras(text))
 
 
152
  text = PARA_BREAK.join(paras)
153
  text = character_cleaner(text)
154
  text = re.sub(PARA_BREAK, "\n\n", text)
155
  text = text.strip()
156
 
 
157
  if not isEnglish(text):
158
  if not isEnglish(" ".join(text.split()[-5:-1])):
159
  return "", ""
160
-
 
 
 
 
 
 
161
  return text, definitions
162
 
163
 
164
  """
165
  Notes:
166
 
167
- 1. Regex for other definitions: --------> ".{0,20}".{0,40}means
168
- 2. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
169
  -> Merge with the next para
170
  Ex. "8. Termination."
171
  """
 
2
  import json
3
  from bs4 import BeautifulSoup
4
  from striprtf.striprtf import rtf_to_text
5
+ from collections import defaultdict
6
 
7
 
8
  PARA_BREAK = "para___break"
9
+ seperator = "=" * 50
10
+ verbosity = 0
11
+
12
+
13
+ def extract_author_details(text, verbosity=0):
14
+ """
15
+ Extracts important author information from the license text.
16
+
17
+ Parameters
18
+ ----------
19
+ text : str
20
+ Raw License text.
21
+ verbosity : int, optional
22
+ The level of print statements on the output console. The default is 0.
23
+
24
+ Returns
25
+ -------
26
+ text : str
27
+ License text with author details removed.
28
+ author_details : list
29
+ A list of important author details.
30
+
31
+ """
32
+ author_details_pattern = r"(@(author|license|copyright|package).*)"
33
+ author_details = list()
34
+ text = re.sub(author_details_pattern, lambda m: author_details.append(m.group(1)), text)
35
+ if author_details and verbosity != 0:
36
+ print(seperator)
37
+ print(seperator)
38
+ print("Following author details were extracted:")
39
+ print(seperator)
40
+ print(author_details)
41
+ print()
42
+
43
+ return text, author_details
44
 
45
 
46
  def php_cleaner(text):
47
+ """
48
+ Cleans the license file in PHP format.
49
+
50
+ Parameters
51
+ ----------
52
+ text : str
53
+ Raw License text.
54
+
55
+ Returns
56
+ -------
57
+ str
58
+ Cleaned License text with PHP script removed.
59
+
60
+ """
61
  try:
62
  return re.findall("\/\*[\S\s]*?\*\/", text)[0]
63
  except:
 
66
 
67
 
68
  def html_cleaner(text):
69
+ """
70
+ Cleans the license file in HTML format.
71
+
72
+ Parameters
73
+ ----------
74
+ text : str
75
+ Raw License text.
76
+
77
+ Returns
78
+ -------
79
+ str
80
+ Cleaned License text with HTML script removed.
81
+
82
+ """
83
  soup = BeautifulSoup(text)
84
  text = soup.body.text
85
  if not text:
 
88
 
89
 
90
  def json_cleaner(text_dict):
91
+ """
92
+ Cleans the license file in JSON format.
93
+
94
+ Parameters
95
+ ----------
96
+ text : str
97
+ Raw License text.
98
+
99
+ Returns
100
+ -------
101
+ str
102
+ Cleaned License text with JSON format normalized to text.
103
+
104
+ """
105
  out = ""
106
  for key in text_dict.keys():
107
  if key in ("description", "license"):
 
112
  return out
113
 
114
 
115
+ def rtf_cleaner(text):
116
+ """
117
+ Cleans the license file in RTF format.
118
 
119
+ Parameters
120
+ ----------
121
+ text : str
122
+ Raw License text.
123
 
124
+ Returns
125
+ -------
126
+ str
127
+ Cleaned License text with RTF script removed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ """
130
  return rtf_to_text(text)
131
 
132
 
133
  def url_cleaner(text):
134
+ """
135
+ Removes URLs from the License text.
136
+
137
+ Parameters
138
+ ----------
139
+ text : str
140
+ Raw License text.
141
+
142
+ Returns
143
+ -------
144
+ str
145
+ Cleaned License text with URLs removed.
146
+
147
+ """
148
+ return re.sub(r"\(?http\S+\)?", "", text)
149
 
150
 
151
  def email_cleaner(text):
152
+ """
153
+ Removes emails from the License text.
154
+
155
+ Parameters
156
+ ----------
157
+ text : str
158
+ Raw License text.
159
+
160
+ Returns
161
+ -------
162
+ str
163
+ Cleaned License text with emails removed.
164
+
165
+ """
166
+ return re.sub(r"\S{3,}@\S{2,}\.\S+", "", text)
167
 
168
 
169
  def var_cleaner(text):
170
+ """
171
+ Removes potential variable names from the License text.
172
+
173
+ Parameters
174
+ ----------
175
+ text : str
176
+ Raw License text.
177
+
178
+ Returns
179
+ -------
180
+ str
181
+ Cleaned License text with variable names removed.
182
+
183
+ """
184
  text = re.sub(r"\$\w+", "", text)
185
  text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
186
+ # text = re.sub(r"[a-zA-Z\(\)_'\"]+\.[a-zA-Z_]+", "", text)
187
  return text
188
 
189
 
190
  def character_cleaner(text):
191
+ """
192
+ Removes unnecessary special characters from the License text.
193
+
194
+ Parameters
195
+ ----------
196
+ text : str
197
+ Raw License text.
198
+
199
+ Returns
200
+ -------
201
+ str
202
+ Cleaned License text with some special characters removed.
203
+
204
+ """
205
  text = url_cleaner(text)
206
  text = email_cleaner(text)
207
  text = var_cleaner(text)
208
 
209
+ text = re.sub("\s*(;quot;|&amp)\s*", " ", text)
210
  text = re.sub("[\n]{2,}", ". ", text)
211
  text = re.sub("[:%#<>=*\-/·\s{}]+", " ", text)
212
  text = re.sub("[\. ]{2,}", ". ", text)
 
214
 
215
 
216
  def isEnglish(s):
217
+ """
218
+ Checks whether the License text is in English or not.
219
+
220
+ Parameters
221
+ ----------
222
+ s : str
223
+ Raw License text.
224
+
225
+ Returns
226
+ -------
227
+ bool
228
+ True if complete License text is in English, False otherwise.
229
+
230
+ """
231
  try:
232
  s.encode(encoding="utf-8").decode("ascii")
233
  except UnicodeDecodeError:
 
236
  return True
237
 
238
 
239
+ def split_definitions_exceptions(text, verbosity=0):
240
+ """
241
+ Extract definitions from the License text
242
+
243
+ Parameters
244
+ ----------
245
+ text : str
246
+ Raw License text.
247
+
248
+ Returns
249
+ -------
250
+ paras : list
251
+ A list of paragraphs from License text with definitions and exceptions
252
+ removed.
253
+ definitions : str
254
+ Definitions extracted from the License text.
255
+ exceptions : list
256
+ A list of paragraphs which contain exceptions .
257
+
258
+ """
259
  definitions = ""
260
+
261
+ if "Definitions" in text:
262
+ try:
263
+ def_pattern = r"([S|s]ection )?[0-9] ?[\.|-|–]? ?([A|a]dditional )?[D|d]efinitions"
264
+ after_def_pattern = r"\s+(Section )?[0-9]\.? [\.|-|–]? ?[A-Z][a-z]+"
265
+ def_pos = re.search(def_pattern, text).span()
266
+ other_start_pos = re.search(after_def_pattern, text[def_pos[1]:]).span()[0]
267
+ definitions = text[def_pos[0]: def_pos[1] + other_start_pos].strip() + "\n\n"
268
+ text = text[:def_pos[0]] + text[def_pos[1] + other_start_pos:]
269
+ except:
270
+ pass
271
+
272
+ paras, more_defs = extract_relevant_paras(
273
+ split_paras(text, verbosity=verbosity),
274
+ verbosity=verbosity
275
+ )
276
+
277
+ definitions += more_defs.strip()
278
+ paras, exceptions = get_exeptions(paras, verbosity=verbosity)
279
+
280
+ return paras, definitions, exceptions
281
+
282
+
283
+ def discard_text_after_end_tnc(text):
284
+ """
285
+ Discards text after "END OF TERMS AND CONDITIONS"
286
+
287
+ Parameters
288
+ ----------
289
+ text : str
290
+ Raw License text.
291
+
292
+ Returns
293
+ -------
294
+ str
295
+ License text with irrelavant information after "END OF TERMS AND CONDITIONS" removed.
296
+
297
+ """
298
+ return text.split("END OF TERMS AND CONDITIONS")[0]
299
+
300
+
301
+ def clear_preamble(text):
302
+ """
303
+ Cleans Preamble from the License text
304
+
305
+ Parameters
306
+ ----------
307
+ text : str
308
+ Raw License text.
309
+
310
+ Returns
311
+ -------
312
+ text : str
313
+ License text with Preamble removed.
314
+
315
+ """
316
+ preamble_pattern = "Preamble"
317
+ dist_and_mod_pattern = "distribution\s+and\s+modification\s+follow\.?"
318
+
319
+ if preamble_pattern in text:
320
+ preamble_split = text.split(preamble_pattern)
321
+
322
+ if len(preamble_split) != 2:
323
+ return text
324
+
325
+ try:
326
+ after_preamble_end = re.split(dist_and_mod_pattern, preamble_split[1])[1]
327
+
328
+ # TODO Why do we need this condition?
329
+ if len(preamble_split[0]) > 100:
330
+ text = preamble_split[0] + after_preamble_end.strip()
331
+ except:
332
+ pass
333
+ return text
334
+
335
+
336
+ def gnu_cleaner(text):
337
+ """
338
+ Cleans GNU text such as discarding Preamble and text after end of terms
339
+ and conditions.
340
+
341
+ Parameters
342
+ ----------
343
+ text : str
344
+ Raw License text.
345
+
346
+ Returns
347
+ -------
348
+ preamble_cleared_text : str
349
+ License text with irrelavant information in Preamble and text after end
350
+ of terms and conditions removed.
351
+
352
+ """
353
+
354
+ before_end_tnc = discard_text_after_end_tnc(text)
355
+ preamble_cleared_text = clear_preamble(before_end_tnc)
356
+
357
+ return preamble_cleared_text
358
+
359
+
360
+ def preprocess_text(text):
361
+ """
362
+ Preprocesses License text considering different License types.
363
+
364
+ Parameters
365
+ ----------
366
+ text : str
367
+ Raw License text.
368
+
369
+ Returns
370
+ -------
371
+ text : str
372
+ Cleaned License text.
373
+
374
+ """
375
+
376
+ # if most_likely_license_type in [
377
+ # "GPL-3.0-only",
378
+ # "AGPL-3.0-only",
379
+ # "GPL-2.0-only",
380
+ # "LGPL-3.0-only",
381
+ # "LGPL-2.1-only",
382
+ # ]:
383
+
384
+ # # We need to take care of these cases too:
385
+ # # https://choosealicense.com/licenses/ofl-1.1/
386
+ # # https://choosealicense.com/licenses/lodbl-1.0/
387
+ # # https://choosealicense.com/licenses/odbl-1.0/
388
+ # # https://choosealicense.com/licenses/lms-rl/
389
+ # # https://choosealicense.com/licenses/lms-pl/
390
+ # # https://choosealicense.com/licenses/lmpl-2.0/
391
+ # # https://choosealicense.com/licenses/lppl-1.3c/
392
+ # # https://choosealicense.com/licenses/eupl-1.2/
393
+ # # https://choosealicense.com/licenses/eupl-1.1/
394
+ # # https://choosealicense.com/licenses/epl-2.0/
395
+ # # https://choosealicense.com/licenses/epl-1.0/
396
+ # # https://choosealicense.com/licenses/ecl-2.0/
397
+ # # https://choosealicense.com/licenses/cecill-2.1/
398
+ # # https://choosealicense.com/licenses/cc-by-sa-4.0/
399
+ # # https://choosealicense.com/licenses/cc-by-4.0/
400
+ # # https://choosealicense.com/licenses/artistic-2.0/
401
+ # # https://choosealicense.com/licenses/apache-2.0/
402
+
403
+ # TODO This condition will not work, fix it:
404
  if "GNU" in text or "Apache" in text:
405
+ text = gnu_cleaner(text)
406
+ return text
407
+
408
+
409
+ def clean_if_else(text):
410
+ """
411
+ Removes specific if-else conditions from the License text
412
+
413
+ Parameters
414
+ ----------
415
+ text : str
416
+ Raw License text.
417
+
418
+ Returns
419
+ -------
420
+ str
421
+ Cleaned License text with if-else conditions removed.
422
+
423
+ """
424
+ return re.sub(r"#\bif[\s\S]+?#endif\s*", "", text).strip()
425
+
426
+
427
+ def clean_comments(text):
428
+ """
429
+ Cleans specific comment formats from the License texts
430
+
431
+ Parameters
432
+ ----------
433
+ text : str
434
+ Raw License text.
435
+
436
+ Returns
437
+ -------
438
+ str
439
+ Cleaned License text with comments conditions removed.
440
+
441
+ """
442
+ return re.sub(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}", "", text).strip()
443
 
444
 
445
  def script_cleaner(text):
446
+ """
447
+ Cleans the script text from License text to extract the main content.
448
+
449
+ Parameters
450
+ ----------
451
+ text : str
452
+ Raw License text.
453
+
454
+ Returns
455
+ -------
456
+ str
457
+ Cleaned License text without scripts.
458
+
459
+ """
460
  if "<?php" in text:
461
  text = php_cleaner(text)
462
  elif "</html>" in text:
 
467
  text = rtf_cleaner(text)
468
  if not text:
469
  return ""
470
+ text = clean_if_else(text)
471
+ text = clean_comments(text)
472
  return text
473
 
474
 
475
+ def split_paras(text, verbosity=0):
476
+ """
477
+ Splits the text into paragraphs.
478
+
479
+ Parameters
480
+ ----------
481
+ text : str
482
+ Raw License text.
483
+ verbosity : int, optional
484
+ The level of print statements on the output console. The default is 0.
485
+
486
+ Returns
487
+ -------
488
+ paras : list
489
+ A list of split paragraphs.
490
+
491
+ """
492
+ text = re.sub(r"\n{4,}", "\n"*4, text)
493
+ if len(re.findall("\n\n\n\n", text)) >= 2:
494
  paras = text.split("\n\n\n\n")
495
+ paras = [re.sub(r"\n{1,3}", " ", para) for para in paras]
496
+ elif len(re.findall("\n\n", text)) >= 2:
 
497
  paras = text.split("\n\n")
498
+ paras = [re.sub(r"\n", " ", para) for para in paras]
499
+ elif len(re.findall("\n", text)) >= 2:
500
+ paras = text.split("\n")
501
  else:
502
  paras = [text]
503
+ if verbosity != 0:
504
+ print(seperator)
505
+ print(seperator)
506
+ print("These are the split paras in the text:")
507
+ for para in paras:
508
+ if not para.strip():
509
+ continue
510
+ print(seperator)
511
+ print(para)
512
+ print()
513
  return paras
514
 
515
 
516
+ def extract_relevant_paras(paras, verbosity=0):
517
+ """
518
+ Extracts relevant paragraphs from the list of all paragraphs.
519
+
520
+ Parameters
521
+ ----------
522
+ paras : list
523
+ A list of split paragraphs.
524
+ verbosity : int, optional
525
+ The level of print statements on the output console. The default is 0.
526
+
527
+ Returns
528
+ -------
529
+ cleaned_paras : list
530
+ A list of relevant paragraphs.
531
+ definitions : str
532
+ Definition text as extracted by the "clean_definitions_pattern", which
533
+ is to be appended to other definitons in the License text if any.
534
 
535
+ """
536
+ cleaned_paras = list()
537
+ definitions = ""
538
+
539
+ # TODO This might be interesting to look into:
540
+ # https://choosealicense.com/licenses/eupl-1.2/
541
+
542
+ clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
543
+
544
+ if verbosity != 0:
545
+ print(seperator)
546
+ print(seperator)
547
+ print("Following paragraphs were considered unnecessary and removed:")
548
+ for para in paras:
549
+ if not para.strip():
550
+ continue
551
+ if re.search(clean_definitions_pattern, para):
552
+ definitions += para + "\n\n"
553
+ if verbosity != 0:
554
+ print(seperator)
555
+ print(para)
556
+ else:
557
+ cleaned_paras.append(para)
558
+ if verbosity != 0:
559
+ print()
560
+
561
+ definitions = definitions.strip()
562
+
563
+ return cleaned_paras, definitions
564
+
565
+
566
+ def get_all_caps(text, verbosity=0):
567
+ """
568
+ Extracts text with all caps content from the License text.
569
+
570
+ Parameters
571
+ ----------
572
+ text : str
573
+ Raw License text.
574
+ verbosity : int, optional
575
+ The level of print statements on the output console. The default is 0.
576
+
577
+ Returns
578
+ -------
579
+ text : str
580
+ License text with all caps sentences removed.
581
+ all_caps : list
582
+ A list of all caps sentences from the License text.
583
+
584
+ """
585
+ all_caps_pattern = r"([^a-z\n]{50,})"
586
+ all_caps = list()
587
+ text = re.sub(all_caps_pattern, lambda m: all_caps.append(m.group(1)), text)
588
+ text = re.sub(r"\n{3,}", "\n\n", text)
589
+ if all_caps and verbosity != 0:
590
+ print(seperator)
591
+ print(seperator)
592
+ print("Following all caps were removed from the text:")
593
+ print(all_caps)
594
+ print()
595
+ return text, all_caps
596
+
597
+
598
+ def get_exeptions(paras, verbosity=0):
599
+ """
600
+ Extracts a list of exceptions from the License text.
601
+
602
+ Parameters
603
+ ----------
604
+ paras : list
605
+ A list of paragraphs from the License text.
606
+ verbosity : int, optional
607
+ The level of print statements on the output console. The default is 0.
608
+
609
+ Returns
610
+ -------
611
+ non_exception_paras : list
612
+ A list of all paragraphs not containing exceptions from the License text.
613
+ exceptions : list
614
+ A list of all paragraphs containing exceptions from the License text.
615
+
616
+ """
617
+ non_exception_paras = list()
618
+ exceptions = list()
619
+
620
+ for para in paras:
621
+ if re.search("exception", para.lower()):
622
+ exceptions.append(para)
623
+ else:
624
+ non_exception_paras.append(para)
625
+
626
+ if exceptions and verbosity != 0:
627
+ print(seperator)
628
+ print(seperator)
629
+ print("Following exceptions were found in the text:")
630
+ for exception in exceptions:
631
+ print(seperator)
632
+ print(exception)
633
+ print()
634
+
635
+ return non_exception_paras, exceptions
636
+
637
+
638
+ def get_MIT_content(text):
639
+ """
640
+ Returns the content of the MIT-like-licenses segregated into categories like
641
+ Copyright, main content, etc.
642
+
643
+ Parameters
644
+ ----------
645
+ text : str
646
+ Cleaned MIT License text.
647
+
648
+ Returns
649
+ -------
650
+ dictionary
651
+ A dictionary of content from the MIT license. Keys are the type of
652
+ content and values are the License contents from License text.
653
+ """
654
+ paras = split_paras(text)
655
+
656
+ mit_content = defaultdict(list)
657
+
658
+ for para in paras:
659
+ para = para.strip()
660
+ if len(para) < 1:
661
+ continue
662
+ if len(para.split()) <= 10 and ("Licens" in para or "licens" in para) and "Copyright" not in para:
663
+ mit_content["header"].append(para)
664
+ elif "Copyright" in para:
665
+ if "is hereby granted" in para:
666
+ mit_content["copyright+content"].append(para)
667
+ else:
668
+ mit_content["copyright"].append(para)
669
+ elif "Permission is hereby granted" in para:
670
+ mit_content["content"].append(para)
671
+ elif "The above copyright notice" in para or len(para.split()) < 18:
672
+ mit_content["sentence"].append(para)
673
+ elif get_all_caps(para)[1]:
674
+ mit_content["all_cap"].append(para)
675
+ else:
676
+ mit_content["content"].append(para)
677
+
678
+ for key, value in mit_content.items():
679
+ mit_content[key] = "\n\n".join(value)
680
+
681
+ return mit_content
682
+
683
+
684
+ def get_most_likely_license_type(text):
685
+ """
686
+ Returns the most likely license type based on Doc2Vec scores
687
+ (similarity > 0.9).
688
+
689
+ Parameters
690
+ ----------
691
+ text : str
692
+ Raw License text.
693
+
694
+ Returns
695
+ -------
696
+ str
697
+ The type of the most likely license. "Not found" if no license score is
698
+ above 0.9
699
+ """
700
+ from src.doc2vec import inference
701
+
702
+ top1_result = inference(text).loc[0, :]
703
+
704
+ if top1_result["Scores"] > 0.9:
705
+ return top1_result["License"]
706
+ else:
707
+ return "Not Found"
708
+
709
+
710
+ def clean_license_text(text, verbosity=0):
711
+ """
712
+ Cleans License text.
713
 
714
+ Parameters
715
+ ----------
716
+ text : str
717
+ Raw License text.
718
+ verbosity : int, optional
719
+ The level of print statements on the output console. The default is 0.
720
+
721
+ Returns
722
+ -------
723
+ text : str
724
+ Cleaned License text.
725
+ definitions : str
726
+ Definitions extracted from the License text.
727
+
728
+ """
729
 
730
  if len(text) == 0:
731
  return text
732
 
733
+ most_likely_license_type = get_most_likely_license_type(text)
734
+
735
+ text, author_details = extract_author_details(text, verbosity=verbosity)
736
  text = script_cleaner(text)
737
+ text = preprocess_text(text)
738
+ paras, definitions, exceptions = split_definitions_exceptions(
739
+ text, verbosity=verbosity
740
+ )
741
  text = PARA_BREAK.join(paras)
742
  text = character_cleaner(text)
743
  text = re.sub(PARA_BREAK, "\n\n", text)
744
  text = text.strip()
745
 
746
+ # TODO Need to update this too:
747
  if not isEnglish(text):
748
  if not isEnglish(" ".join(text.split()[-5:-1])):
749
  return "", ""
750
+
751
+ if "MIT" in most_likely_license_type:
752
+ mit_content = get_MIT_content(text)
753
+ if verbosity != 0:
754
+ print("This is likely an MIT License!")
755
+ print(mit_content)
756
+
757
  return text, definitions
758
 
759
 
760
  """
761
  Notes:
762
 
763
+ 1. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
 
764
  -> Merge with the next para
765
  Ex. "8. Termination."
766
  """
src/doc2vec.py CHANGED
@@ -4,7 +4,10 @@ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
4
  import pandas as pd
5
  import json
6
 
7
- from clean import preprocess_text, script_cleaner
 
 
 
8
 
9
  MODEL_PATH = 'models/d2v.model'
10
  LICENSE_INDEX_PATH = 'data/index_license_map.json'
 
4
  import pandas as pd
5
  import json
6
 
7
+ try:
8
+ from src.clean import preprocess_text, script_cleaner
9
+ except:
10
+ from clean import preprocess_text, script_cleaner
11
 
12
  MODEL_PATH = 'models/d2v.model'
13
  LICENSE_INDEX_PATH = 'data/index_license_map.json'