Nihal D'Souza committed on
Commit
cedd239
·
1 Parent(s): fc6772f

Adding cleaning function

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -0
  2. src/clean.py +80 -0
requirements.txt CHANGED
@@ -95,6 +95,7 @@ smmap==5.0.0
95
  soupsieve==2.3.2.post1
96
  stack-data==0.2.0
97
  streamlit==1.9.0
 
98
  terminado==0.15.0
99
  threadpoolctl==3.1.0
100
  tinycss2==1.1.1
 
95
  soupsieve==2.3.2.post1
96
  stack-data==0.2.0
97
  streamlit==1.9.0
98
+ striprtf==0.0.20
99
  terminado==0.15.0
100
  threadpoolctl==3.1.0
101
  tinycss2==1.1.1
src/clean.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ from bs4 import BeautifulSoup
4
+ from striprtf.striprtf import rtf_to_text
5
+ import json
6
+ import nltk as nltk
7
+
8
+
9
def php_cleaner(text):
    """Return the first line of the doc comment that opens a PHP file.

    The pattern looks for ``<?php``, two line breaks, ``/**``, a line break
    and `` *``, and captures everything up to the last line break followed
    by `` */``.

    NOTE(review): the line breaks in the pattern are the two-character
    sequence backslash + ``n``, not real newline characters, so this only
    matches text whose newlines are stored in escaped form. Confirm that is
    what callers pass in.

    Raises:
        IndexError: if the header comment is not found.
    """
    header_comment = re.compile(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)")
    candidates = header_comment.findall(text)
    # Indexing (rather than returning a default) keeps the IndexError that
    # signals "no header comment found".
    return candidates[0]
11
def html_cleaner(text):
    """Strip HTML markup and return the visible text of the document.

    The parser is named explicitly so the result does not depend on which
    optional parsers happen to be installed (and so Beautiful Soup does not
    emit its "no parser was explicitly specified" warning).

    Args:
        text: HTML source as a string.

    Returns:
        The text content of ``<body>``; if the document has no ``<body>``
        element, the text content of the whole document.
    """
    soup = BeautifulSoup(text, "html.parser")
    # html.parser does not invent a <body> for fragments, so soup.body may be
    # None; fall back to the whole tree instead of raising AttributeError.
    container = soup.body if soup.body is not None else soup
    return container.get_text()
14
def json_cleaner(text):
    """Render the 'description' and 'license' entries of a mapping as text.

    Despite its name, ``text`` is iterated for keys and indexed by key, i.e.
    it is used as an already-parsed mapping rather than a raw string.

    Returns:
        A string of ``"<key>: <value>, "`` fragments, one per matching key in
        iteration order (each fragment, including the last, ends with
        ``", "``). Empty string when neither key is present.
    """
    wanted = ('description', 'license')
    fragments = [
        key + ": " + str(text[key]) + ", "
        for key in text
        if key in wanted
    ]
    return "".join(fragments)
23
def gnu_cleaner(text):
    """Trim boilerplate from a GNU-style license text.

    Steps:
      1. Drop everything from 'END OF TERMS AND CONDITIONS' onwards.
      2. If the remaining text has a 'Preamble', drop the preamble, i.e. the
         span from 'Preamble' up to and including the sentence fragment
         'distribution and modification follow'. Any whitespace (space,
         one or more newlines, CRLF) is accepted between 'and' and
         'modification'.
      3. Text before 'Preamble' is kept only when it is longer than 100
         characters (presumably a shorter head is just the license title;
         the threshold is unchanged from the previous implementation).

    Args:
        text: full license text.

    Returns:
        The trimmed text. When there is no 'Preamble', or the
        'distribution and modification follow' marker cannot be found after
        it, the text truncated in step 1 is returned as is instead of
        raising IndexError.
    """
    body = text.split('END OF TERMS AND CONDITIONS')[0]

    # Look for the preamble in the truncated body, not the full input: a
    # 'Preamble' that only occurs after the cut-off cannot be split on.
    head, found, after_preamble = body.partition('Preamble')
    if not found:
        return body

    # maxsplit=1 keeps everything after the first marker, even if the same
    # phrase happens to occur again later in the terms.
    parts = re.split(
        r'distribution and\s+modification follow', after_preamble, maxsplit=1
    )
    if len(parts) < 2:
        # Marker missing: nothing can be stripped reliably.
        return body

    terms = parts[1]
    if len(head) > 100:
        return head + terms
    return terms
40
def rtf_cleaner(text):
    """Convert RTF markup to plain text by delegating to ``rtf_to_text``."""
    plain_text = rtf_to_text(text)
    return plain_text
42
def character_cleaner(text):
    """Delete decorative characters and newlines from ``text``.

    Removes every run of ``=``, ``*``, ``-``, ``/``, ``·`` and newline.

    The hyphen is escaped on purpose: written as ``*-/`` inside a character
    class it is a range from ``*`` to ``/``, which also deleted ``+``, ``,``
    and ``.`` and thereby destroyed sentence punctuation.

    NOTE(review): newlines are removed, not replaced by a space, so words on
    adjacent lines are joined. This matches the previous behaviour; confirm
    it is intended.
    """
    return re.sub(r"[=*\-/·\n]+", "", text)
44
def url_cleaner(text):
    """Remove URL-like tokens from ``text``.

    Every occurrence of ``http`` followed by at least one non-whitespace
    character is deleted, up to the next whitespace. Surrounding whitespace
    is left in place.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
46
def isEnglish(s):
    """Return True if ``s`` consists solely of ASCII characters.

    This is a character-set check, not language detection: any string with a
    non-ASCII character is reported as not English, and the empty string is
    reported as English.

    Uses ``str.isascii()`` instead of an encode/decode round trip, which
    raised an uncaught UnicodeEncodeError for strings containing lone
    surrogates; such strings now simply return False.
    """
    return s.isascii()
53
+
54
+ # input as a text
55
def clean_license_text(text):
    """Clean a raw license text and return it, or "" if it is unusable.

    The input is stripped and then handled by the first matching rule:
      * starts with ``<?php``             -> ``php_cleaner``
      * contains ``</html>``              -> ``html_cleaner``
      * starts with ``{`` and ends ``}``  -> parsed as JSON, ``json_cleaner``
      * contains ``GNU`` or ``Apache``    -> ``gnu_cleaner``
      * contains ``\\rtf``                -> ``rtf_cleaner``
      * otherwise the text is used as is.
    URLs and decorative characters are then removed with ``url_cleaner`` and
    ``character_cleaner``.

    Args:
        text: license text as a string (not a file path).

    Returns:
        The cleaned text. "" is returned when the input is empty, when the
        PHP header comment cannot be extracted, or when the result fails the
        ASCII check at the end of the function.
    """
    text = text.strip()
    if not text:
        # Nothing to clean; also avoids IndexError on text[0] below.
        return ""

    if text.startswith('<?php'):
        try:
            t = php_cleaner(text)
        except IndexError:
            # php_cleaner found no header comment.
            return ""
    elif "</html>" in text:
        t = html_cleaner(text)
    elif text[0] == '{' and text[-1] == '}':
        # Parse the text itself. The earlier code opened an undefined name
        # `file`, so this branch always raised NameError.
        try:
            t = json_cleaner(json.loads(text))
        except ValueError:
            # Brace-wrapped but not valid JSON: treat it as plain text.
            t = text
    elif "GNU" in text or "Apache" in text:
        t = gnu_cleaner(text)
    elif "\\rtf" in text:
        t = rtf_cleaner(text)
    else:
        t = text

    t = url_cleaner(t)
    t = character_cleaner(t)

    # Keep the text if it is entirely ASCII, or if the (up to) four words
    # immediately before its last word are ASCII; otherwise discard it.
    if not isEnglish(t) and not isEnglish(' '.join(t.split()[-5:-1])):
        return ""
    return t