Spaces:
Runtime error
Runtime error
Nihal D'Souza
committed on
Commit
·
cedd239
1
Parent(s):
fc6772f
Adding cleaning function
Browse files- requirements.txt +1 -0
- src/clean.py +80 -0
requirements.txt
CHANGED
@@ -95,6 +95,7 @@ smmap==5.0.0
|
|
95 |
soupsieve==2.3.2.post1
|
96 |
stack-data==0.2.0
|
97 |
streamlit==1.9.0
|
|
|
98 |
terminado==0.15.0
|
99 |
threadpoolctl==3.1.0
|
100 |
tinycss2==1.1.1
|
|
|
95 |
soupsieve==2.3.2.post1
|
96 |
stack-data==0.2.0
|
97 |
streamlit==1.9.0
|
98 |
+
striprtf==0.0.20
|
99 |
terminado==0.15.0
|
100 |
threadpoolctl==3.1.0
|
101 |
tinycss2==1.1.1
|
src/clean.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
from striprtf.striprtf import rtf_to_text
|
5 |
+
import json
|
6 |
+
import nltk as nltk
|
7 |
+
|
8 |
+
|
9 |
+
def php_cleaner(text):
    """Pull the body of the leading doc comment out of an escaped PHP file.

    The pattern expects line breaks to appear in ``text`` as the literal
    two-character sequence backslash + ``n`` (not real newlines). It captures
    everything between the ``<?php`` / ``/**`` / `` *`` opener and the closing
    `` */`` of the comment.

    Returns the first match. Raises ``IndexError`` when the text does not
    contain such a header.
    """
    header_pattern = r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)"
    matches = re.findall(header_pattern, text)
    return matches[0]
|
11 |
+
def html_cleaner(text):
    """Return the text content of an HTML document, without markup.

    Args:
        text: HTML source as a string.

    Returns:
        The text inside ``<body>`` when the document has one, otherwise the
        text of the whole parsed document.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines. "html.parser" ships with the standard library.
    soup = BeautifulSoup(text, "html.parser")
    # A document without a <body> element yields soup.body is None; fall back
    # to the whole tree instead of failing with AttributeError.
    container = soup.body if soup.body is not None else soup
    return container.get_text()
|
14 |
+
def json_cleaner(text):
    """Render the licence-related entries of a parsed JSON mapping as text.

    Only the ``description`` and ``license`` keys are kept, in the order they
    occur in ``text``. Each kept entry is written as ``"<key>: <value>, "``
    (the trailing separator is kept after the last entry as well).

    Returns an empty string when neither key is present.
    """
    wanted = ('description', 'license')
    fragments = [
        name + ": " + str(text[name]) + ", "
        for name in text
        if name in wanted
    ]
    return "".join(fragments)
|
23 |
+
def gnu_cleaner(text):
    """Strip the preamble and trailing appendix from a GNU/Apache-style licence.

    Everything from ``END OF TERMS AND CONDITIONS`` onward is discarded. If the
    remaining text has a ``Preamble`` section, the part from ``Preamble`` up to
    and including the sentence ending in "distribution and modification
    follow" is removed as well. The text that precedes ``Preamble`` is kept
    only when it is longer than 100 characters.

    Args:
        text: Full licence text.

    Returns:
        The cleaned licence text. When the expected markers are not found the
        text before ``END OF TERMS AND CONDITIONS`` is returned unchanged
        instead of raising ``IndexError``.
    """
    terms = text.split('END OF TERMS AND CONDITIONS')[0]

    sections = terms.split('Preamble')
    if len(sections) < 2:
        # No preamble before the end-of-terms marker: nothing more to strip.
        return terms
    header, after_preamble = sections[0], sections[1]

    # The closing sentence of the preamble is wrapped differently across
    # licence files; try the known spellings in a fixed order.
    markers = (
        'distribution and\nmodification follow',
        'distribution and\n\nmodification follow',
        'distribution and modification follow',
    )
    body = None
    for marker in markers:
        parts = after_preamble.split(marker)
        if len(parts) > 1:
            body = parts[1]
            break

    if body is None:
        # Preamble present but its closing sentence is not recognised:
        # keep the text as-is rather than failing.
        return terms

    # A short header is treated as just a title line and dropped.
    if len(header) > 100:
        return header + body
    return body
|
40 |
+
def rtf_cleaner(text):
    """Convert RTF source to plain text by delegating to ``rtf_to_text``."""
    plain_text = rtf_to_text(text)
    return plain_text
|
42 |
+
def character_cleaner(text):
    """Remove decoration characters and line breaks from ``text``.

    Deletes every ``=``, ``*``, ``-``, ``/``, ``·`` and newline character.

    Args:
        text: Text to clean.

    Returns:
        ``text`` with the characters above removed.
    """
    # The hyphen is escaped on purpose: an unescaped "*-/" inside a character
    # class is a range (0x2A-0x2F) and would also delete "+", "," and ".",
    # destroying sentence punctuation.
    # NOTE(review): newlines are removed without inserting a space, so words
    # on adjacent lines are joined - confirm this is what callers expect.
    return re.sub(r"[=*\-/·\n]+", "", text)
|
44 |
+
def url_cleaner(text):
    """Delete every run of non-whitespace characters that starts with ``http``.

    Surrounding whitespace is left in place, so removing a URL from the middle
    of a sentence leaves the spaces on both sides.
    """
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)
|
46 |
+
def isEnglish(s):
    """Report whether ``s`` consists solely of ASCII characters.

    The string is encoded as UTF-8 and decoded as ASCII; any character outside
    the ASCII range makes the decode fail.

    Returns ``True`` for ASCII-only (including empty) strings, else ``False``.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
|
53 |
+
|
54 |
+
# input as a text
|
55 |
+
def _parse_json_object(text):
    """Return ``text`` parsed as a JSON object, or ``None`` if it is not one."""
    if not (text[0] == '{' and text[-1] == '}'):
        return None
    try:
        parsed = json.loads(text)
    except ValueError:
        # Brace-wrapped but not JSON (RTF documents also start with "{" and
        # end with "}"); let the caller try the other formats.
        return None
    return parsed if isinstance(parsed, dict) else None


def clean_license_text(text):
    """Detect the format of a licence text and reduce it to plain prose.

    The stripped input is classified in this order: PHP source, HTML, JSON
    object, GNU/Apache licence, RTF, plain text. The matching cleaner is
    applied, then URLs and decoration characters are removed.

    Args:
        text: Raw licence file content as a string.

    Returns:
        The cleaned text. An empty string is returned when the input is empty,
        when a PHP file has no recognisable header comment, or when the
        cleaned text contains non-ASCII characters both overall and in the
        four words preceding its final word.
    """
    text = text.strip()
    if not text:
        # Guard the text[0] / text[-1] look-ups below.
        return ""

    # Parse the text itself; the original opened an undefined name `file`
    # here, which raised NameError for every JSON input.
    json_payload = _parse_json_object(text)

    if text.startswith('<?php'):
        try:
            t = php_cleaner(text)
        except IndexError:
            # php_cleaner found no header comment to extract.
            return ""
    elif "</html>" in text:
        t = html_cleaner(text)
    elif json_payload is not None:
        t = json_cleaner(json_payload)
    elif "GNU" in text or "Apache" in text:
        t = gnu_cleaner(text)
    elif "\\rtf" in text:
        t = rtf_cleaner(text)
    else:
        t = text

    t = url_cleaner(t)
    t = character_cleaner(t)

    # Text with non-ASCII characters is still accepted when the words near
    # its end (the four before the last word) are ASCII-only.
    if not isEnglish(t) and not isEnglish(' '.join(t.split()[-5:-1])):
        return ""
    return t
|