# Author: Nihal D'Souza
# Adding cleaning function (commit cedd239)
import re
import os
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
import json
import nltk as nltk
def php_cleaner(text):
    """Extract the text of the leading /** ... */ doc-comment of a PHP file.

    Expects the file to start with "<?php" followed by a blank line and a
    doc-comment; returns the comment body between the opening " *" and the
    closing " */".  Raises IndexError when no such block exists (the caller,
    clean_license_text, catches this and returns "").

    Bug fix: the original pattern used raw-string "\\n", which the regex
    engine treats as a literal backslash followed by 'n' — it could never
    match files containing real newline characters.  re.DOTALL is needed so
    ".*" can span the multi-line comment body.
    """
    return re.findall(
        r"(?<=<\?php\n\n/\*\*\n \*).*(?=\n \*/)", text, flags=re.DOTALL
    )[0]
def html_cleaner(text):
    """Return the visible text of the <body> of an HTML document.

    Raises AttributeError when the document has no <body> element
    (soup.body is None) — unhandled upstream.
    """
    # Name the parser explicitly: BeautifulSoup(text) alone emits
    # GuessedAtParserWarning and its output can vary with whichever
    # parser libraries happen to be installed.
    soup = BeautifulSoup(text, "html.parser")
    return soup.body.text
def json_cleaner(text):
    """Serialize the 'description' and 'license' entries of a parsed JSON
    mapping as consecutive "key: value, " fragments, in mapping order.

    Keys other than 'description' and 'license' are skipped.  Returns ""
    when neither key is present.  Note the trailing ", " when any key
    matches — preserved deliberately.
    """
    wanted = ('description', 'license')
    fragments = [key + ": " + str(text[key]) + ", "
                 for key in text if key in wanted]
    return "".join(fragments)
def gnu_cleaner(text):
    """Trim a GNU/Apache-style license down to its operative text.

    Everything from 'END OF TERMS AND CONDITIONS' onward is dropped.  When
    'Preamble' appears in the text, the section between 'Preamble' and the
    "distribution and ... modification follow" lead-in is cut out; three
    whitespace variants of that lead-in are tried in order.  A long
    (>100 chars) prefix before 'Preamble' is kept and prepended to the
    result.  Raises IndexError when the expected markers are absent.
    """
    head = text.split('END OF TERMS AND CONDITIONS')[0]
    if 'Preamble' not in text:
        return head

    parts = head.split('Preamble')
    lead_ins = ('distribution and\nmodification follow',
                'distribution and\n\nmodification follow',
                'distribution and modification follow')

    if len(parts[0]) > 100:
        # Keep the long pre-Preamble prefix; try each lead-in spacing variant.
        for sep in lead_ins[:-1]:
            chunks = parts[1].split(sep)
            if len(chunks) > 1:
                return parts[0] + chunks[1]
        # Final variant is unguarded, mirroring the original fall-through:
        # a miss raises IndexError.
        return parts[0] + parts[1].split(lead_ins[-1])[1]

    # Short prefix: discard it and return only the post-lead-in text.
    return parts[1].split(lead_ins[0])[1]
def rtf_cleaner(text):
    # Thin wrapper: convert RTF markup to plain text via striprtf.
    return rtf_to_text(text)
def character_cleaner(text):
    """Remove decoration characters (=, *, -, /, MIDDLE DOT) and newlines.

    Bug fix: the original class "[=*-/·\\n]" contained the unescaped range
    *-/ (codepoints 42-47), which also silently deleted '+', ',' and '.'
    from the license text; the dash is now escaped so only the five listed
    characters and newlines are stripped.
    """
    return re.sub(r"[=*\-/·\n]+", "", text)
def url_cleaner(text):
    """Delete every run of non-whitespace characters beginning with 'http'
    (covers http:// and https:// URLs)."""
    pattern = re.compile(r'http\S+')
    return pattern.sub('', text)
def isEnglish(s):
    """Return True iff every character of s is plain ASCII.

    (Heuristic for 'English': a UTF-8 round-trip through the ascii codec
    fails exactly when a non-ASCII character is present.)
    """
    try:
        s.encode("utf-8").decode("ascii")
        return True
    except UnicodeDecodeError:
        return False
# input as a text
def clean_license_text(text):
    """Clean a raw license text into plain English prose.

    Dispatches on the apparent format of *text* (PHP source, HTML, JSON,
    GNU/Apache license, RTF, or plain text), then strips URLs and
    decoration characters.  Returns "" for empty input, for PHP files
    without an extractable doc-comment, and for text that does not look
    English (checked on the whole text and on its last few words).
    """
    text = text.strip()
    if not text:
        # Guard: text[0] / text[-1] below would raise IndexError on "".
        return ""
    if text[:5] == '<?php':
        try:
            t = php_cleaner(text)
        except Exception:
            # No recognizable doc-comment block — nothing to extract.
            return ""
    elif "</html>" in text:
        t = html_cleaner(text)
    elif text[0] == '{' and text[-1] == '}':
        # Bug fix: the original did `open(file, ...)` where `file` was an
        # undefined name (NameError on every JSON input).  The JSON document
        # is the text itself, so parse it directly.
        t = json_cleaner(json.loads(text))
    elif "GNU" in text or "Apache" in text:
        t = gnu_cleaner(text)
    elif "\\rtf" in text:
        t = rtf_cleaner(text)
    else:
        t = text
    t = url_cleaner(t)
    t = character_cleaner(t)
    if not isEnglish(t):
        # Second chance: accept the text if its last few words are ASCII.
        if not isEnglish(' '.join(t.split()[-5:-1])):
            return ""
    return t