Spaces:
Runtime error
Runtime error
import re | |
import os | |
from bs4 import BeautifulSoup | |
from striprtf.striprtf import rtf_to_text | |
import json | |
import nltk as nltk | |
def php_cleaner(text):
    r"""Extract the content of the doc-block that opens a PHP file.

    The pattern works on text whose newlines are *escaped*, i.e. written as a
    literal backslash followed by ``n``.  It returns whatever sits between
    the ``<?php\n\n/**\n *`` opener and the last ``\n */`` terminator that
    ``.`` can reach (``.`` does not cross real newline characters).

    Args:
        text: PHP source with escaped newlines.

    Returns:
        The matched comment content, including its leading space.

    Raises:
        IndexError: if the doc-block pattern is not found in ``text``.
    """
    doc_block_pattern = r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)"
    matches = re.findall(doc_block_pattern, text)
    # Indexing (rather than testing for emptiness) is deliberate: callers
    # rely on the IndexError to detect "no doc-block found".
    return matches[0]
def html_cleaner(text):
    """Return the text content of an HTML document.

    Args:
        text: HTML markup as a string.

    Returns:
        The text inside ``<body>`` when the document has one, otherwise the
        text of the whole parsed document.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between environments.
    soup = BeautifulSoup(text, "html.parser")
    body = soup.body
    if body is None:
        # html.parser does not synthesize a <body>; fall back to the full
        # document instead of failing on ``None.text``.
        return soup.get_text()
    return body.text
def json_cleaner(text):
    """Render the ``description`` and ``license`` entries as one string.

    Despite its name, ``text`` is iterated and indexed by key, i.e. it is
    used as an already-parsed mapping rather than a raw string.

    Args:
        text: mapping (e.g. a decoded JSON object) to read entries from.

    Returns:
        ``"<key>: <value>, "`` for each matching key, concatenated in the
        mapping's iteration order; ``""`` when neither key is present.
    """
    wanted = ('description', 'license')
    fragments = [
        key + ": " + str(text[key]) + ", "
        for key in text
        if key in wanted
    ]
    return "".join(fragments)
def gnu_cleaner(text):
    """Strip the preamble and the trailing appendix from a GNU-style license.

    Everything from ``END OF TERMS AND CONDITIONS`` onwards is dropped.  If
    the remaining text has a ``Preamble`` section, the preamble (up to and
    including the sentence ending in "distribution and modification follow")
    is removed as well.  The text in front of ``Preamble`` is kept only when
    it is longer than 100 characters.

    Args:
        text: full license text.

    Returns:
        The trimmed license text.  Input that lacks the expected markers is
        returned with as much trimming as could be applied; no exception is
        raised for missing markers.
    """
    end_marker = 'END OF TERMS AND CONDITIONS'
    # The sentence closing the preamble is line-wrapped differently between
    # copies of the license; try the variants in this fixed order.
    preamble_endings = (
        'distribution and\nmodification follow',
        'distribution and\n\nmodification follow',
        'distribution and modification follow',
    )

    terms = text.split(end_marker)[0]
    sections = terms.split('Preamble')
    if len(sections) < 2:
        # No preamble before the end marker (it may only occur in the
        # appendix that was just cut off): nothing more to strip.
        return terms

    header, after_preamble = sections[0], sections[1]

    # Default to the untrimmed section so an unknown wrapping of the closing
    # sentence degrades gracefully instead of raising IndexError.
    body = after_preamble
    for ending in preamble_endings:
        pieces = after_preamble.split(ending)
        if len(pieces) > 1:
            body = pieces[1]
            break

    if len(header) > 100:
        return header + body
    return body
def rtf_cleaner(text):
    """Convert RTF markup to plain text using striprtf's ``rtf_to_text``.

    Args:
        text: RTF document as a string.

    Returns:
        Whatever ``rtf_to_text`` produces for ``text``.
    """
    plain_text = rtf_to_text(text)
    return plain_text
def character_cleaner(text):
    """Remove decoration characters and newlines from ``text``.

    Deletes every ``=``, ``*``, ``-``, ``/``, ``·`` and newline character.

    Args:
        text: string to clean.

    Returns:
        ``text`` with those characters removed (nothing is inserted in their
        place, so words separated only by a newline are joined).
    """
    # The hyphen must be escaped: written as ``*-/`` it forms a range that
    # also matches "+", "," and ".", which stripped ordinary punctuation
    # (e.g. "2.0," became "20").
    return re.sub(r"[=*\-/·\n]+", "", text)
def url_cleaner(text):
    """Delete URL-like tokens from ``text``.

    A token is ``http`` followed by at least one non-whitespace character;
    the whole run up to the next whitespace is removed.

    Args:
        text: string to clean.

    Returns:
        ``text`` with every such token replaced by the empty string.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
def isEnglish(s):
    """Report whether ``s`` consists solely of ASCII characters.

    Note that this is an ASCII check, not real language detection: the string
    is encoded as UTF-8 and the bytes are then decoded as ASCII.

    Args:
        s: string to test.

    Returns:
        ``True`` if the ASCII decode succeeds, ``False`` if it raises
        ``UnicodeDecodeError``.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
def clean_license_text(text):
    """Normalize raw license text taken from one of several file formats.

    The input is the file *content* (a string), not a path.  After stripping
    surrounding whitespace, the format is detected in this order and handed
    to the matching cleaner: PHP (starts with ``<?php``), HTML (contains
    ``</html>``), JSON (wrapped in ``{`` ... ``}`` and parseable), GNU/Apache
    (mentions ``GNU`` or ``Apache``), RTF (contains ``\\rtf``); anything else
    is used as is.  URLs and decoration characters are then removed.

    Args:
        text: raw content of a license file.

    Returns:
        The cleaned text, or ``""`` when the input is empty, when a PHP file
        has no recognizable doc-block, or when both the cleaned text and the
        four words preceding its last word contain non-ASCII characters.
    """
    text = text.strip()
    if not text:
        # Nothing to clean; also protects the text[0] / text[-1] checks.
        return ""

    t = None
    if text[:5] == '<?php':
        try:
            t = php_cleaner(text)
        except IndexError:
            # php_cleaner signals "no doc-block found" with IndexError.
            return ""
    elif "</html>" in text:
        t = html_cleaner(text)
    elif text[0] == '{' and text[-1] == '}':
        try:
            # Parse the content itself; there is no file handle in scope.
            t = json_cleaner(json.loads(text))
        except ValueError:
            # Brace-delimited but not JSON (RTF documents also start with
            # "{" and end with "}"): fall through to the remaining formats.
            t = None

    if t is None:
        if "GNU" in text or "Apache" in text:
            t = gnu_cleaner(text)
        elif "\\rtf" in text:
            t = rtf_cleaner(text)
        else:
            t = text

    t = url_cleaner(t)
    t = character_cleaner(t)

    # Reject text that is non-ASCII overall and whose tail (the four words
    # before the last one) is non-ASCII too.
    if not isEnglish(t) and not isEnglish(' '.join(t.split()[-5:-1])):
        return ""
    return t