import re
import os
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
import json
import nltk as nltk


def php_cleaner(text):
    """Extract the license header comment from a PHP source file.

    Returns the text sitting between the ``<?php`` doc-comment markers,
    or "" when the pattern does not match.

    NOTE(review): the raw-string pattern matches a literal backslash
    followed by 'n' (``\\n``), not real newlines — confirm this is the
    intended input format (e.g. JSON-escaped source) before changing it.
    """
    matches = re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)
    # BUG FIX: the original indexed [0] unconditionally, raising
    # IndexError whenever the header pattern was absent.
    return matches[0] if matches else ""


def html_cleaner(text):
    """Strip HTML markup and return the document body's visible text.

    Raises AttributeError if the document has no <body> element
    (original behavior, preserved).
    """
    # BUG FIX: name the parser explicitly; bare BeautifulSoup(text) emits
    # a warning and picks whichever parser is installed, which can change
    # the extracted text between environments.
    soup = BeautifulSoup(text, "html.parser")
    return soup.body.text


def json_cleaner(text):
    """Flatten the 'description' and 'license' fields of a parsed JSON dict.

    Parameters
    ----------
    text : dict
        An already-parsed JSON object (despite the name).

    Returns
    -------
    str
        ``"key: value, "`` pairs concatenated for the two fields of
        interest, in the dict's iteration order.
    """
    out = ""
    for keys in text:
        if keys in ('description', 'license'):
            out += keys
            out += ": "
            out += str(text[keys])
            out += ", "
    return out


def gnu_cleaner(text):
    """Trim boilerplate from GNU/Apache-style license text.

    Drops everything after 'END OF TERMS AND CONDITIONS' and, when a
    'Preamble' section is present, removes the preamble body while
    keeping any substantial (>100 chars) text that precedes it.
    """
    t = text.split('END OF TERMS AND CONDITIONS')[0]
    if 'Preamble' in text:
        if len(t.split('Preamble')[0]) > 100:
            t0 = t.split('Preamble')[0]
            # The phrase ending the preamble varies in line wrapping, so
            # try the known variants in order.
            try:
                t1 = t.split('Preamble')[1].split(
                    'distribution and\nmodification follow')[1]
            # BUG FIX: the original used bare `except:`, which would also
            # swallow KeyboardInterrupt/SystemExit; only the failed split
            # indexing (IndexError) is expected here.
            except IndexError:
                try:
                    t1 = t.split('Preamble')[1].split(
                        'distribution and\n\nmodification follow')[1]
                except IndexError:
                    t1 = t.split('Preamble')[1].split(
                        'distribution and modification follow')[1]
            return t0 + t1
        else:
            return t.split('Preamble')[1].split(
                'distribution and\nmodification follow')[1]
    else:
        return t


def rtf_cleaner(text):
    """Convert RTF markup to plain text."""
    return rtf_to_text(text)


def character_cleaner(text):
    """Remove decorative characters (=, *, -, /, middle dot) and newlines.

    BUG FIX: the original class ``[=*-/ยท\\n]`` contained an unescaped
    ``-`` forming the range ``*``–``/`` (silently also deleting ``+ , .``),
    and ``ยท`` was a mojibake mis-decoding of the middle dot U+00B7.
    """
    return re.sub(r"[=*\-/\u00b7\n]+", "", text)


def url_cleaner(text):
    """Strip URLs (any ``http...`` token up to whitespace) from the text."""
    return re.sub(r'http\S+', '', text)


def isEnglish(s):
    """Return True if the string is pure ASCII (used as an English proxy)."""
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True


# input as a text
def clean_license_text(text):
    """Clean a raw license text: dispatch on format, strip URLs and noise.

    Parameters
    ----------
    text : str
        Raw license text (possibly PHP, HTML, JSON, GNU/Apache, or RTF).

    Returns
    -------
    str
        Cleaned text, or "" when the input is empty or the result does
        not look like English (non-ASCII in the trailing words).
    """
    text = text.strip()
    # BUG FIX: guard empty input; text[0] below would raise IndexError.
    if not text:
        return ""
    # NOTE(review): this dispatch was reconstructed from a corrupted
    # source line — the PHP and HTML conditions below are the ones
    # implied by php_cleaner/html_cleaner; confirm against the original.
    if text[:5] == '<?php':
        t = php_cleaner(text)
    elif '</html>' in text:
        t = html_cleaner(text)
    elif text[0] == '{' and text[-1] == '}':
        # BUG FIX: the original did `with open(file, 'r')` with an
        # undefined name `file` (NameError). The JSON payload is already
        # in `text`, so parse it directly.
        t = json_cleaner(json.loads(text))
    elif "GNU" in text or "Apache" in text:
        t = gnu_cleaner(text)
    elif "\\rtf" in text:
        t = rtf_cleaner(text)
    else:
        t = text
    t = url_cleaner(t)
    t = character_cleaner(t)
    # Reject non-English text unless its last few words are ASCII
    # (headers often contain stray non-ASCII decoration).
    if not isEnglish(t):
        if not isEnglish(' '.join(t.split()[-5:-1])):
            return ""
    return t