# Author: Nihal D'Souza
# Adding cleaning function (commit cedd239)
import re
import os
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
import json
import nltk as nltk
def php_cleaner(text):
    """Extract the text of the leading /** ... */ doc-comment of a PHP file.

    Expects the file to start with "<?php" followed by a blank line and a
    doc-comment; returns the comment body between the opening " *" and the
    closing " */".  Raises IndexError when no such block exists (the caller,
    clean_license_text, catches this and returns "").

    Bug fix: the original pattern used raw-string "\\n", which the regex
    engine treats as a literal backslash followed by 'n' — it could never
    match files containing real newline characters.  re.DOTALL is needed so
    ".*" can span the multi-line comment body.
    """
    return re.findall(
        r"(?<=<\?php\n\n/\*\*\n \*).*(?=\n \*/)", text, flags=re.DOTALL
    )[0]
def html_cleaner(text):
    """Return the visible text of the <body> of an HTML document.

    Raises AttributeError when the document has no <body> element
    (soup.body is None) — unhandled upstream.
    """
    # Name the parser explicitly: BeautifulSoup(text) alone emits
    # GuessedAtParserWarning and its output can vary with whichever
    # parser libraries happen to be installed.
    soup = BeautifulSoup(text, "html.parser")
    return soup.body.text
def json_cleaner(text):
    """Serialize the 'description' and 'license' entries of a parsed JSON
    mapping as consecutive "key: value, " fragments, in mapping order.

    Keys other than 'description' and 'license' are skipped.  Returns ""
    when neither key is present.  Note the trailing ", " when any key
    matches — preserved deliberately.
    """
    wanted = ('description', 'license')
    fragments = [key + ": " + str(text[key]) + ", "
                 for key in text if key in wanted]
    return "".join(fragments)
def gnu_cleaner(text):
    """Trim a GNU/Apache-style license down to its operative text.

    Everything from 'END OF TERMS AND CONDITIONS' onward is dropped.  When
    'Preamble' appears in the text, the section between 'Preamble' and the
    "distribution and ... modification follow" lead-in is cut out; three
    whitespace variants of that lead-in are tried in order.  A long
    (>100 chars) prefix before 'Preamble' is kept and prepended to the
    result.  Raises IndexError when the expected markers are absent.
    """
    head = text.split('END OF TERMS AND CONDITIONS')[0]
    if 'Preamble' not in text:
        return head

    parts = head.split('Preamble')
    lead_ins = ('distribution and\nmodification follow',
                'distribution and\n\nmodification follow',
                'distribution and modification follow')

    if len(parts[0]) > 100:
        # Keep the long pre-Preamble prefix; try each lead-in spacing variant.
        for sep in lead_ins[:-1]:
            chunks = parts[1].split(sep)
            if len(chunks) > 1:
                return parts[0] + chunks[1]
        # Final variant is unguarded, mirroring the original fall-through:
        # a miss raises IndexError.
        return parts[0] + parts[1].split(lead_ins[-1])[1]

    # Short prefix: discard it and return only the post-lead-in text.
    return parts[1].split(lead_ins[0])[1]
def rtf_cleaner(text):
    # Thin wrapper: convert RTF markup to plain text via striprtf.
    return rtf_to_text(text)
def character_cleaner(text):
    """Remove decoration characters (=, *, -, /, MIDDLE DOT) and newlines.

    Bug fix: the original class "[=*-/·\\n]" contained the unescaped range
    *-/ (codepoints 42-47), which also silently deleted '+', ',' and '.'
    from the license text; the dash is now escaped so only the five listed
    characters and newlines are stripped.
    """
    return re.sub(r"[=*\-/·\n]+", "", text)
def url_cleaner(text):
    """Delete every run of non-whitespace characters beginning with 'http'
    (covers http:// and https:// URLs)."""
    pattern = re.compile(r'http\S+')
    return pattern.sub('', text)
def isEnglish(s):
    """Return True iff every character of s is plain ASCII.

    (Heuristic for 'English': a UTF-8 round-trip through the ascii codec
    fails exactly when a non-ASCII character is present.)
    """
    try:
        s.encode("utf-8").decode("ascii")
        return True
    except UnicodeDecodeError:
        return False
# input as a text
def clean_license_text(text):
    """Clean a raw license text into plain English prose.

    Dispatches on the apparent format of *text* (PHP source, HTML, JSON,
    GNU/Apache license, RTF, or plain text), then strips URLs and
    decoration characters.  Returns "" for empty input, for PHP files
    without an extractable doc-comment, and for text that does not look
    English (checked on the whole text and on its last few words).
    """
    text = text.strip()
    if not text:
        # Guard: text[0] / text[-1] below would raise IndexError on "".
        return ""
    if text[:5] == '<?php':
        try:
            t = php_cleaner(text)
        except Exception:
            # No recognizable doc-comment block — nothing to extract.
            return ""
    elif "</html>" in text:
        t = html_cleaner(text)
    elif text[0] == '{' and text[-1] == '}':
        # Bug fix: the original did `open(file, ...)` where `file` was an
        # undefined name (NameError on every JSON input).  The JSON document
        # is the text itself, so parse it directly.
        t = json_cleaner(json.loads(text))
    elif "GNU" in text or "Apache" in text:
        t = gnu_cleaner(text)
    elif "\\rtf" in text:
        t = rtf_cleaner(text)
    else:
        t = text
    t = url_cleaner(t)
    t = character_cleaner(t)
    if not isEnglish(t):
        # Second chance: accept the text if its last few words are ASCII.
        if not isEnglish(' '.join(t.split()[-5:-1])):
            return ""
    return t