Spaces:
Runtime error
Runtime error
File size: 4,259 Bytes
cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced cedd239 a804ced |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import re
import json
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
# Placeholder that clean_license_text joins between paragraphs so paragraph
# boundaries survive character_cleaner (which collapses newlines); it is
# turned back into "\n\n" once character cleaning is done.
PARA_BREAK = "para___break"
def php_cleaner(text):
    """Extract the first ``/* ... */`` block comment from PHP source.

    Args:
        text: Contents of a PHP file.

    Returns:
        The first block comment, delimiters included, or ``""`` when the
        text contains no complete block comment.
    """
    # Raw string: the previous non-raw pattern relied on invalid string
    # escapes ("\/", "\*"), which recent Python versions warn about.
    # The match is non-greedy so only the first comment is captured.
    match = re.search(r"/\*[\S\s]*?\*/", text)
    if match is None:
        return ""
    return match.group(0)
def html_cleaner(text):
    """Return the text content of an HTML document.

    Args:
        text: HTML markup.

    Returns:
        The text of the ``<body>`` element. When the parsed document has no
        ``<body>``, the text of the whole document is returned instead.
        ``""`` when there is no text at all.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(text, "html.parser")
    # html.parser does not synthesise a <body>, so soup.body may be None;
    # fall back to the document root instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    return root.get_text() or ""
def json_cleaner(text_dict):
    """Render the licence-related entries of a parsed JSON object as text.

    Only the "description" and "license" entries are kept, in the order in
    which they occur in ``text_dict``. Each one becomes ``"<key>: <value>, "``
    and the pieces are concatenated, so a non-empty result ends with ", ".
    """
    wanted = ("description", "license")
    pieces = [
        name + ": " + str(value) + ", "
        for name, value in text_dict.items()
        if name in wanted
    ]
    return "".join(pieces)
def discard_text_after_tnc(text):
    """Return the part of ``text`` before "END OF TERMS AND CONDITIONS".

    The text is returned unchanged when the marker does not occur.
    """
    head, _marker, _tail = text.partition("END OF TERMS AND CONDITIONS")
    return head
def _strip_preamble(body):
    """Return ``body`` without its "Preamble" section.

    The preamble is taken to run from the word "Preamble" up to and including
    the phrase "distribution and modification follow" (three whitespace
    variants are tried in order). ``body`` is returned unchanged when either
    the heading or the closing phrase cannot be found.
    """
    sections = body.split("Preamble")
    if len(sections) < 2:
        return body
    header, after_heading = sections[0], sections[1]
    closing_phrases = (
        "distribution and\nmodification follow",
        "distribution and\n\nmodification follow",
        "distribution and modification follow",
    )
    for phrase in closing_phrases:
        pieces = after_heading.split(phrase)
        if len(pieces) < 2:
            continue
        remainder = pieces[1]
        # Text before the heading is kept only when it is longer than 100
        # characters; a shorter header is discarded together with the preamble.
        if len(header) > 100:
            return header + remainder
        return remainder
    return body


def _split_definitions(body):
    """Cut the numbered "Definitions" section out of ``body``.

    The section starts at a heading such as "1. Definitions" or
    "0. Additional Definitions" and ends just before the next numbered,
    capitalised heading.

    Returns:
        ``(body_without_section, section)``, or ``(body, "")`` when the
        heading or the following heading cannot be found.
    """
    heading = re.search(r"[0-9]\.? (Additional )?Definitions", body)
    if heading is None:
        return body, ""
    following = re.search(r"[0-9]\.? [A-Z][a-z]+", body[heading.end():])
    if following is None:
        return body, ""
    section_end = heading.end() + following.start()
    section = body[heading.start():section_end]
    return body[:heading.start()] + body[section_end:], section


def gnu_cleaner(text):
    """Trim boilerplate from a GNU/Apache style licence text.

    Everything from "END OF TERMS AND CONDITIONS" onwards is dropped, the
    "Preamble" section is removed and the numbered "Definitions" section is
    separated from the rest. Each step is best-effort: when its landmarks are
    not found the text is left as it is rather than raising.

    Args:
        text: Full licence text.

    Returns:
        A ``(cleaned_text, definitions)`` tuple; ``definitions`` is ``""``
        when no definitions section was extracted.
    """
    body = text.split("END OF TERMS AND CONDITIONS")[0]
    body = _strip_preamble(body)
    definitions = ""
    if "Definitions" in text:
        body, definitions = _split_definitions(body)
    return body, definitions
def rtf_cleaner(text):
    """Convert RTF markup to plain text by delegating to ``rtf_to_text``."""
    plain_text = rtf_to_text(text)
    return plain_text
def url_cleaner(text):
    """Delete every "http" that is followed by non-whitespace characters,
    together with those characters (i.e. up to the next whitespace)."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)
def email_cleaner(text):
    """Delete every "@" along with the non-whitespace characters touching it
    on either side."""
    address_pattern = re.compile(r"\S*@\S*")
    return address_pattern.sub("", text)
def var_cleaner(text):
    """Strip template placeholders from ``text``.

    Two patterns are removed, in this order: ``$name`` style variables, then
    brace-delimited placeholders such as ``{owner}`` or ``{{ name }}``.
    """
    placeholder_patterns = (
        r"\$\w+",
        r"{[{}()\w\s._,\[\]'\"]+}",
    )
    for pattern in placeholder_patterns:
        text = re.sub(pattern, "", text)
    return text
def character_cleaner(text):
    """Strip URLs, e-mail addresses, placeholders and markup-like characters.

    After ``url_cleaner``, ``email_cleaner`` and ``var_cleaner`` have run:

    1. runs of two or more newlines become ". ";
    2. runs of ``: % # < > = * - / · { }`` and whitespace become one space;
    3. runs of two or more dots/spaces become ". ".

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    text = url_cleaner(text)
    text = email_cleaner(text)
    text = var_cleaner(text)
    # Raw strings keep the regex text identical while avoiding the invalid
    # string escapes ("\-", "\s", "\.") the non-raw literals contained.
    text = re.sub(r"[\n]{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    text = re.sub(r"[\. ]{2,}", ". ", text)
    return text
def isEnglish(s):
    """Return True when ``s`` consists solely of ASCII characters.

    Despite the name this is a character-set check, not language detection.

    Args:
        s: String to test.

    Returns:
        ``True`` for pure-ASCII (including empty) strings, ``False`` otherwise.
    """
    try:
        # Encoding straight to ASCII also rejects lone surrogates, which the
        # former utf-8 encode/ascii decode round trip let escape as an
        # uncaught UnicodeEncodeError.
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True
def preprocess_text(text):
    """Run ``gnu_cleaner`` on texts that mention "GNU" or "Apache".

    Returns a ``(text, definitions)`` tuple. Texts mentioning neither word are
    passed through untouched with ``definitions`` set to ``""``; otherwise the
    definitions returned by ``gnu_cleaner`` are whitespace-stripped.
    """
    if "GNU" not in text and "Apache" not in text:
        return text, ""
    cleaned, definitions = gnu_cleaner(text)
    return cleaned, definitions.strip()
def _load_json_object(text):
    """Return ``text`` parsed as a JSON object, or None if it is not one."""
    if not (text.startswith("{") and text.endswith("}")):
        return None
    try:
        parsed = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return parsed if isinstance(parsed, dict) else None


def script_cleaner(text):
    """Pull the human-readable text out of PHP, HTML, JSON or RTF content.

    The format is guessed from the content and handed to the matching
    cleaner: "<?php" -> ``php_cleaner``, "</html>" -> ``html_cleaner``,
    a JSON object -> ``json_cleaner``, "\\rtf" -> ``rtf_cleaner``. Text that
    matches none of these is returned unchanged.

    Args:
        text: Raw file contents.

    Returns:
        The extracted text, or ``""`` when the input or the result is empty.
    """
    if not text:
        return ""
    if "<?php" in text:
        text = php_cleaner(text)
    elif "</html>" in text:
        text = html_cleaner(text)
    else:
        json_object = _load_json_object(text)
        if json_object is not None:
            text = json_cleaner(json_object)
        elif "\\rtf" in text:
            # RTF documents are themselves wrapped in braces, so this has to
            # be reachable after a failed JSON parse instead of the parse
            # error aborting the whole call.
            text = rtf_cleaner(text)
    return text or ""
def split_paras(text):
    """Split ``text`` into paragraphs on its widest blank-line separator.

    Separators of four, three and two consecutive newlines are tried in that
    order and the first one present in ``text`` is used for the split. Text
    containing none of them is returned as a single-element list.
    """
    for separator in ("\n\n\n\n", "\n\n\n", "\n\n"):
        if separator in text:
            return text.split(separator)
    return [text]
def clean_paras(paras):
    """Return ``paras`` as received.

    Currently a pass-through: no per-paragraph cleaning is applied.
    """
    return paras
def clean_license_text(text):
    """Clean a raw licence file into plain paragraphs plus its definitions.

    Steps: extract text from script/markup formats (``script_cleaner``),
    trim GNU/Apache boilerplate (``preprocess_text``), split into paragraphs,
    clean characters with paragraph boundaries protected by ``PARA_BREAK``,
    then restore the boundaries as blank lines.

    Args:
        text: Raw licence text.

    Returns:
        A ``(cleaned_text, definitions)`` tuple. ``("", "")`` is returned for
        empty input, and also when the cleaned text contains non-ASCII
        characters and so do the four words preceding its last word.
    """
    if not text:
        # Same 2-tuple shape as every other exit so callers can always unpack.
        return "", ""
    text = script_cleaner(text)
    text, definitions = preprocess_text(text)
    paras = clean_paras(split_paras(text))
    text = PARA_BREAK.join(paras)
    text = character_cleaner(text)
    # PARA_BREAK is a literal marker, so a plain replace is enough.
    text = text.replace(PARA_BREAK, "\n\n")
    text = text.strip()
    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
        return "", ""
    return text, definitions
"""
Notes:
1. Regex for other definitions: --------> ".{0,20}".{0,40}means
2. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
-> Merge with the next para
Ex. "8. Termination."
""" |