Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

App Files Files Community

Nihal D'Souza committed on May 21, 2022

Commit

cedd239

1 Parent(s): fc6772f

Adding cleaning function

Browse files

Files changed (2) hide show

requirements.txt +1 -0
src/clean.py +80 -0

requirements.txt CHANGED Viewed

@@ -95,6 +95,7 @@ smmap==5.0.0
 soupsieve==2.3.2.post1
 stack-data==0.2.0
 streamlit==1.9.0
 terminado==0.15.0
 threadpoolctl==3.1.0
 tinycss2==1.1.1

 soupsieve==2.3.2.post1
 stack-data==0.2.0
 streamlit==1.9.0
+striprtf==0.0.20
 terminado==0.15.0
 threadpoolctl==3.1.0
 tinycss2==1.1.1

src/clean.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import re
+import os
+from bs4 import BeautifulSoup
+from striprtf.striprtf import rtf_to_text
+import json
+import nltk as nltk
+def php_cleaner(text):
+    r"""Return the first doc-comment body that follows a PHP opening tag.
+
+    The pattern captures whatever sits between ``<?php\n\n/**\n *`` and
+    ``\n */``. Each ``\n`` here is a literal backslash followed by ``n``
+    (an escaped newline in the input), not a real line break.
+
+    Raises:
+        IndexError: if ``text`` contains no such header.
+    """
+    header_pattern = r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)"
+    matches = re.findall(header_pattern, text)
+    # Only the first match is of interest; an empty list raises IndexError.
+    return matches[0]
+def html_cleaner(text):
+    """Return the human-readable text of an HTML document.
+
+    Args:
+        text: HTML markup as a string.
+
+    Returns:
+        The text inside ``<body>`` when the document has one, otherwise the
+        text of the whole parsed document.
+    """
+    # Name the parser explicitly: without it Beautiful Soup picks whichever
+    # parser happens to be installed (and warns), so output can differ
+    # between machines.
+    soup = BeautifulSoup(text, "html.parser")
+    # html.parser does not invent a <body>; fall back to the whole tree
+    # instead of failing with AttributeError on None.
+    root = soup.body if soup.body is not None else soup
+    return root.get_text()
+def json_cleaner(text):
+    """Render the 'description' and 'license' entries of a mapping as text.
+
+    Args:
+        text: an already-parsed mapping (despite the name, not a string).
+
+    Returns:
+        One ``"<key>: <value>, "`` fragment per matching key, concatenated
+        in the mapping's iteration order; ``""`` when neither key exists.
+    """
+    wanted = ('description', 'license')
+    fragments = [
+        field + ": " + str(text[field]) + ", "
+        for field in text
+        if field in wanted
+    ]
+    return "".join(fragments)
+def gnu_cleaner(text):
+    """Trim the appendix and the preamble from a GNU/Apache style license.
+
+    Everything from ``END OF TERMS AND CONDITIONS`` onwards is dropped.
+    If the remainder has a ``Preamble`` section, the text between
+    ``Preamble`` and the sentence ending in "distribution and modification
+    follow" is removed as well. Any text before ``Preamble`` is kept only
+    when it is longer than 100 characters.
+
+    Args:
+        text: full license text.
+
+    Returns:
+        The trimmed license text. If the "modification follow" marker is
+        not found after the preamble heading, the text is returned with
+        only the appendix removed.
+    """
+    t = text.split('END OF TERMS AND CONDITIONS')[0]
+    # Test the truncated text, not the original: a 'Preamble' that occurs
+    # only in the discarded appendix must not trigger preamble handling.
+    if 'Preamble' not in t:
+        return t
+    header, after_preamble = t.split('Preamble')[:2]
+    # One whitespace-tolerant pattern covers the single-newline,
+    # blank-line and same-line spellings of the marker sentence.
+    pieces = re.split(r'distribution and\s+modification follow', after_preamble)
+    if len(pieces) < 2:
+        # Marker sentence missing: keep the text rather than raising.
+        return t
+    body = pieces[1]
+    # A long header is real content (e.g. a title and copyright notice).
+    if len(header) > 100:
+        return header + body
+    return body
+def rtf_cleaner(text):
+    """Convert RTF markup to plain text using striprtf's ``rtf_to_text``."""
+    plain = rtf_to_text(text)
+    return plain
+def character_cleaner(text):
+    """Remove decorative characters (``= * - / ·``) and newlines from text.
+
+    Args:
+        text: string to clean.
+
+    Returns:
+        ``text`` with every run of the listed characters deleted.
+    """
+    # The hyphen is escaped on purpose: an unescaped "*-/" is a character
+    # range spanning * + , - . / and would also delete commas, periods and
+    # plus signs from the license prose.
+    # NOTE(review): newlines are deleted rather than replaced by a space,
+    # so words on adjacent lines are joined; confirm this is intended.
+    return re.sub(r"[=*\-/·\n]+", "", text)
+def url_cleaner(text):
+    """Delete every run of non-whitespace characters starting with ``http``."""
+    url_pattern = re.compile(r'http\S+')
+    return url_pattern.sub('', text)
+def isEnglish(s):
+    """Report whether ``s`` consists solely of ASCII characters.
+
+    The string is encoded as UTF-8 and the bytes are decoded as ASCII;
+    a decoding failure means a non-ASCII character is present.
+    """
+    utf8_bytes = s.encode(encoding='utf-8')
+    try:
+        utf8_bytes.decode('ascii')
+        return True
+    except UnicodeDecodeError:
+        return False
+# input as a text
+def clean_license_text(text):
+    """Detect the format of a license file's contents and return cleaned text.
+
+    The format is chosen in this order: PHP header, HTML, JSON object,
+    GNU/Apache license, RTF, plain text. The extracted text then has URLs
+    and decorative characters removed.
+
+    Args:
+        text: raw contents of the license file, as a string.
+
+    Returns:
+        The cleaned text, or ``""`` when the input is empty, when a PHP
+        file has no recognisable header comment, or when the result does
+        not look like ASCII/English text.
+    """
+    text = text.strip()
+    if not text:
+        # Nothing to clean; also avoids indexing into an empty string.
+        return ""
+    # Parse JSON from the text itself (there is no file handle here).
+    # RTF documents are brace-delimited too, so a failed parse simply
+    # means "not JSON" and lets the later branches run.
+    parsed = None
+    if text.startswith('{') and text.endswith('}'):
+        try:
+            parsed = json.loads(text)
+        except ValueError:
+            parsed = None
+    if text.startswith('<?php'):
+        try:
+            t = php_cleaner(text)
+        except IndexError:
+            # php_cleaner found no header comment to extract.
+            return ""
+    elif "</html>" in text:
+        t = html_cleaner(text)
+    elif parsed is not None:
+        t = json_cleaner(parsed)
+    elif "GNU" in text or "Apache" in text:
+        t = gnu_cleaner(text)
+    elif "\\rtf" in text:
+        t = rtf_cleaner(text)
+    else:
+        t = text
+    t = url_cleaner(t)
+    t = character_cleaner(t)
+    # Reject the text only if both the whole string and a sample of words
+    # near the end (the four words before the last one) are non-ASCII.
+    if not isEnglish(t) and not isEnglish(' '.join(t.split()[-5:-1])):
+        return ""
+    return t