Spaces:
Runtime error
Runtime error
Nihal D'Souza
committed on
Commit
·
95fa18f
1
Parent(s):
19dab1b
Updating clean function
Browse files - src/clean.py +12 -7
src/clean.py
CHANGED
@@ -236,7 +236,7 @@ def isEnglish(s):
|
|
236 |
return True
|
237 |
|
238 |
|
239 |
-
def split_definitions_exceptions(text, verbosity=0):
|
240 |
"""
|
241 |
Extract definitions from the License text
|
242 |
|
@@ -275,7 +275,7 @@ def split_definitions_exceptions(text, verbosity=0):
|
|
275 |
)
|
276 |
|
277 |
definitions += more_defs.strip()
|
278 |
-
paras, exceptions = get_exeptions(paras, verbosity=verbosity)
|
279 |
|
280 |
return paras, definitions, exceptions
|
281 |
|
@@ -595,7 +595,7 @@ def get_all_caps(text, verbosity=0):
|
|
595 |
return text, all_caps
|
596 |
|
597 |
|
598 |
-
def get_exeptions(paras, verbosity=0):
|
599 |
"""
|
600 |
Extracts a list of exceptions from the License text.
|
601 |
|
@@ -620,6 +620,8 @@ def get_exeptions(paras, verbosity=0):
|
|
620 |
for para in paras:
|
621 |
if re.search("exception", para.lower()):
|
622 |
exceptions.append(para)
|
|
|
|
|
623 |
else:
|
624 |
non_exception_paras.append(para)
|
625 |
|
@@ -697,7 +699,11 @@ def get_most_likely_license_type(text):
|
|
697 |
The type of the most likely license. "Not found" if no license score is
|
698 |
above 0.9
|
699 |
"""
|
700 |
-
|
|
|
|
|
|
|
|
|
701 |
|
702 |
top1_result = inference(text).loc[0, :]
|
703 |
|
@@ -707,7 +713,7 @@ def get_most_likely_license_type(text):
|
|
707 |
return "Not Found"
|
708 |
|
709 |
|
710 |
-
def clean_license_text(text, verbosity=0):
|
711 |
"""
|
712 |
Cleans License text.
|
713 |
|
@@ -736,7 +742,7 @@ def clean_license_text(text, verbosity=0):
|
|
736 |
text = script_cleaner(text)
|
737 |
text = preprocess_text(text)
|
738 |
paras, definitions, exceptions = split_definitions_exceptions(
|
739 |
-
text, verbosity=verbosity
|
740 |
)
|
741 |
text = PARA_BREAK.join(paras)
|
742 |
text = character_cleaner(text)
|
@@ -756,7 +762,6 @@ def clean_license_text(text, verbosity=0):
|
|
756 |
|
757 |
return text, definitions
|
758 |
|
759 |
-
|
760 |
"""
|
761 |
Notes:
|
762 |
|
|
|
236 |
return True
|
237 |
|
238 |
|
239 |
+
def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
|
240 |
"""
|
241 |
Extract definitions from the License text
|
242 |
|
|
|
275 |
)
|
276 |
|
277 |
definitions += more_defs.strip()
|
278 |
+
paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)
|
279 |
|
280 |
return paras, definitions, exceptions
|
281 |
|
|
|
595 |
return text, all_caps
|
596 |
|
597 |
|
598 |
+
def get_exeptions(paras, remove_exceptions, verbosity=0):
|
599 |
"""
|
600 |
Extracts a list of exceptions from the License text.
|
601 |
|
|
|
620 |
for para in paras:
|
621 |
if re.search("exception", para.lower()):
|
622 |
exceptions.append(para)
|
623 |
+
if not remove_exceptions:
|
624 |
+
non_exception_paras.append(para)
|
625 |
else:
|
626 |
non_exception_paras.append(para)
|
627 |
|
|
|
699 |
The type of the most likely license. "Not found" if no license score is
|
700 |
above 0.9
|
701 |
"""
|
702 |
+
|
703 |
+
try:
|
704 |
+
from src.doc2vec import inference
|
705 |
+
except:
|
706 |
+
from doc2vec import inference
|
707 |
|
708 |
top1_result = inference(text).loc[0, :]
|
709 |
|
|
|
713 |
return "Not Found"
|
714 |
|
715 |
|
716 |
+
def clean_license_text(text, remove_exceptions=False, verbosity=0):
|
717 |
"""
|
718 |
Cleans License text.
|
719 |
|
|
|
742 |
text = script_cleaner(text)
|
743 |
text = preprocess_text(text)
|
744 |
paras, definitions, exceptions = split_definitions_exceptions(
|
745 |
+
text, remove_exceptions, verbosity=verbosity
|
746 |
)
|
747 |
text = PARA_BREAK.join(paras)
|
748 |
text = character_cleaner(text)
|
|
|
762 |
|
763 |
return text, definitions
|
764 |
|
|
|
765 |
"""
|
766 |
Notes:
|
767 |
|