Nihal D'Souza committed on
Commit
95fa18f
·
1 Parent(s): 19dab1b

Updating clean function

Browse files
Files changed (1) hide show
  1. src/clean.py +12 -7
src/clean.py CHANGED
@@ -236,7 +236,7 @@ def isEnglish(s):
236
  return True
237
 
238
 
239
- def split_definitions_exceptions(text, verbosity=0):
240
  """
241
  Extract definitions from the License text
242
 
@@ -275,7 +275,7 @@ def split_definitions_exceptions(text, verbosity=0):
275
  )
276
 
277
  definitions += more_defs.strip()
278
- paras, exceptions = get_exeptions(paras, verbosity=verbosity)
279
 
280
  return paras, definitions, exceptions
281
 
@@ -595,7 +595,7 @@ def get_all_caps(text, verbosity=0):
595
  return text, all_caps
596
 
597
 
598
- def get_exeptions(paras, verbosity=0):
599
  """
600
  Extracts a list of exceptions from the License text.
601
 
@@ -620,6 +620,8 @@ def get_exeptions(paras, verbosity=0):
620
  for para in paras:
621
  if re.search("exception", para.lower()):
622
  exceptions.append(para)
 
 
623
  else:
624
  non_exception_paras.append(para)
625
 
@@ -697,7 +699,11 @@ def get_most_likely_license_type(text):
697
  The type of the most likely license. "Not found" if no license score is
698
  above 0.9
699
  """
700
- from src.doc2vec import inference
 
 
 
 
701
 
702
  top1_result = inference(text).loc[0, :]
703
 
@@ -707,7 +713,7 @@ def get_most_likely_license_type(text):
707
  return "Not Found"
708
 
709
 
710
- def clean_license_text(text, verbosity=0):
711
  """
712
  Cleans License text.
713
 
@@ -736,7 +742,7 @@ def clean_license_text(text, verbosity=0):
736
  text = script_cleaner(text)
737
  text = preprocess_text(text)
738
  paras, definitions, exceptions = split_definitions_exceptions(
739
- text, verbosity=verbosity
740
  )
741
  text = PARA_BREAK.join(paras)
742
  text = character_cleaner(text)
@@ -756,7 +762,6 @@ def clean_license_text(text, verbosity=0):
756
 
757
  return text, definitions
758
 
759
-
760
  """
761
  Notes:
762
 
 
236
  return True
237
 
238
 
239
+ def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
240
  """
241
  Extract definitions from the License text
242
 
 
275
  )
276
 
277
  definitions += more_defs.strip()
278
+ paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)
279
 
280
  return paras, definitions, exceptions
281
 
 
595
  return text, all_caps
596
 
597
 
598
+ def get_exeptions(paras, remove_exceptions, verbosity=0):
599
  """
600
  Extracts a list of exceptions from the License text.
601
 
 
620
  for para in paras:
621
  if re.search("exception", para.lower()):
622
  exceptions.append(para)
623
+ if not remove_exceptions:
624
+ non_exception_paras.append(para)
625
  else:
626
  non_exception_paras.append(para)
627
 
 
699
  The type of the most likely license. "Not found" if no license score is
700
  above 0.9
701
  """
702
+
703
+ try:
704
+ from src.doc2vec import inference
705
+ except:
706
+ from doc2vec import inference
707
 
708
  top1_result = inference(text).loc[0, :]
709
 
 
713
  return "Not Found"
714
 
715
 
716
+ def clean_license_text(text, remove_exceptions=False, verbosity=0):
717
  """
718
  Cleans License text.
719
 
 
742
  text = script_cleaner(text)
743
  text = preprocess_text(text)
744
  paras, definitions, exceptions = split_definitions_exceptions(
745
+ text, remove_exceptions, verbosity=verbosity
746
  )
747
  text = PARA_BREAK.join(paras)
748
  text = character_cleaner(text)
 
762
 
763
  return text, definitions
764
 
 
765
  """
766
  Notes:
767