Nihal D'Souza committed
Commit ac750db · 1 Parent(s): cf72a29

Pushing latest development branch

Files changed (2)
  1. src/doc2vec.py +19 -6
  2. src/textrank.py +203 -35
src/doc2vec.py CHANGED
@@ -1,14 +1,20 @@
+import os
 import gensim
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 import pandas as pd
-import nltk
 import json

+from clean import preprocess_text, script_cleaner

 MODEL_PATH = 'models/d2v.model'
 LICENSE_INDEX_PATH = 'data/index_license_map.json'

-license_index_name_map = json.load(open(LICENSE_INDEX_PATH))
+if os.path.exists(LICENSE_INDEX_PATH):
+    license_index_name_map = json.load(open(LICENSE_INDEX_PATH))
+elif os.path.exists("../" + LICENSE_INDEX_PATH):
+    license_index_name_map = json.load(open("../" + LICENSE_INDEX_PATH))
+else:
+    print("index_license_map Not Found!")


 def load_model():
@@ -20,7 +26,14 @@ def load_model():
     Returns: Doc2Vec
         Model object
     '''
-    model = Doc2Vec.load(MODEL_PATH)
+    if os.path.exists(MODEL_PATH):
+        model = Doc2Vec.load(MODEL_PATH)
+    elif os.path.exists("../" + MODEL_PATH):
+        model = Doc2Vec.load("../" + MODEL_PATH)
+    else:
+        print("d2v.model Not Found!")
+        return None
+
     return model


@@ -35,7 +48,8 @@ def preprocess(input):
     Return: TaggedDocument
         TaggedDocument Object
     '''
-    tokens = gensim.utils.simple_preprocess(input)
+    clean_input = preprocess_text(script_cleaner(input))
+    tokens = gensim.utils.simple_preprocess(clean_input)
     tagged_doc = TaggedDocument(words=tokens, tags=[1])
     return tagged_doc

@@ -112,5 +126,4 @@ def inference(input):
     infer_vec = inference_vector(model, processed_text)
     results = similarity_ranking(model, infer_vec)
     results_df = scores_to_df(results)
-    return results_df
-
+    return results_df
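A minimal usage sketch of the updated inference path (not part of the commit): it assumes the module is imported as doc2vec with src/ on the import path, so the relative models/ and data/ fallbacks above resolve; LICENSE.txt is a placeholder input.

# Hypothetical usage sketch; assumes models/d2v.model and
# data/index_license_map.json exist relative to the working directory
# (or one level up), per the new fallback logic.
from doc2vec import inference

with open("LICENSE.txt") as f:      # any raw license text
    raw_text = f.read()

results_df = inference(raw_text)    # DataFrame of per-license similarity scores
print(results_df.head())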
src/textrank.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import nltk
 import numpy as np
 import gensim
@@ -5,68 +6,235 @@ import spacy
 import math
 from collections import Counter

-from src.clean import clean_license_text
-from src.read_data import read_file
+try:
+    from src.clean import clean_license_text
+    from src.read_data import read_file
+except:
+    from clean import clean_license_text
+    from read_data import read_file
+
+
+NEGATION_WEIGHT = 0.2
+
+nlp = spacy.load("en_core_web_sm")
+
+modal_verbs = {
+    "can",
+    "may",
+    "must",
+    "shall",
+    "will",
+    # "could",
+    # "might",
+    "should",
+    "would"
+}
+
+neg_modal = {
+    "cannot",
+    "may not",
+    "must not",
+    "shall not",
+    "will not",
+    # "could not",
+    # "might not",
+    "should not",
+    "would not"
+}
+
+# TODO Move these structures to another file
+license_stopwords = {
+    ",",
+    "(",
+    ")",
+    "software",
+    "license",
+    "work",
+    # "copyright",
+    "program",
+    # "use",
+    # "copy",
+    "source",
+    # "may",
+    # "terms",
+    "code",
+    # "without",
+    # "free",
+    # "distribute",
+    # "rights",
+    # "notice",
+    # "shall",
+    "provided",
+    # "permission",
+    # "including",
+    "version",
+    "library",
+    # "condition",
+    "covered",
+    # "must",
+    "public",
+    # "modify",
+    # "distribution",
+    # "warranty",
+}.union(nlp.Defaults.stop_words) - modal_verbs
+
+negation_words = {
+    "no",
+    "not",
+    "non"
+}
+
+# TODO: Consider adding these words to the vocab:
+# no-charge
+#
+#
+#
+#
+
+verbs = [
+    "permit", "copy", "modify", "change", "sell", "reproduce",
+    "transfer", "rent", "lease", "assign", "sublet", "distribute",
+    "redistribute", "allow", "require", "merge", "publish", "use",
+    "include", "grant", "run", "affirm", "propagate", "acknowledge"
+]
+
+neg_verbs = [f"not-{verb}" for verb in verbs]

 properties_dict = {
-    "modify":['modify', 'modification', 'change'],
-    "distribute":['distribute', 'distribution'],
-    "copy":['copy'],
-    "copyright": ['copyright']
-    # "exception"
+    "0.1": [
+    ],
+    "0.2": ["everyone"],
+    "0.3": ["irrevocable"],
+    "0.4": [],
+    "0.5": [],
+    "0.6": [
+        "distribution", "redistribution",
+        "permission", "modification",
+        "copyright",
+        "permission",
+        "limitation",
+        "free", "charge",
+        "warranty",
+        "term", "terms", "condition",
+        "right",
+        "sublicense",
+        "commercial", "non-commercial",
+        "exception"
+    ],
+    "0.7": verbs + [
+
+    ],
+    "0.8": [],
+    "0.9": neg_verbs + [],
+    "1.0": [],
+    "3.0": modal_verbs
 }

+
 properties_scores = {
-    "modify": 0.8,
-    "distribute": 0.8,
-    "copy": 0.8,
-    "copyright": 0.9
+    "0.1": 0.1,
+    "0.2": 0.2,
+    "0.3": 0.3,
+    "0.4": 0.4,
+    "0.5": 0.5,
+    "0.6": 0.6,
+    "0.7": 0.7,
+    "0.8": 0.8,
+    "0.9": 0.9,
+    "1.0": 1.0,
+    "3.0": 3.0
 }

-nlp = spacy.load('en_core_web_sm')

 def lemmatize_tokens(sent):
-    #TODO: Docstrings
-    '''each word in input sentence is converted to lemma'''
-    return [token.lemma_.lower() for token in nlp(sent)]
+    # TODO: Docstrings
+    """Each word in input sentence is converted to lemma"""
+    lemmas = list()
+
+    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]

+    for tok_i, token in enumerate(nlp_sent):
+        if (token
+                and token not in license_stopwords
+                and token not in negation_words):
+            if tok_i > 0 and nlp_sent[tok_i-1] in negation_words:
+                lemmas.append(f"{nlp_sent[tok_i-1]}-{token}")
+            elif tok_i > 1 and nlp_sent[tok_i-1] in " -" and nlp_sent[tok_i-2] in negation_words:
+                lemmas.append(f"{nlp_sent[tok_i-2]}-{token}")
+            else:
+                lemmas.append(token)

-def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
-    '''
+    return lemmas
+
+
+def custom_textrank_summarizer(license_text,
+                               min_sent_len=3,
+                               summary_len=0.3,
+                               debug=False):
+    """
     TODO: Doctrings
-    '''
-    sent_scores = {}
+    """
+    sent_scores = Counter()
+
     cleaned_license_text, definitions = clean_license_text(license_text)
-    cleaned_license_sentences = cleaned_license_text.split('.')
+
+    cleaned_license_sentences = re.split('(\n{2,}|\.)', cleaned_license_text)
+    cleaned_license_sentences = [
+        text.strip() for text in cleaned_license_sentences
+        if text.strip() not in ["", ".", "\n", "\n\n"]
+    ]
+
     summary_len = math.ceil(summary_len*len(cleaned_license_sentences))
+
     if debug:
-        print(f'summary length:{summary_len}')
-    if debug:
+        print(f"summary length:{summary_len}")
         print(cleaned_license_sentences)
-    for i in cleaned_license_sentences:
-        if debug:
-            print(i.split())
-        if len(i.split()) < min_sent_len:
+
+    for sent_i, sent in enumerate(cleaned_license_sentences):
+
+        if sent_i < 0:
             continue
+
+        if len(sent.split()) < min_sent_len:
+            continue
+
         score = 0
+
+        lemmatized_tokens = lemmatize_tokens(sent)
+
+        if debug:
+            print("-"*50)
+            print(f"\nOriginal Sentence = {sent}")
+            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
+
+        word_count = Counter([tok for tok in lemmatized_tokens])
+
         for prop, prop_words in properties_dict.items():
             prop_score = 0
-            lemmatized_tokens = lemmatize_tokens(i)
-            word_count = Counter([tok for tok in lemmatized_tokens])
-            for prop_word in prop_words:
+
+            imp_words = list()
+
+            for prop_i, prop_word in enumerate(prop_words):
                 if prop_word in word_count.keys():
                     prop_score += properties_scores[prop]
+                    imp_words.append(prop_word)
+
             if debug:
-                print(prop, "=", prop_score)
+                print(prop, "=", imp_words, "=", prop_score)
+
             score += prop_score
-        sent_scores[i] = score/len(lemmatized_tokens)
+
+        sent_scores[sent] = score / len(lemmatized_tokens)
+
         if debug:
-            print(f'Sentence score: {sent_scores[i]}')
+            print(f"Sentence score: {sent_scores[sent]}")
             print()
+
     if debug:
         print(sent_scores)
-    sorted_sent_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))
-    summary = '.\n'.join(list(sorted_sent_scores.keys())[:summary_len])
-    return summary, definitions

+    sorted_sent_scores = sent_scores.most_common()
+
+    summary = ".\n".join(sent for sent, score in sorted_sent_scores[:summary_len])

+    return summary, definitions
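
Likewise, a minimal sketch of calling the rewritten summarizer (not part of the commit): it assumes src/ is on the import path, and the license file path below is a placeholder.

# Hypothetical usage sketch; the input path is a placeholder.
from textrank import custom_textrank_summarizer

with open("data/licenses/sample_license.txt") as f:
    license_text = f.read()

# summary_len is the fraction of cleaned sentences kept (rounded up);
# debug=True prints per-sentence lemmas and property scores.
summary, definitions = custom_textrank_summarizer(license_text,
                                                  min_sent_len=3,
                                                  summary_len=0.3,
                                                  debug=False)
print(summary)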