File size: 1,766 Bytes
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 14 23:53:47 2022

@author: UTKARSH
"""

import numpy as np

import re
import glob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the license-cleaning helper.  The module path differs depending on
# whether this script runs from the repository root (package-style import)
# or from inside src/ itself (flat import).
try:
    from src.clean import clean_license_text
except ImportError:
    # Fallback for running the script directly from the src/ directory.
    # NOTE: narrowed from a bare ``except:`` so unrelated errors
    # (KeyboardInterrupt, SystemExit, bugs inside src.clean) are not swallowed.
    from clean import clean_license_text


# Small English spaCy pipeline.
# NOTE(review): ``nlp`` is only referenced by the commented-out lemmatization
# step inside ``tfidf_preprocess`` — while that step stays disabled, loading
# the model is pure startup cost; confirm before removing.
nlp = spacy.load("en_core_web_sm")

# Number of highest-/lowest-scoring TF-IDF terms kept at the end of the script.
TOP_N_WORDS = 100


def tfidf_preprocess(text):
    """
    Lowercase *text* and collapse every non-letter run into a single space.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned, lowercased license text with surrounding whitespace stripped.

    """
    lowered = text.lower()

    # Each run of characters outside [a-zA-Z] becomes one space.
    letters_only = re.sub("[^a-zA-Z]+", " ", lowered)

    ## Lemmatization (disabled):
    # letters_only = " ".join([token.lemma_.lower().strip() for token in nlp(letters_only)])
    return letters_only.strip()

# Corpus of cleaned license texts, one entry per NOASSERTION license file.
corpus = []

filepaths = glob.glob("../notebooks/licenses/NOASSERTION/*.txt")
# filepaths.extend(glob.glob("../notebooks/licenses/OTHER/*.txt"))

for license_path in filepaths:
    with open(license_path, "r", encoding="utf-8") as license_file:
        # The first two lines of each file hold the source URL and a blank
        # line — consume both before reading the license body.
        license_file.readline()
        license_file.readline()

        license_body = license_file.read()

        cleaned_text, _ = clean_license_text(license_body)
        corpus.append(cleaned_text)


# TF-IDF over the cleaned corpus.
# NOTE: ``lowercase`` is ignored when a custom ``preprocessor`` is supplied;
# ``tfidf_preprocess`` already lowercases, so behavior is unchanged.
vectorizer = TfidfVectorizer(
    lowercase=True,
    preprocessor=tfidf_preprocess,
    stop_words="english"
)

tfidf = vectorizer.fit_transform(corpus)

feature_array = np.array(vectorizer.get_feature_names_out())

# Aggregate each term's TF-IDF score across all documents before ranking.
# BUGFIX: the previous code argsorted the full (n_docs, n_terms) matrix and
# flattened it, so the resulting order only reflected the last document's
# scores rather than a corpus-wide ranking.
tfidf_sorting = np.argsort(tfidf.toarray().sum(axis=0)).flatten()

# Highest-scoring terms, best first.
top_n = feature_array[tfidf_sorting][-TOP_N_WORDS:][::-1]
# Lowest-scoring terms, worst first.
bottom_n = feature_array[tfidf_sorting][:TOP_N_WORDS]