Spaces:
Runtime error
Runtime error
Nihal D'Souza
committed on
Commit
·
a804ced
1
Parent(s):
0599777
Custom textrank, changes to UI
Browse files
- app.py +34 -5
- data/choosealicense_appendix_labels.csv +42 -42
- requirements.txt +17 -1
- src/abstractive_sum.py +14 -10
- src/clean.py +135 -44
- src/read_data.py +40 -15
- src/textrank.py +69 -0
app.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
import
|
2 |
-
import pandas as pd
|
3 |
-
import numpy as np
|
4 |
import nltk
|
5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
6 |
import torch
|
7 |
-
import
|
8 |
|
9 |
from src.doc2vec import inference
|
10 |
from src.abstractive_sum import summarize_text_with_model
|
|
|
|
|
11 |
|
12 |
CUSTOM_MODEL_NAME = "utkarshsaboo45/ClearlyDefinedLicenseSummarizer"
|
13 |
|
@@ -19,12 +19,37 @@ with st.spinner('Loading...'):
|
|
19 |
model = AutoModelForSeq2SeqLM.from_pretrained(CUSTOM_MODEL_NAME).to(device)
|
20 |
tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_NAME)
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
st.title('Clearly Defined: License Summarizer')
|
23 |
input = st.text_area('Enter contents of the license')
|
24 |
|
25 |
if len(input) > 0:
|
26 |
with st.spinner('Loading...'):
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
st.header('Summary')
|
29 |
st.write(summary)
|
30 |
|
@@ -32,3 +57,7 @@ if len(input) > 0:
|
|
32 |
st.header('Similarity Index')
|
33 |
st.dataframe(prediction_scores)
|
34 |
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
|
|
|
|
2 |
import nltk
|
3 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
4 |
import torch
|
5 |
+
import streamlit as st
|
6 |
|
7 |
from src.doc2vec import inference
|
8 |
from src.abstractive_sum import summarize_text_with_model
|
9 |
+
from src.textrank import custom_textrank_summarizer
|
10 |
+
from src.clean import clean_license_text
|
11 |
|
12 |
CUSTOM_MODEL_NAME = "utkarshsaboo45/ClearlyDefinedLicenseSummarizer"
|
13 |
|
|
|
19 |
model = AutoModelForSeq2SeqLM.from_pretrained(CUSTOM_MODEL_NAME).to(device)
|
20 |
tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_NAME)
|
21 |
|
22 |
+
summarization_type = st.sidebar.selectbox(
|
23 |
+
"Select summarization type.",
|
24 |
+
("Abstractive", "Extractive", "Both")
|
25 |
+
)
|
26 |
+
if summarization_type == 'Abstractive':
|
27 |
+
st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
|
28 |
+
elif summarization_type == 'Extractive':
|
29 |
+
st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
|
30 |
+
summary_len = st.sidebar.slider('Summary length percentage', 1, 10, 3)
|
31 |
+
elif summarization_type == 'Both':
|
32 |
+
st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
|
33 |
+
|
34 |
+
clean_text = st.sidebar.checkbox('Show cleaned license text')
|
35 |
+
|
36 |
st.title('Clearly Defined: License Summarizer')
|
37 |
input = st.text_area('Enter contents of the license')
|
38 |
|
39 |
if len(input) > 0:
|
40 |
with st.spinner('Loading...'):
|
41 |
+
if summarization_type == 'Abstractive':
|
42 |
+
summary, definitions = summarize_text_with_model(input, model, tokenizer)
|
43 |
+
if summarization_type == 'Extractive':
|
44 |
+
summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/10)
|
45 |
+
if summarization_type == 'Both':
|
46 |
+
summary, definitions = summarize_text_with_model(input, model, tokenizer)
|
47 |
+
summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
|
48 |
+
|
49 |
+
if clean_text:
|
50 |
+
st.header('Cleaned License Text')
|
51 |
+
st.write(clean_license_text(input)[0])
|
52 |
+
|
53 |
st.header('Summary')
|
54 |
st.write(summary)
|
55 |
|
|
|
57 |
st.header('Similarity Index')
|
58 |
st.dataframe(prediction_scores)
|
59 |
|
60 |
+
if definitions:
|
61 |
+
st.header('Definitions')
|
62 |
+
st.write(definitions)
|
63 |
+
|
data/choosealicense_appendix_labels.csv
CHANGED
@@ -1,42 +1,42 @@
|
|
1 |
-
spdx_id,license_name,commercial-use,disclose-source,distribution,document-changes,include-copyright,include-copyright--source,liability,modifications,network-use-disclose,patent-use,private-use,same-license,same-license--file,same-license--library,trademark-use,warranty
|
2 |
-
0bsd,BSD Zero Clause License,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
|
3 |
-
afl-3.0,Academic Free License v3.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
4 |
-
agpl-3.0,GNU Affero General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,,limitations
|
5 |
-
apache-2.0,Apache License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
6 |
-
artistic-2.0,Artistic License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
7 |
-
bsd-2-clause,BSD 2-Clause Simplified License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
8 |
-
bsd-3-clause,BSD 3-Clause New or Revised License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
9 |
-
bsd-3-clause-clear,BSD 3-Clause Clear License,permissions,,permissions,,conditions,,limitations,permissions,,limitations,permissions,,,,,limitations
|
10 |
-
bsd-4-clause,BSD 4-Clause Original or Old License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
11 |
-
bsl-1.0,Boost Software License 1.0,permissions,,permissions,,,conditions,limitations,permissions,,,permissions,,,,,limitations
|
12 |
-
cc-by-4.0,Creative Commons Attribution 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,,,,limitations,limitations
|
13 |
-
cc-by-sa-4.0,Creative Commons Attribution Share Alike 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations
|
14 |
-
cc0-1.0,Creative Commons Zero v1.0 Universal,permissions,,permissions,,,,limitations,permissions,,limitations,permissions,,,,limitations,limitations
|
15 |
-
cecill-2.1,CeCILL Free Software License Agreement v2.1,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
16 |
-
ecl-2.0,Educational Community License v2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
17 |
-
epl-1.0,Eclipse Public License 1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
18 |
-
epl-2.0,Eclipse Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
19 |
-
eupl-1.1,European Union Public License 1.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
|
20 |
-
eupl-1.2,European Union Public License 1.2,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
|
21 |
-
gpl-2.0,GNU General Public License v2.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations
|
22 |
-
gpl-3.0,GNU General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
23 |
-
isc,ISC License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
24 |
-
lgpl-2.1,GNU Lesser General Public License v2.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,conditions,,limitations
|
25 |
-
lgpl-3.0,GNU Lesser General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,conditions,,limitations
|
26 |
-
lppl-1.3c,LaTeX Project Public License v1.3c,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
27 |
-
mit,MIT License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
28 |
-
mit-0,MIT No Attribution,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
|
29 |
-
mpl-2.0,Mozilla Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,,conditions,,limitations,limitations
|
30 |
-
ms-pl,Microsoft Public License,permissions,,permissions,,conditions,,,permissions,,permissions,permissions,,,,limitations,limitations
|
31 |
-
ms-rl,Microsoft Reciprocal License,permissions,conditions,permissions,,conditions,,,permissions,,permissions,permissions,,conditions,,limitations,limitations
|
32 |
-
mulanpsl-2.0,"Mulan Permissive Software License, Version 2",permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
33 |
-
ncsa,University of IllinoisNCSA Open Source License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
34 |
-
odbl-1.0,Open Data Commons Open Database License v1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations
|
35 |
-
ofl-1.1,SIL Open Font License 1.1,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations
|
36 |
-
osl-3.0,Open Software License 3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
|
37 |
-
postgresql,PostgreSQL License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
38 |
-
unlicense,The Unlicense,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
|
39 |
-
upl-1.0,Universal Permissive License v1.0,permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,,limitations
|
40 |
-
vim,Vim License,permissions,conditions,permissions,conditions,conditions,,,permissions,,,permissions,conditions
|
41 |
-
wtfpl,Do What The Fck You Want To Public License,permissions,,permissions,,,,,permissions,,,permissions
|
42 |
-
zlib,zlib License,permissions,,permissions,conditions,,conditions,limitations,permissions,,,permissions,,,,,limitations
|
|
|
1 |
+
spdx_id,license_name,commercial-use,disclose-source,distribution,document-changes,include-copyright,include-copyright--source,liability,modifications,network-use-disclose,patent-use,private-use,same-license,same-license--file,same-license--library,trademark-use,warranty
|
2 |
+
0bsd,BSD Zero Clause License,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
|
3 |
+
afl-3.0,Academic Free License v3.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
4 |
+
agpl-3.0,GNU Affero General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,,limitations
|
5 |
+
apache-2.0,Apache License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
6 |
+
artistic-2.0,Artistic License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
7 |
+
bsd-2-clause,BSD 2-Clause Simplified License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
8 |
+
bsd-3-clause,BSD 3-Clause New or Revised License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
9 |
+
bsd-3-clause-clear,BSD 3-Clause Clear License,permissions,,permissions,,conditions,,limitations,permissions,,limitations,permissions,,,,,limitations
|
10 |
+
bsd-4-clause,BSD 4-Clause Original or Old License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
11 |
+
bsl-1.0,Boost Software License 1.0,permissions,,permissions,,,conditions,limitations,permissions,,,permissions,,,,,limitations
|
12 |
+
cc-by-4.0,Creative Commons Attribution 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,,,,limitations,limitations
|
13 |
+
cc-by-sa-4.0,Creative Commons Attribution Share Alike 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations
|
14 |
+
cc0-1.0,Creative Commons Zero v1.0 Universal,permissions,,permissions,,,,limitations,permissions,,limitations,permissions,,,,limitations,limitations
|
15 |
+
cecill-2.1,CeCILL Free Software License Agreement v2.1,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
16 |
+
ecl-2.0,Educational Community License v2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
17 |
+
epl-1.0,Eclipse Public License 1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
18 |
+
epl-2.0,Eclipse Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
19 |
+
eupl-1.1,European Union Public License 1.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
|
20 |
+
eupl-1.2,European Union Public License 1.2,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
|
21 |
+
gpl-2.0,GNU General Public License v2.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations
|
22 |
+
gpl-3.0,GNU General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
|
23 |
+
isc,ISC License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
24 |
+
lgpl-2.1,GNU Lesser General Public License v2.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,conditions,,limitations
|
25 |
+
lgpl-3.0,GNU Lesser General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,conditions,,limitations
|
26 |
+
lppl-1.3c,LaTeX Project Public License v1.3c,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
27 |
+
mit,MIT License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
28 |
+
mit-0,MIT No Attribution,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
|
29 |
+
mpl-2.0,Mozilla Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,,conditions,,limitations,limitations
|
30 |
+
ms-pl,Microsoft Public License,permissions,,permissions,,conditions,,,permissions,,permissions,permissions,,,,limitations,limitations
|
31 |
+
ms-rl,Microsoft Reciprocal License,permissions,conditions,permissions,,conditions,,,permissions,,permissions,permissions,,conditions,,limitations,limitations
|
32 |
+
mulanpsl-2.0,"Mulan Permissive Software License, Version 2",permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
|
33 |
+
ncsa,University of IllinoisNCSA Open Source License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
34 |
+
odbl-1.0,Open Data Commons Open Database License v1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations
|
35 |
+
ofl-1.1,SIL Open Font License 1.1,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations
|
36 |
+
osl-3.0,Open Software License 3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
|
37 |
+
postgresql,PostgreSQL License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
|
38 |
+
unlicense,The Unlicense,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
|
39 |
+
upl-1.0,Universal Permissive License v1.0,permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,,limitations
|
40 |
+
vim,Vim License,permissions,conditions,permissions,conditions,conditions,,,permissions,,,permissions,conditions,,,,
|
41 |
+
wtfpl,Do What The Fck You Want To Public License,permissions,,permissions,,,,,permissions,,,permissions,,,,,
|
42 |
+
zlib,zlib License,permissions,,permissions,conditions,,conditions,limitations,permissions,,,permissions,,,,,limitations
|
requirements.txt
CHANGED
@@ -10,15 +10,19 @@ base58==2.1.1
|
|
10 |
beautifulsoup4==4.11.1
|
11 |
bleach==5.0.0
|
12 |
blinker==1.4
|
|
|
13 |
cachetools==5.1.0
|
|
|
14 |
certifi==2021.10.8
|
15 |
cffi==1.15.0
|
16 |
charset-normalizer==2.0.12
|
17 |
click==8.0.4
|
18 |
cycler==0.11.0
|
|
|
19 |
debugpy==1.6.0
|
20 |
decorator==5.1.1
|
21 |
defusedxml==0.7.1
|
|
|
22 |
entrypoints==0.4
|
23 |
executing==0.8.3
|
24 |
fastjsonschema==2.15.3
|
@@ -43,10 +47,12 @@ jupyter-core==4.10.0
|
|
43 |
jupyterlab-pygments==0.2.2
|
44 |
jupyterlab-widgets==1.1.0
|
45 |
kiwisolver==1.4.2
|
|
|
46 |
MarkupSafe==2.1.1
|
47 |
matplotlib==3.5.0
|
48 |
matplotlib-inline==0.1.3
|
49 |
mistune==0.8.4
|
|
|
50 |
nbclient==0.6.3
|
51 |
nbconvert==6.5.0
|
52 |
nbformat==5.4.0
|
@@ -58,10 +64,12 @@ packaging==21.3
|
|
58 |
pandas==1.3.4
|
59 |
pandocfilters==1.5.0
|
60 |
parso==0.8.3
|
|
|
61 |
pexpect==4.8.0
|
62 |
pickleshare==0.7.5
|
63 |
Pillow==9.1.1
|
64 |
pip==22.1
|
|
|
65 |
prometheus-client==0.14.1
|
66 |
prompt-toolkit==3.0.29
|
67 |
protobuf==3.20.1
|
@@ -70,6 +78,7 @@ ptyprocess==0.7.0
|
|
70 |
pure-eval==0.2.2
|
71 |
pyarrow==8.0.0
|
72 |
pycparser==2.21
|
|
|
73 |
pydeck==0.7.1
|
74 |
Pygments==2.12.0
|
75 |
Pympler==1.0.1
|
@@ -90,13 +99,18 @@ setuptools==62.3.1
|
|
90 |
setuptools-scm==6.4.2
|
91 |
six==1.16.0
|
92 |
sklearn==0.0
|
93 |
-
smart-open==
|
94 |
smmap==5.0.0
|
95 |
soupsieve==2.3.2.post1
|
|
|
|
|
|
|
|
|
96 |
stack-data==0.2.0
|
97 |
streamlit==1.9.0
|
98 |
striprtf==0.0.20
|
99 |
terminado==0.15.0
|
|
|
100 |
threadpoolctl==3.1.0
|
101 |
tinycss2==1.1.1
|
102 |
tokenizers==0.12.1
|
@@ -108,11 +122,13 @@ tornado==6.1
|
|
108 |
tqdm==4.64.0
|
109 |
traitlets==5.2.1.post0
|
110 |
transformers==4.19.2
|
|
|
111 |
typing_extensions==4.2.0
|
112 |
tzdata==2022.1
|
113 |
tzlocal==4.2
|
114 |
urllib3==1.26.9
|
115 |
validators==0.19.0
|
|
|
116 |
watchdog==2.1.8
|
117 |
wcwidth==0.2.5
|
118 |
webencodings==0.5.1
|
|
|
10 |
beautifulsoup4==4.11.1
|
11 |
bleach==5.0.0
|
12 |
blinker==1.4
|
13 |
+
blis==0.7.7
|
14 |
cachetools==5.1.0
|
15 |
+
catalogue==2.0.7
|
16 |
certifi==2021.10.8
|
17 |
cffi==1.15.0
|
18 |
charset-normalizer==2.0.12
|
19 |
click==8.0.4
|
20 |
cycler==0.11.0
|
21 |
+
cymem==2.0.6
|
22 |
debugpy==1.6.0
|
23 |
decorator==5.1.1
|
24 |
defusedxml==0.7.1
|
25 |
+
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl
|
26 |
entrypoints==0.4
|
27 |
executing==0.8.3
|
28 |
fastjsonschema==2.15.3
|
|
|
47 |
jupyterlab-pygments==0.2.2
|
48 |
jupyterlab-widgets==1.1.0
|
49 |
kiwisolver==1.4.2
|
50 |
+
langcodes==3.3.0
|
51 |
MarkupSafe==2.1.1
|
52 |
matplotlib==3.5.0
|
53 |
matplotlib-inline==0.1.3
|
54 |
mistune==0.8.4
|
55 |
+
murmurhash==1.0.7
|
56 |
nbclient==0.6.3
|
57 |
nbconvert==6.5.0
|
58 |
nbformat==5.4.0
|
|
|
64 |
pandas==1.3.4
|
65 |
pandocfilters==1.5.0
|
66 |
parso==0.8.3
|
67 |
+
pathy==0.6.1
|
68 |
pexpect==4.8.0
|
69 |
pickleshare==0.7.5
|
70 |
Pillow==9.1.1
|
71 |
pip==22.1
|
72 |
+
preshed==3.0.6
|
73 |
prometheus-client==0.14.1
|
74 |
prompt-toolkit==3.0.29
|
75 |
protobuf==3.20.1
|
|
|
78 |
pure-eval==0.2.2
|
79 |
pyarrow==8.0.0
|
80 |
pycparser==2.21
|
81 |
+
pydantic==1.8.2
|
82 |
pydeck==0.7.1
|
83 |
Pygments==2.12.0
|
84 |
Pympler==1.0.1
|
|
|
99 |
setuptools-scm==6.4.2
|
100 |
six==1.16.0
|
101 |
sklearn==0.0
|
102 |
+
smart-open==5.2.1
|
103 |
smmap==5.0.0
|
104 |
soupsieve==2.3.2.post1
|
105 |
+
spacy==3.3.0
|
106 |
+
spacy-legacy==3.0.9
|
107 |
+
spacy-loggers==1.0.2
|
108 |
+
srsly==2.4.3
|
109 |
stack-data==0.2.0
|
110 |
streamlit==1.9.0
|
111 |
striprtf==0.0.20
|
112 |
terminado==0.15.0
|
113 |
+
thinc==8.0.16
|
114 |
threadpoolctl==3.1.0
|
115 |
tinycss2==1.1.1
|
116 |
tokenizers==0.12.1
|
|
|
122 |
tqdm==4.64.0
|
123 |
traitlets==5.2.1.post0
|
124 |
transformers==4.19.2
|
125 |
+
typer==0.4.1
|
126 |
typing_extensions==4.2.0
|
127 |
tzdata==2022.1
|
128 |
tzlocal==4.2
|
129 |
urllib3==1.26.9
|
130 |
validators==0.19.0
|
131 |
+
wasabi==0.9.1
|
132 |
watchdog==2.1.8
|
133 |
wcwidth==0.2.5
|
134 |
webencodings==0.5.1
|
src/abstractive_sum.py
CHANGED
@@ -12,9 +12,8 @@ import pandas as pd
|
|
12 |
import torch
|
13 |
from torch.utils.data import Dataset, DataLoader
|
14 |
from sklearn.model_selection import train_test_split
|
|
|
15 |
|
16 |
-
|
17 |
-
import torch.nn as nn
|
18 |
from tqdm.auto import tqdm
|
19 |
|
20 |
from transformers import (
|
@@ -32,7 +31,7 @@ MODEL_PATH = "models/"
|
|
32 |
MODEL_FILENAME = "t5-base.model"
|
33 |
|
34 |
MODEL_NAME = "t5-base"
|
35 |
-
|
36 |
|
37 |
TEXT_MAX_TOKEN_LEN = 512
|
38 |
SUMMARY_MAX_TOKEN_LEN = 128
|
@@ -56,13 +55,14 @@ class LicenseSummaryDataset(Dataset):
|
|
56 |
self.text_max_token_len = text_max_token_len
|
57 |
self.summary_max_token_len = summary_max_token_len
|
58 |
|
|
|
59 |
def __len__(self):
|
60 |
return len(self.data)
|
61 |
|
62 |
def __getitem__(self, index: int):
|
63 |
data_row = self.data.iloc[index]
|
64 |
text = data_row["text"]
|
65 |
-
text_encoding =
|
66 |
text,
|
67 |
max_length=self.text_max_token_len,
|
68 |
padding="max_length",
|
@@ -72,7 +72,7 @@ class LicenseSummaryDataset(Dataset):
|
|
72 |
return_tensors="pt"
|
73 |
)
|
74 |
|
75 |
-
summary_encoding =
|
76 |
data_row["summary"],
|
77 |
max_length=self.summary_max_token_len,
|
78 |
padding="max_length",
|
@@ -111,6 +111,8 @@ def prepare_dataloaders():
|
|
111 |
|
112 |
train_df, dev_df = train_test_split(license_summary_data, test_size=0.1)
|
113 |
|
|
|
|
|
114 |
train_dataset = LicenseSummaryDataset(
|
115 |
train_df,
|
116 |
TOKENIZER,
|
@@ -239,6 +241,8 @@ def summarize_text_with_model(text, model, tokenizer):
|
|
239 |
Summary of the License text from the given model.
|
240 |
|
241 |
"""
|
|
|
|
|
242 |
text_encoding = tokenizer(
|
243 |
text,
|
244 |
max_length=TEXT_MAX_TOKEN_LEN,
|
@@ -267,10 +271,10 @@ def summarize_text_with_model(text, model, tokenizer):
|
|
267 |
) for gen_id in generated_ids
|
268 |
]
|
269 |
|
270 |
-
return "".join(preds)
|
271 |
|
272 |
|
273 |
-
def summarize(text, load_from_huggingface=
|
274 |
"""
|
275 |
Summarizes the given License text
|
276 |
|
@@ -295,6 +299,7 @@ def summarize(text, load_from_huggingface=False):
|
|
295 |
if os.path.exists(MODEL_PATH + MODEL_FILENAME):
|
296 |
print("Loading Model...")
|
297 |
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True).to(device)
|
|
|
298 |
model.load_state_dict(torch.load(MODEL_PATH + MODEL_FILENAME))
|
299 |
model.eval()
|
300 |
else:
|
@@ -305,8 +310,7 @@ def summarize(text, load_from_huggingface=False):
|
|
305 |
model = train_and_save_model(train_dataloader, MODEL_PATH + MODEL_FILENAME)
|
306 |
tokenizer = TOKENIZER
|
307 |
|
308 |
-
|
309 |
-
return summary
|
310 |
|
311 |
|
312 |
def summarize_license_files(path):
|
@@ -322,6 +326,6 @@ def summarize_license_files(path):
|
|
322 |
paths = glob.glob(path + "*.txt")
|
323 |
for license_path in paths:
|
324 |
with open(license_path, "r", encoding="utf-8") as f:
|
325 |
-
summary = summarize(f.read())
|
326 |
with open(license_path.replace(".txt", "") + "__summary.txt", "w", encoding="utf-8") as f:
|
327 |
f.write(summary)
|
|
|
12 |
import torch
|
13 |
from torch.utils.data import Dataset, DataLoader
|
14 |
from sklearn.model_selection import train_test_split
|
15 |
+
from src.clean import clean_license_text
|
16 |
|
|
|
|
|
17 |
from tqdm.auto import tqdm
|
18 |
|
19 |
from transformers import (
|
|
|
31 |
MODEL_FILENAME = "t5-base.model"
|
32 |
|
33 |
MODEL_NAME = "t5-base"
|
34 |
+
TOKENIZER = None
|
35 |
|
36 |
TEXT_MAX_TOKEN_LEN = 512
|
37 |
SUMMARY_MAX_TOKEN_LEN = 128
|
|
|
55 |
self.text_max_token_len = text_max_token_len
|
56 |
self.summary_max_token_len = summary_max_token_len
|
57 |
|
58 |
+
|
59 |
def __len__(self):
|
60 |
return len(self.data)
|
61 |
|
62 |
def __getitem__(self, index: int):
|
63 |
data_row = self.data.iloc[index]
|
64 |
text = data_row["text"]
|
65 |
+
text_encoding = self.tokenizer(
|
66 |
text,
|
67 |
max_length=self.text_max_token_len,
|
68 |
padding="max_length",
|
|
|
72 |
return_tensors="pt"
|
73 |
)
|
74 |
|
75 |
+
summary_encoding = self.tokenizer(
|
76 |
data_row["summary"],
|
77 |
max_length=self.summary_max_token_len,
|
78 |
padding="max_length",
|
|
|
111 |
|
112 |
train_df, dev_df = train_test_split(license_summary_data, test_size=0.1)
|
113 |
|
114 |
+
TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
|
115 |
+
|
116 |
train_dataset = LicenseSummaryDataset(
|
117 |
train_df,
|
118 |
TOKENIZER,
|
|
|
241 |
Summary of the License text from the given model.
|
242 |
|
243 |
"""
|
244 |
+
text, definitions = clean_license_text(text)
|
245 |
+
|
246 |
text_encoding = tokenizer(
|
247 |
text,
|
248 |
max_length=TEXT_MAX_TOKEN_LEN,
|
|
|
271 |
) for gen_id in generated_ids
|
272 |
]
|
273 |
|
274 |
+
return "".join(preds), definitions
|
275 |
|
276 |
|
277 |
+
def summarize(text, load_from_huggingface=True):
|
278 |
"""
|
279 |
Summarizes the given License text
|
280 |
|
|
|
299 |
if os.path.exists(MODEL_PATH + MODEL_FILENAME):
|
300 |
print("Loading Model...")
|
301 |
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True).to(device)
|
302 |
+
TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
|
303 |
model.load_state_dict(torch.load(MODEL_PATH + MODEL_FILENAME))
|
304 |
model.eval()
|
305 |
else:
|
|
|
310 |
model = train_and_save_model(train_dataloader, MODEL_PATH + MODEL_FILENAME)
|
311 |
tokenizer = TOKENIZER
|
312 |
|
313 |
+
return summarize_text_with_model(text, model, tokenizer)
|
|
|
314 |
|
315 |
|
316 |
def summarize_license_files(path):
|
|
|
326 |
paths = glob.glob(path + "*.txt")
|
327 |
for license_path in paths:
|
328 |
with open(license_path, "r", encoding="utf-8") as f:
|
329 |
+
summary, _ = summarize(f.read())
|
330 |
with open(license_path.replace(".txt", "") + "__summary.txt", "w", encoding="utf-8") as f:
|
331 |
f.write(summary)
|
src/clean.py
CHANGED
@@ -1,27 +1,46 @@
|
|
1 |
import re
|
2 |
-
import
|
3 |
from bs4 import BeautifulSoup
|
4 |
from striprtf.striprtf import rtf_to_text
|
5 |
-
|
6 |
-
|
|
|
7 |
|
8 |
|
9 |
def php_cleaner(text):
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
def html_cleaner(text):
|
12 |
soup = BeautifulSoup(text)
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def gnu_cleaner(text):
|
24 |
t = text.split('END OF TERMS AND CONDITIONS')[0]
|
|
|
25 |
if 'Preamble' in text:
|
26 |
if len(t.split('Preamble')[0])>100:
|
27 |
t0 = t.split('Preamble')[0]
|
@@ -32,49 +51,121 @@ def gnu_cleaner(text):
|
|
32 |
t1 = t.split('Preamble')[1].split('distribution and\n\nmodification follow')[1]
|
33 |
except:
|
34 |
t1 = t.split('Preamble')[1].split('distribution and modification follow')[1]
|
35 |
-
|
36 |
else:
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def rtf_cleaner(text):
|
41 |
return rtf_to_text(text)
|
42 |
-
|
43 |
-
|
44 |
def url_cleaner(text):
|
45 |
-
return re.sub(r
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
def isEnglish(s):
|
47 |
try:
|
48 |
-
s.encode(encoding=
|
49 |
except UnicodeDecodeError:
|
50 |
return False
|
51 |
else:
|
52 |
return True
|
53 |
-
|
54 |
-
|
55 |
-
def
|
56 |
-
|
57 |
-
if text
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
62 |
elif "</html>" in text:
|
63 |
-
|
64 |
-
elif text[0] ==
|
65 |
-
|
66 |
-
t = json_cleaner(json.load(f))
|
67 |
-
elif "GNU" in text or "Apache" in text:
|
68 |
-
t = gnu_cleaner(text)
|
69 |
elif "\\rtf" in text:
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
else:
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
|
75 |
-
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
+
import json
|
3 |
from bs4 import BeautifulSoup
|
4 |
from striprtf.striprtf import rtf_to_text
|
5 |
+
|
6 |
+
|
7 |
+
PARA_BREAK = "para___break"
|
8 |
|
9 |
|
10 |
def php_cleaner(text):
    """Return the first ``/* ... */`` block comment found in ``text``.

    Args:
        text: Source text of a PHP file.

    Returns:
        The first block comment, including its ``/*`` and ``*/`` delimiters,
        or ``""`` when there is no block comment or ``text`` is not a string.
    """
    try:
        # Non-greedy so the match stops at the first closing "*/".
        match = re.search(r"/\*[\S\s]*?\*/", text)
    except TypeError:
        # Keep the best-effort contract for non-string input without
        # swallowing unrelated exceptions the way a bare ``except`` would.
        return ""
    return match.group(0) if match else ""
|
16 |
+
|
17 |
+
|
18 |
def html_cleaner(text):
    """Return the text content of an HTML document.

    Args:
        text: Raw HTML markup.

    Returns:
        The text of the ``<body>`` element, or of the whole parsed document
        when it has no ``<body>``; ``""`` when that text is empty.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(text, "html.parser")
    # A document without a <body> tag yields ``soup.body is None``; fall back
    # to the whole tree instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    return root.text or ""
|
24 |
+
|
25 |
+
|
26 |
+
def json_cleaner(text_dict):
    """Format the ``description`` and ``license`` entries of a mapping.

    Args:
        text_dict: Mapping (e.g. a parsed JSON object) to read from.

    Returns:
        A string made of ``"<key>: <value>, "`` for each matching key, in the
        mapping's iteration order (each entry keeps its trailing ``", "``), or
        ``""`` when neither key is present.
    """
    wanted = ("description", "license")
    # Build the result in one join instead of repeated string concatenation.
    return "".join(
        f"{key}: {value!s}, "
        for key, value in text_dict.items()
        if key in wanted
    )
|
35 |
+
|
36 |
+
|
37 |
+
def discard_text_after_tnc(text):
|
38 |
+
return text.split("END OF TERMS AND CONDITIONS")[0]
|
39 |
+
|
40 |
+
|
41 |
def gnu_cleaner(text):
|
42 |
t = text.split('END OF TERMS AND CONDITIONS')[0]
|
43 |
+
definitions = ""
|
44 |
if 'Preamble' in text:
|
45 |
if len(t.split('Preamble')[0])>100:
|
46 |
t0 = t.split('Preamble')[0]
|
|
|
51 |
t1 = t.split('Preamble')[1].split('distribution and\n\nmodification follow')[1]
|
52 |
except:
|
53 |
t1 = t.split('Preamble')[1].split('distribution and modification follow')[1]
|
54 |
+
t = t0+t1
|
55 |
else:
|
56 |
+
t = t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
|
57 |
+
if 'Definitions' in text:
|
58 |
+
try:
|
59 |
+
def_pos = re.search(r"[0-9]\.? (Additional )?Definitions",t).span()
|
60 |
+
other_start_pos = re.search(r"[0-9]\.? [A-Z][a-z]+",t[def_pos[1]:]).span()[0]
|
61 |
+
definitions = t[def_pos[0]: def_pos[1] + other_start_pos]
|
62 |
+
t = t[:def_pos[0]] + t[def_pos[1]+other_start_pos:]
|
63 |
+
except:
|
64 |
+
t = t
|
65 |
+
return t, definitions
|
66 |
+
|
67 |
+
|
68 |
def rtf_cleaner(text):
|
69 |
return rtf_to_text(text)
|
70 |
+
|
71 |
+
|
72 |
def url_cleaner(text):
|
73 |
+
return re.sub(r"http\S+", "", text)
|
74 |
+
|
75 |
+
|
76 |
+
def email_cleaner(text):
|
77 |
+
return re.sub(r"\S*@\S*", "", text)
|
78 |
+
|
79 |
+
|
80 |
+
def var_cleaner(text):
    """Remove template placeholders from ``text``.

    Two forms are stripped: ``$name`` style variables, and brace-delimited
    groups whose contents are limited to word characters, whitespace,
    brackets, parentheses, quotes and ``. _ ,``.
    """
    without_dollar_vars = re.sub(r"\$\w+", "", text)
    return re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", without_dollar_vars)
|
84 |
+
|
85 |
+
|
86 |
+
def character_cleaner(text):
    """Strip URLs, e-mail addresses, template variables and noisy punctuation.

    The steps run in a fixed order because each one feeds the next:

    1. URLs, e-mail addresses and template variables are removed.
    2. Runs of two or more newlines become ``". "``.
    3. Runs of separator characters (``: % # < > = * - / ·``, braces and any
       whitespace) collapse to a single space.
    4. Runs of two or more dots/spaces collapse to ``". "``.

    The patterns are raw strings so that ``\\-``, ``\\s`` and ``\\.`` reach the
    regex engine unchanged instead of being invalid string escapes, which
    emit warnings on current Python versions.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    text = url_cleaner(text)
    text = email_cleaner(text)
    text = var_cleaner(text)

    text = re.sub(r"[\n]{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    text = re.sub(r"[\. ]{2,}", ". ", text)
    return text
|
95 |
+
|
96 |
+
|
97 |
def isEnglish(s):
    """Report whether ``s`` consists solely of ASCII characters.

    The string is encoded as UTF-8 and decoded as ASCII; a decoding failure
    means at least one non-ASCII character is present.
    """
    utf8_bytes = s.encode(encoding="utf-8")
    try:
        utf8_bytes.decode("ascii")
    except UnicodeDecodeError:
        return False
    return True
|
104 |
+
|
105 |
+
|
106 |
+
def preprocess_text(text):
    """Apply ``gnu_cleaner`` to texts that mention "GNU" or "Apache".

    Args:
        text: License text.

    Returns:
        A ``(text, definitions)`` tuple. For texts mentioning neither name
        the input is returned untouched with empty definitions; otherwise
        both values come from ``gnu_cleaner``, with the definitions stripped
        of surrounding whitespace.
    """
    if "GNU" not in text and "Apache" not in text:
        return text, ""

    cleaned, definitions = gnu_cleaner(text)
    return cleaned, definitions.strip()
|
112 |
+
|
113 |
+
|
114 |
+
def script_cleaner(text):
    """Unwrap license text embedded in PHP, HTML, JSON or RTF.

    The first matching container wins, checked in this order: PHP
    (``<?php``), HTML (``</html>``), JSON (text wrapped in ``{`` ... ``}``
    that actually parses), RTF (``\\rtf``). Text matching none of them is
    returned unchanged.

    A brace-wrapped text that is not valid JSON falls through to the RTF
    check instead of raising, because an RTF document (``{\\rtf1 ... }``) has
    the same outer braces. Empty input is also safe.

    Args:
        text: Raw file contents.

    Returns:
        The unwrapped text, or ``""`` when the cleaner produced nothing.
    """
    if "<?php" in text:
        text = php_cleaner(text)
    elif "</html>" in text:
        text = html_cleaner(text)
    else:
        parsed = None
        # startswith/endswith are safe on the empty string, unlike text[0].
        if text.startswith("{") and text.endswith("}"):
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError:
                # Not JSON after all; try the remaining formats.
                parsed = None

        if parsed is not None:
            text = json_cleaner(parsed)
        elif "\\rtf" in text:
            text = rtf_cleaner(text)

    return text or ""
|
126 |
+
|
127 |
+
|
128 |
+
def split_paras(text):
    """Split ``text`` into paragraphs on its widest blank-line separator.

    Separators of four, three and two consecutive newlines are tried in that
    order, and the first one found in the text is used for the whole split.
    Text containing none of them is returned as a single paragraph.
    """
    for separator in ("\n\n\n\n", "\n\n\n", "\n\n"):
        if separator in text:
            return text.split(separator)
    return [text]
|
138 |
+
|
139 |
+
|
140 |
+
def clean_paras(paras):
    """Return ``paras`` unchanged.

    Currently a pass-through: the very same object is handed back.
    """
    return paras
|
142 |
+
|
143 |
+
|
144 |
+
def clean_license_text(text):
    """Run the full cleaning pipeline over a raw license text.

    Steps: unwrap container formats (``script_cleaner``), split off the
    definitions section (``preprocess_text``), split into paragraphs, join
    them with ``PARA_BREAK`` so the breaks survive ``character_cleaner``,
    then turn the markers back into blank lines and strip the result.

    A text is rejected only when both the whole text and a short sample of
    words near its end contain non-ASCII characters.

    Args:
        text: Raw license text.

    Returns:
        A ``(cleaned_text, definitions)`` tuple. Empty input and rejected
        texts both yield ``("", "")``, so callers can always unpack two
        values.
    """
    # Previously a bare string was returned here, which broke tuple unpacking.
    if not text:
        return "", ""

    text = script_cleaner(text)
    text, definitions = preprocess_text(text)
    paras = clean_paras(split_paras(text))
    text = PARA_BREAK.join(paras)
    text = character_cleaner(text)
    # PARA_BREAK is used as a regex pattern here, as in the original code.
    text = re.sub(PARA_BREAK, "\n\n", text)
    text = text.strip()

    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
        return "", ""

    return text, definitions
|
162 |
+
|
163 |
+
|
164 |
+
"""
|
165 |
+
Notes:
|
166 |
+
|
167 |
+
1. Regex for other definitions: --------> ".{0,20}".{0,40}means
|
168 |
+
2. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
|
169 |
+
-> Merge with the next para
|
170 |
+
Ex. "8. Termination."
|
171 |
+
"""
|
src/read_data.py
CHANGED
@@ -129,7 +129,7 @@ def augment_summary(license_data):
|
|
129 |
return license_data
|
130 |
|
131 |
|
132 |
-
def read_license_data(labels_file="choosealicense_appendix_labels.csv"):
|
133 |
"""
|
134 |
Reads data from Text and Summary File and stores it as a dictionary of
|
135 |
dictionaries.
|
@@ -142,21 +142,46 @@ def read_license_data(labels_file="choosealicense_appendix_labels.csv"):
|
|
142 |
corresponding summaries and license texts respectively.
|
143 |
|
144 |
"""
|
|
|
145 |
files = glob.glob(gold_licenses_data + "*")
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
data_dict = defaultdict(dict)
|
|
|
147 |
for file_path in files:
|
|
|
|
|
|
|
|
|
|
|
148 |
if file_path.endswith(".summary"):
|
149 |
-
file_name = file_path.split(
|
150 |
data_dict[file_name]["summary"] = read_file(file_path)
|
151 |
elif file_path.endswith(".txt"):
|
152 |
-
file_name = file_path.split(
|
153 |
data_dict[file_name]["text"] = clean_data(read_file(file_path))
|
154 |
|
155 |
summary_df = pd.DataFrame(data_dict).T
|
156 |
-
labels_df = pd.read_csv(data_directory + labels_file, index_col=index_col)
|
157 |
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
|
161 |
|
162 |
def read_license_summary_data(aug_summary=False):
|
@@ -193,11 +218,11 @@ def fix_labels(license_data):
|
|
193 |
"limitations": 2
|
194 |
}
|
195 |
|
196 |
-
permissive_not_permissive_map = {
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
}
|
201 |
|
202 |
permissions_columns = [
|
203 |
"commercial-use",
|
@@ -226,14 +251,14 @@ def fix_labels(license_data):
|
|
226 |
"patent-use"
|
227 |
]
|
228 |
|
229 |
-
permissive_not_permissive_columns = [
|
230 |
-
|
231 |
-
]
|
232 |
|
233 |
license_data[permissions_columns] = license_data[permissions_columns].replace(permissions_map)
|
234 |
license_data[conditions_columns] = license_data[conditions_columns].replace(conditions_map)
|
235 |
license_data[limitations_columns] = license_data[limitations_columns].replace(limitations_map)
|
236 |
license_data[permissions_limitations_columns] = license_data[permissions_limitations_columns].replace(permissions_limitations_map)
|
237 |
-
license_data[permissive_not_permissive_columns] = license_data[permissive_not_permissive_columns].replace(permissive_not_permissive_map)
|
238 |
|
239 |
return license_data
|
|
|
129 |
return license_data
|
130 |
|
131 |
|
132 |
+
def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
|
133 |
"""
|
134 |
Reads data from Text and Summary File and stores it as a dictionary of
|
135 |
dictionaries.
|
|
|
142 |
corresponding summaries and license texts respectively.
|
143 |
|
144 |
"""
|
145 |
+
|
146 |
files = glob.glob(gold_licenses_data + "*")
|
147 |
+
if not files:
|
148 |
+
files = glob.glob(f"../{gold_licenses_data}" + "*")
|
149 |
+
if not files:
|
150 |
+
print("Gold licenses not found, please check the path again!")
|
151 |
+
return None
|
152 |
+
|
153 |
data_dict = defaultdict(dict)
|
154 |
+
|
155 |
for file_path in files:
|
156 |
+
if "\\" in file_path:
|
157 |
+
split_by = "\\"
|
158 |
+
else:
|
159 |
+
split_by = "/"
|
160 |
+
|
161 |
if file_path.endswith(".summary"):
|
162 |
+
file_name = file_path.split(split_by)[-1][:-8]
|
163 |
data_dict[file_name]["summary"] = read_file(file_path)
|
164 |
elif file_path.endswith(".txt"):
|
165 |
+
file_name = file_path.split(split_by)[-1][:-4]
|
166 |
data_dict[file_name]["text"] = clean_data(read_file(file_path))
|
167 |
|
168 |
summary_df = pd.DataFrame(data_dict).T
|
|
|
169 |
|
170 |
+
try:
|
171 |
+
labels_df = pd.read_csv(data_directory + labels_file, index_col=index_col)
|
172 |
+
except:
|
173 |
+
try:
|
174 |
+
labels_df = pd.read_csv(f"../{data_directory}" + labels_file, index_col=index_col)
|
175 |
+
except:
|
176 |
+
print("Labels file not found, please check the path again!")
|
177 |
+
return None
|
178 |
+
|
179 |
+
merged_data = labels_df.join(summary_df).drop(columns=["spdx_id"])
|
180 |
+
|
181 |
+
if drop_summary:
|
182 |
+
merged_data = merged_data.drop(columns=["summary"])
|
183 |
+
|
184 |
+
return merged_data
|
185 |
|
186 |
|
187 |
def read_license_summary_data(aug_summary=False):
|
|
|
218 |
"limitations": 2
|
219 |
}
|
220 |
|
221 |
+
# permissive_not_permissive_map = {
|
222 |
+
# np.nan: 0,
|
223 |
+
# "permissive": 1,
|
224 |
+
# "not_permissive": 2
|
225 |
+
# }
|
226 |
|
227 |
permissions_columns = [
|
228 |
"commercial-use",
|
|
|
251 |
"patent-use"
|
252 |
]
|
253 |
|
254 |
+
# permissive_not_permissive_columns = [
|
255 |
+
# "GTLC_Permissive"
|
256 |
+
# ]
|
257 |
|
258 |
license_data[permissions_columns] = license_data[permissions_columns].replace(permissions_map)
|
259 |
license_data[conditions_columns] = license_data[conditions_columns].replace(conditions_map)
|
260 |
license_data[limitations_columns] = license_data[limitations_columns].replace(limitations_map)
|
261 |
license_data[permissions_limitations_columns] = license_data[permissions_limitations_columns].replace(permissions_limitations_map)
|
262 |
+
# license_data[permissive_not_permissive_columns] = license_data[permissive_not_permissive_columns].replace(permissive_not_permissive_map)
|
263 |
|
264 |
return license_data
|
src/textrank.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import numpy as np
|
3 |
+
import gensim
|
4 |
+
import spacy
|
5 |
+
import math
|
6 |
+
from collections import Counter
|
7 |
+
|
8 |
+
from src.clean import clean_license_text
|
9 |
+
from src.read_data import read_file
|
10 |
+
|
11 |
+
nltk.download('punkt')
|
12 |
+
|
13 |
+
# License properties mapped to the lemmas that signal them in a sentence.
properties_dict = {
    "modify": ["modify", "modification", "change"],
    "distribute": ["distribute", "distribution"],
    "copy": ["copy"],
    "copyright": ["copyright"],
    # "exception"
}

# Score weight associated with each property key of ``properties_dict``.
properties_scores = {
    "modify": 0.8,
    "distribute": 0.8,
    "copy": 0.8,
    "copyright": 0.9,
}
|
27 |
+
|
28 |
+
nlp = spacy.load('en_core_web_sm')
|
29 |
+
|
30 |
+
def lemmatize_tokens(sent):
    """Return the lower-cased lemma of every token in ``sent``.

    Tokenization and lemmatization come from the module-level ``nlp``
    pipeline.
    """
    parsed = nlp(sent)
    lemmas = [tok.lemma_.lower() for tok in parsed]
    return lemmas
|
34 |
+
|
35 |
+
|
36 |
+
def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
    """Build an extractive summary by scoring sentences on license keywords.

    The text is cleaned with ``clean_license_text`` and split on ``"."``.
    Each fragment with at least ``min_sent_len`` whitespace-separated words
    is lemmatized once. For every property in ``properties_dict``, each of
    its keywords found among the lemmas adds that property's weight from
    ``properties_scores``. The total is divided by the number of lemmas, and
    the highest-scoring fragments are joined with ``".\\n"``.

    Args:
        license_text: Raw license text.
        min_sent_len: Minimum word count for a fragment to be scored; shorter
            fragments are skipped.
        summary_len: Fraction of the raw text's ``"."``-separated fragments
            to keep, rounded up.
        debug: When true, print intermediate tokens and scores.

    Returns:
        A ``(summary, definitions)`` tuple; ``("", "")`` for empty input.
    """
    # Empty input has nothing to summarize; return the usual tuple shape.
    if not license_text:
        return "", ""

    max_sentences = math.ceil(summary_len * len(license_text.split('.')))
    cleaned_license_text, definitions = clean_license_text(license_text)

    sent_scores = {}
    for sentence in cleaned_license_text.split('.'):
        words = sentence.split()
        if debug:
            print(words)
        # Skip short fragments (e.g. a section number) instead of stopping:
        # a `break` here would discard every sentence that follows.
        if len(words) < min_sent_len:
            continue

        # Lemmatize once per sentence rather than once per property.
        lemmatized_tokens = lemmatize_tokens(sentence)
        if not lemmatized_tokens:
            continue
        lemma_set = set(lemmatized_tokens)

        score = 0
        for prop, prop_words in properties_dict.items():
            # Weights are keyed by property, not by keyword.
            prop_score = sum(
                properties_scores[prop]
                for prop_word in prop_words
                if prop_word in lemma_set
            )
            if debug:
                print(prop, "=", prop_score)
            score += prop_score

        sent_scores[sentence] = score / len(lemmatized_tokens)
        if debug:
            print(f'Sentence score: {sent_scores[sentence]}')
            print()

    if debug:
        print(sent_scores)

    # The sort is stable, so equal scores keep their order of appearance.
    ranked_sentences = sorted(sent_scores, key=sent_scores.get, reverse=True)
    summary = '.\n'.join(ranked_sentences[:max_sentences])
    return summary, definitions
|
68 |
+
|
69 |
+
|