Pietro Lesci committed on
Commit 8744085 · 0 Parent(s)

first commit
.gitignore ADDED
@@ -0,0 +1,108 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ .github
+
+ # C extensions
+ *.so
+ .vscode
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # dotenv
+ .env
+
+ # virtualenv
+ .venv
+ venv/
+ ENV/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .idea
+
+ # mac
+ .DS_Store
+ data/data.csv
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
+ [server]
+ # Max size, in megabytes, for files uploaded with the file_uploader.
+ # Default: 200
+ maxUploadSize = 10
CODEOWNERS ADDED
@@ -0,0 +1 @@
+ * @pietrolesci
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ ###############################################################################
+ # main
+ ###############################################################################
+
+ FROM continuumio/miniconda3:4.8.2 AS main
+
+ RUN apt-get -y update && \
+ apt-get -y install build-essential
+ RUN conda update -n base -c defaults conda
+
+ # chown changes owner from root owner (1000) to the first user inside the env (100)
+ # COPY --chown=1000:100 requirements.txt /opt/requirements.txt
+ # RUN conda install --force-reinstall -y -q --name base -c conda-forge --file /opt/requirements.txt
+ RUN conda install --force-reinstall -y -q --name base pip
+
+ COPY . /var/app/
+ # WORKDIR /var/dev
+ WORKDIR /var/app
+ RUN pip install -r requirements.txt
+ CMD streamlit run ./app.py
+
+ ###############################################################################
+ # test
+ ###############################################################################
+
+ FROM main AS test
+ COPY . /var/dev/
+ WORKDIR /var/dev
+ # add unit test instruction here: RUN xxxxxx
+ # add integration test instruction here: RUN xxxxx
LICENSE ADDED
@@ -0,0 +1 @@
+ TODO: placeholder
Makefile ADDED
@@ -0,0 +1,42 @@
+ .PHONY: help build dev integration-test push
+ .DEFAULT_GOAL := help
+
+ # Docker image build info
+ PROJECT:=wordify
+ BUILD_TAG?=0.0.1
+
+ ALL_IMAGES:=src
+
+ help:
+ # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+ @echo "python starter project"
+ @echo "====================="
+ @echo "Replace % with a directory name (e.g., make build/python-example)"
+ @echo
+ @grep -E '^[a-zA-Z0-9_%/-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+ ########################################################
+ ## Local development
+ ########################################################
+
+ dev: ARGS?=/bin/bash
+ dev: DARGS?=-v "${CURDIR}":/var/dev
+ dev: ## run a foreground container
+ docker run -it --rm $(DARGS) $(PROJECT) $(ARGS)
+
+
+ notebook: ARGS?=jupyter lab
+ notebook: DARGS?=-v "${CURDIR}":/var/dev -p 8888:8888 ## the notebook runs on http://0.0.0.0:8888 by default; map a different host port (e.g. 8899:8888) if 8888 is already in use
+ notebook: ## run a foreground container
+ docker run -it --rm $(DARGS) $(PROJECT) $(ARGS) \
+ --ip=0.0.0.0 \
+ --allow-root \
+ --NotebookApp.token="" \
+ --NotebookApp.password=""
+
+ build: DARGS?=
+ build: ## build the latest image for a project
+ docker build $(DARGS) --build-arg BUILD_TAG=${BUILD_TAG} --rm --force-rm -t $(PROJECT):${BUILD_TAG} .
+
+ run:
+ docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 8501:8501 $(PROJECT):${BUILD_TAG}
app.py ADDED
@@ -0,0 +1,78 @@
+ import streamlit as st
+ from src.utils import get_logo
+ from src import session_state
+ from src.pages import (
+     home,
+     faq,
+     about,
+ )
+ from src.configs import SupportedFiles
+
+ # app configs
+ st.set_page_config(
+     page_title="Wordify",
+     layout="wide",
+     page_icon="./assets/logo.png",
+ )
+
+ # session state
+ session = session_state.get(process=False, run_id=0, posdf=None, negdf=None)
+
+
+ # ==== SIDEBAR ==== #
+ # LOGO
+ client_logo = get_logo("./assets/logo.png")
+ with st.sidebar.beta_container():
+     st.image(client_logo)
+
+ # NAVIGATION
+ PAGES = {
+     "Home": home,
+     "FAQ": faq,
+     "About": about,
+ }
+
+ with st.sidebar.beta_container():
+     st.sidebar.header("Navigation")
+     selection = st.sidebar.radio("Go to", list(PAGES.keys()))
+
+ page = PAGES[selection]
+
+ # FILE UPLOADER
+ with st.sidebar.beta_container():
+     st.markdown("")
+     st.markdown("")
+     st.header("Upload file")
+     uploaded_file = st.sidebar.file_uploader("Select file", type=[i.name for i in SupportedFiles])
+
+
+ # FOOTER
+ with st.sidebar.beta_container():
+     st.markdown("")
+     st.markdown("")
+     st.markdown(
+         """
+         <span style="font-size: 0.75em">Built with &hearts; by [`Pietro Lesci`](https://pietrolesci.github.io/) and [`MilaNLP`](https://twitter.com/MilaNLProc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor)</span>
+         """,
+         unsafe_allow_html=True,
+     )
+
+
+ # ==== MAIN ==== #
+ with st.beta_container():
+     st.title("Wordify")
+     st.markdown(
+         """
+         Wordify makes it easy to identify words that discriminate categories in textual data.
+
+         Let's explain Wordify with an example. Imagine you are thinking about having a glass
+         of wine :wine_glass: with your friends :man-man-girl-girl: and you have to buy a bottle.
+         You know you like `bold`, `woody` wine but are unsure which one to choose.
+         You wonder whether there are some words that describe each type of wine.
+         Since you are a researcher :female-scientist: :male-scientist:, you decide to approach
+         the problem scientifically :microscope:. That's where Wordify comes to the rescue!
+         """
+     )
+
+
+ page.write(session, uploaded_file)
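Note on the navigation pattern above: each entry in the PAGES dict maps a sidebar label to a module that exposes a write function, which app.py calls with the shared session state and the uploaded file (about.py and faq.py simply accept *args). A minimal sketch of what an additional page module could look like under this convention; the module name and content are hypothetical and not part of this commit:

import streamlit as st


def write(session, uploaded_file):
    # every page receives the shared session state and the uploaded file,
    # even when it does not need them
    st.header("Contact")
    st.markdown("Drop us a line!")

Registering such a page would only require importing it in app.py and adding an entry like "Contact": contact to PAGES.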
assets/logo.png ADDED
assets/page_icon.png ADDED
data/test_de.xlsx ADDED
Binary file (645 kB).
data/test_en.csv ADDED
The diff for this file is too large to render.
data/test_es.xlsx ADDED
Binary file (771 kB).
data/test_fe.xlsx ADDED
Binary file (754 kB).
data/test_it.xlsx ADDED
Binary file (662 kB).
notebooks/wordifier_nb.ipynb ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "metadata": {
3
+ "language_info": {
4
+ "codemirror_mode": {
5
+ "name": "ipython",
6
+ "version": 3
7
+ },
8
+ "file_extension": ".py",
9
+ "mimetype": "text/x-python",
10
+ "name": "python",
11
+ "nbconvert_exporter": "python",
12
+ "pygments_lexer": "ipython3",
13
+ "version": "3.8.3"
14
+ },
15
+ "orig_nbformat": 2,
16
+ "kernelspec": {
17
+ "name": "python383jvsc74a57bd01cb9a1c850fd1d16c5b98054247a74b7b7a12849bcfa00436ba202c2a9e2bbb2",
18
+ "display_name": "Python 3.8.3 64-bit ('py38': conda)"
19
+ }
20
+ },
21
+ "nbformat": 4,
22
+ "nbformat_minor": 2,
23
+ "cells": [
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 1,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import os\n",
+ "import sys\n",
31
+ "nb_dir = os.path.split(os.getcwd())[0]\n",
32
+ "if nb_dir not in sys.path:\n",
33
+ " sys.path.append(nb_dir)\n",
34
+ "\n",
35
+ "import numpy as np\n",
36
+ "import pandas as pd\n",
37
+ "# import modin.pandas as mpd\n",
38
+ "import spacy\n",
39
+ "from src.configs import ModelConfigs, Languages\n",
40
+ "from src.utils import wordifier, TextPreprocessor, encode\n",
41
+ "\n",
42
+ "from textacy.preprocessing import make_pipeline, remove, replace, normalize\n",
43
+ "from tqdm import trange\n",
44
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
45
+ "from sklearn.linear_model import LogisticRegression\n",
46
+ "from sklearn.preprocessing import LabelEncoder\n",
47
+ "from sklearn.utils import resample\n",
48
+ "import multiprocessing as mp\n",
49
+ "# import dask.dataframe as dask_df\n",
50
+ "from stqdm import stqdm\n",
51
+ "stqdm.pandas()\n",
52
+ "\n",
53
+ "from tqdm import trange\n",
54
+ "\n",
55
+ "import os\n",
56
+ "# os.environ[\"MODIN_ENGINE\"] = \"ray\" # Modin will use Ray\n",
57
+ "\n",
58
+ "import vaex\n",
59
+ "pd.set_option(\"display.max_colwidth\", None)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 2,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "df = pd.read_excel(\"../data/test_de.xlsx\")\n",
69
+ "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n",
70
+ "language = \"English\"\n",
71
+ "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 3,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "prep = TextPreprocessor(\n",
81
+ " language=\"English\", \n",
82
+ " cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),\n",
83
+ " lemmatizer_when=None,\n",
84
+ ")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 4,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "output_type": "stream",
94
+ "name": "stderr",
95
+ "text": [
96
+ "2021-05-10 14:30:04.984 WARNING root: \n",
97
+ " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
98
+ " command:\n",
99
+ "\n",
100
+ " streamlit run /Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]\n",
101
+ "100%|██████████| 6269/6269 [00:02<00:00, 2793.61it/s]\n"
102
+ ]
103
+ }
104
+ ],
105
+ "source": [
106
+ "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 6,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "X, y, X_names, y_names = encode(df[\"p_text\"], df[\"label\"]).values()"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 11,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "clf = LogisticRegression(\n",
125
+ " penalty=\"l1\",\n",
126
+ " C=0.05,#ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
127
+ " solver=\"saga\",\n",
128
+ " multi_class=\"auto\",\n",
129
+ " max_iter=500,\n",
130
+ " class_weight=\"balanced\",\n",
131
+ ")"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 12,
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "output_type": "stream",
141
+ "name": "stdout",
142
+ "text": [
143
+ "CPU times: user 1min 23s, sys: 138 ms, total: 1min 23s\n",
144
+ "Wall time: 1min 24s\n",
145
+ "/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:329: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
146
+ " warnings.warn(\"The max_iter was reached which means \"\n"
147
+ ]
148
+ },
149
+ {
150
+ "output_type": "execute_result",
151
+ "data": {
152
+ "text/plain": [
153
+ "LogisticRegression(C=0.05, class_weight='balanced', max_iter=500, penalty='l1',\n",
154
+ " solver='saga')"
155
+ ]
156
+ },
157
+ "metadata": {},
158
+ "execution_count": 12
159
+ }
160
+ ],
161
+ "source": [
162
+ "%%time\n",
163
+ "clf.fit(X, y)"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 14,
169
+ "metadata": {},
170
+ "outputs": [
171
+ {
172
+ "output_type": "stream",
173
+ "name": "stderr",
174
+ "text": [
175
+ " 6%|▌ | 28/500 [01:01<27:33, 3.50s/it]/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
176
+ " warnings.warn(\"Liblinear failed to converge, increase \"\n",
177
+ " 31%|███ | 156/500 [06:18<13:54, 2.43s/it]\n"
178
+ ]
179
+ },
180
+ {
181
+ "output_type": "error",
182
+ "ename": "KeyboardInterrupt",
183
+ "evalue": "",
184
+ "traceback": [
185
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
186
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
187
+ "\u001b[0;32m<ipython-input-14-1fef5b7ccf45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
188
+ "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1354\u001b[0m \u001b[0;34m\" 'solver' is set to 'liblinear'. Got 'n_jobs'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1355\u001b[0m \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n\u001b[0;32m-> 1356\u001b[0;31m self.coef_, self.intercept_, n_iter_ = _fit_liblinear(\n\u001b[0m\u001b[1;32m 1357\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_intercept\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintercept_scaling\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
189
+ "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36m_fit_liblinear\u001b[0;34m(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0msolver_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_liblinear_solver_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmulti_class\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdual\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m raw_coef_, n_iter_ = liblinear.train_wrap(\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_ind\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misspmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msolver_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0mclass_weight_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'i'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
190
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
191
+ ]
192
+ }
193
+ ],
194
+ "source": [
195
+ "n_instances, n_features = X.shape\n",
196
+ "n_classes = len(y_names)\n",
197
+ "\n",
198
+ "# NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
199
+ "sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
200
+ "\n",
201
+ "sample_size = min(\n",
202
+ " # this is the maximum supported\n",
203
+ " ModelConfigs.MAX_SELECTION.value,\n",
204
+ " # at minimum you want MIN_SELECTION but in general you want\n",
205
+ " # n_instances * sample_fraction\n",
206
+ " max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
207
+ "    # however if the previous one is bigger than the available instances, take\n",
208
+ " # the number of available instances\n",
209
+ " n_instances,\n",
210
+ ")\n",
211
+ "\n",
212
+ "# TODO: might want to try out something to subsample features at each iteration\n",
213
+ "\n",
214
+ "# initialize coefficient matrices\n",
215
+ "pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
216
+ "neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
217
+ "\n",
218
+ "for _ in trange(ModelConfigs.NUM_ITERS.value):\n",
219
+ "\n",
220
+ " # run randomized regression\n",
221
+ " clf = LogisticRegression(\n",
222
+ " penalty=\"l1\",\n",
223
+ " C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
224
+ " solver=\"liblinear\",\n",
225
+ " multi_class=\"auto\",\n",
226
+ " max_iter=500,\n",
227
+ " class_weight=\"balanced\",\n",
228
+ " )\n",
229
+ "\n",
230
+ " # sample indices to subsample matrix\n",
231
+ " selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)\n",
232
+ "\n",
233
+ " # fit\n",
234
+ " try:\n",
235
+ " clf.fit(X[selection], y[selection])\n",
236
+ " except ValueError:\n",
237
+ " continue\n",
238
+ "\n",
239
+ " # record coefficients\n",
240
+ " if n_classes == 2:\n",
241
+ " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
242
+ " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
243
+ " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
244
+ " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
245
+ " else:\n",
246
+ " pos_scores += clf.coef_ > 0\n",
247
+ " neg_scores += clf.coef_ < 0"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "# normalize\n",
257
+ "pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value\n",
258
+ "neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value\n",
259
+ "\n",
260
+ "# get only active features\n",
261
+ "pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)\n",
262
+ "neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)\n",
263
+ "\n",
264
+ "# prepare DataFrame\n",
265
+ "pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]\n",
266
+ "neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]\n",
267
+ "\n",
268
+ "posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)\n",
269
+ "negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": null,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": []
278
+ }
279
+ ]
280
+ }
pytest.ini ADDED
@@ -0,0 +1,4 @@
+ [pytest]
+ markers =
+     cache_tests: mark a test which is about the recurrence computer cache
+     seed_tests: mark a test which is about the seed sequence
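The two markers registered above keep pytest from warning about unknown markers and allow selecting subsets of tests from the command line. An illustrative sketch of how they would be used; the test module and assertions are hypothetical and not part of this commit:

import pytest


@pytest.mark.cache_tests
def test_cache_is_reused():
    assert True  # placeholder


@pytest.mark.seed_tests
def test_seed_sequence_is_deterministic():
    assert True  # placeholder

Running only one group would then be, e.g., `pytest -m cache_tests`.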
requirements.txt ADDED
@@ -0,0 +1,53 @@
+ pytest==5.4.2
+ pytest-cov==2.8.1
+ sphinx==3.0.4
+ black==19.10b0
+ jupyterlab==2.1.4
+ pandas==1.2.4
+ jupytext==1.5.1
+ nbval==0.9.6
+ textacy==0.11.0
+ streamlit==0.81.1
+ spacy==3.0.6
+ numpy==1.20.2
+ scikit-learn==0.24.2
+ xlrd==2.0.1
+ openpyxl==3.0.7
+ stqdm==0.0.3
+ watchdog==2.1.0
+ flake8
+
+ # english
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm
+ # italian
+ https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.0.0/it_core_news_sm-3.0.0.tar.gz#egg=it_core_news_sm
+ # german
+ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm
+ # spanish
+ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.0.0/es_core_news_sm-3.0.0.tar.gz#egg=es_core_news_sm
+ # greek
+ https://github.com/explosion/spacy-models/releases/download/el_core_news_sm-3.0.0/el_core_news_sm-3.0.0.tar.gz#egg=el_core_news_sm
+ # dutch
+ https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.0.0/nl_core_news_sm-3.0.0.tar.gz#egg=nl_core_news_sm
+ # portuguese
+ https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.0.0/pt_core_news_sm-3.0.0.tar.gz#egg=pt_core_news_sm
+ # french
+ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.0.0/fr_core_news_sm-3.0.0.tar.gz#egg=fr_core_news_sm
+ # chinese
+ # https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0.tar.gz#egg=zh_core_web_sm
+ # danish
+ https://github.com/explosion/spacy-models/releases/download/da_core_news_sm-3.0.0/da_core_news_sm-3.0.0.tar.gz#egg=da_core_news_sm
+ # japanese
+ # https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.0.0/ja_core_news_sm-3.0.0.tar.gz#egg=ja_core_news_sm
+ # lithuanian
+ https://github.com/explosion/spacy-models/releases/download/lt_core_news_sm-3.0.0/lt_core_news_sm-3.0.0.tar.gz#egg=lt_core_news_sm
+ # norwegian
+ https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.0.0/nb_core_news_sm-3.0.0.tar.gz#egg=nb_core_news_sm
+ # polish
+ https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.0.0/pl_core_news_sm-3.0.0.tar.gz#egg=pl_core_news_sm
+ # romanian
+ https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.0.0/ro_core_news_sm-3.0.0.tar.gz#egg=ro_core_news_sm
+ # russian
+ https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.0.0/ru_core_news_sm-3.0.0.tar.gz#egg=ru_core_news_sm
+ # multi-language
+ https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.0.0/xx_ent_wiki_sm-3.0.0.tar.gz#egg=xx_ent_wiki_sm
src/__init__.py ADDED
File without changes
src/configs.py ADDED
@@ -0,0 +1,36 @@
+ from enum import Enum
+ import pandas as pd
+
+
+ class ModelConfigs(Enum):
+     NUM_ITERS = 500
+     SELECTION_THRESHOLD = 0.0
+     PENALTIES = [10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001]
+     MAX_SELECTION = 100_000
+     MIN_SELECTION = 10_000
+
+
+ class Languages(Enum):
+     English = "en_core_web_sm"
+     Italian = "it_core_news_sm"
+     German = "de_core_news_sm"
+     Spanish = "es_core_news_sm"
+     Greek = "el_core_news_sm"
+     Dutch = "nl_core_news_sm"
+     Portuguese = "pt_core_news_sm"
+     French = "fr_core_news_sm"
+     Chinese = "zh_core_web_sm"
+     Danish = "da_core_news_sm"
+     Japanese = "ja_core_news_sm"
+     Lithuanian = "lt_core_news_sm"
+     Norwegian = "nb_core_news_sm"
+     Polish = "pl_core_news_sm"
+     Romanian = "ro_core_news_sm"
+     Russian = "ru_core_news_sm"
+     MultiLanguage = "xx_ent_wiki_sm"
+
+
+ class SupportedFiles(Enum):
+     xlsx = (pd.read_excel,)
+     csv = (pd.read_csv,)
+     parquet = (pd.read_parquet,)
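A note on SupportedFiles: the pandas readers are wrapped in one-element tuples because a bare function assigned inside an Enum body is treated as a method rather than as a member; the tuple keeps it as a plain value. A small sketch of the intended dispatch, mirroring read_file in src/utils.py (the file name is hypothetical):

from src.configs import SupportedFiles

file_name = "reviews.csv"  # hypothetical upload
ext = file_name.split(".")[-1]
read_f = SupportedFiles[ext].value[0]  # unwrap the one-element tuple -> pd.read_csv
df = read_f(file_name, dtype=str)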
src/pages/about.py ADDED
@@ -0,0 +1,34 @@
+ import streamlit as st
+
+
+ def write(*args):
+     # ==== Contacts ==== #
+     with st.beta_container():
+         st.markdown("")
+         st.markdown("")
+         st.header(":rocket: About us")
+
+         st.markdown(
+             """
+             You can reach out to us via email, phone, or - if you are old-fashioned - via mail
+             """
+         )
+         with st.beta_expander("Contacts"):
+
+             _, col2 = st.beta_columns([0.5, 3])
+             col2.markdown(
+                 """
+                 :email: [email protected]
+
+                 :telephone_receiver: +39 02 5836 2604
+
+                 :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
+                 """
+             )
+
+             st.write(
+                 """
+                 <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
+                 """,
+                 unsafe_allow_html=True,
+             )
src/pages/faq.py ADDED
@@ -0,0 +1,112 @@
+ import streamlit as st
+ from src.configs import Languages
+
+
+ def write(*args):
+
+     # ==== HOW IT WORKS ==== #
+     with st.beta_container():
+         st.markdown("")
+         st.markdown("")
+         st.header("How it works")
+         st.subheader("Step 1 - Prepare your data")
+         st.markdown(
+             """
+             Create an Excel or CSV file with two columns for each row:
+
+             - a column with the name or the label identifying a specific object or class (e.g., in our
+             wine example above it would be the type of wine or the name of a specific brand). It is
+             common practice to name this column `label`
+
+             - a column with the text describing that specific object or class (e.g., in the wine example
+             above it could be the description that you find on the rear of the bottle label). It is
+             common practice to name this column `text`
+
+             To have reliable results, we suggest providing at least 2000 labelled texts. If you provide
+             fewer, we will still wordify your file, but the results should then be taken with a grain of
+             salt.
+
+             Consider that we also support multi-language texts, therefore you'll be able to
+             automatically discriminate between international wines, even if your preferred Italian
+             producer does not provide you with a description written in English!
+             """
+         )
+
+         st.subheader("Step 2 - Upload your file and Wordify!")
+         st.markdown(
+             """
+             Once you have prepared your Excel or CSV file, click the "Browse File" button.
+             Browse for your file.
+             Choose the language of your texts (select multi-language if your file contains text in
+             different languages).
+             Push the "Wordify!" button, sit back, and wait for Wordify to do its tricks.
+
+             Depending on the size of your data, the process can take from 1 minute to 5 minutes.
+             """
+         )
+
+     # ==== FAQ ==== #
+     with st.beta_container():
+         st.markdown("")
+         st.markdown("")
+         st.header(":question: Frequently Asked Questions")
+         with st.beta_expander("What is Wordify?"):
+             st.markdown(
+                 """
+                 Wordify is a way to find out which terms are most indicative for each of your dependent
+                 variable values.
+                 """
+             )
+
+         with st.beta_expander("What happens to my data?"):
+             st.markdown(
+                 """
+                 Nothing. We never store the data you upload on disk: it is only kept in memory for the
+                 duration of the modeling, and then deleted. We do not retain any copies or traces of
+                 your data.
+                 """
+             )
+
+         with st.beta_expander("What input formats do you support?"):
+             st.markdown(
+                 """
+                 The file you upload should be .xlsx or .csv, with two columns: the first should be labeled
+                 'text' and contain all your documents (e.g., tweets, reviews, patents, etc.), one per
+                 line. The second column should be labeled 'label', and contain the dependent variable
+                 label associated with each text (e.g., rating, author gender, company, etc.).
+                 """
+             )
+
+         with st.beta_expander("How does it work?"):
+             st.markdown(
+                 """
+                 It uses a variant of the Stability Selection algorithm
+                 [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
+                 to fit hundreds of logistic regression models on random subsets of the data, using
+                 different L1 penalties to drive as many of the term coefficients to 0. Any terms that
+                 receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
+                 indicators.
+                 """
+             )
+
+         with st.beta_expander("How much data do I need?"):
+             st.markdown(
+                 """
+                 We recommend at least 2000 instances, the more, the better. With fewer instances, the
+                 results are less replicable and reliable.
+                 """
+             )
+
+         with st.beta_expander("Is there a paper I can cite?"):
+             st.markdown(
+                 """
+                 Yes please! Reference coming soon...
+                 """
+             )
+
+         with st.beta_expander("What languages are supported?"):
+             st.markdown(
+                 f"""
+                 Currently we support: {", ".join([i.name for i in Languages])}.
+                 """
+             )
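The "How does it work?" answer above corresponds to the wordifier function in src/utils.py. A condensed, illustrative sketch of the stability-selection idea it describes (random L1 penalties, bootstrap subsamples, counting how often each term keeps a non-zero coefficient), assuming an already vectorized X and encoded y:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample


def stability_scores(X, y, penalties=(1.0, 0.5, 0.1, 0.05), n_iters=100):
    """Fraction of runs in which each feature receives a non-zero coefficient."""
    n_instances, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_iters):
        clf = LogisticRegression(
            penalty="l1",
            C=float(np.random.choice(penalties)),  # random L1 strength per run
            solver="liblinear",
            class_weight="balanced",
        )
        idx = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=n_instances)
        clf.fit(X[idx], y[idx])
        counts += (np.abs(clf.coef_) > 0).any(axis=0)  # was the term selected in this run?
    return counts / n_iters  # terms above ~0.3 are the "stable" indicators

The actual wordifier additionally keeps separate positive and negative counts per class, subsamples according to the heuristic in ModelConfigs, and normalizes by NUM_ITERS.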
src/pages/home.py ADDED
@@ -0,0 +1,170 @@
1
+ from src.configs import Languages
2
+ from src.utils import (
3
+ encode,
4
+ wordifier,
5
+ download_button,
6
+ TextPreprocessor,
7
+ plot_labels_prop,
8
+ plot_nchars,
9
+ plot_score,
10
+ get_logo,
11
+ read_file,
12
+ )
13
+ import streamlit as st
14
+
15
+
16
+ def write(session, uploaded_file):
17
+
18
+ if uploaded_file:
19
+
20
+ # 1. READ FILE
21
+ with st.spinner("Reading file"):
22
+ # TODO: write parser function that automatically understands format
23
+ data = read_file(uploaded_file)
24
+
25
+ # 2. CREATE UI TO SELECT COLUMNS
26
+ st.markdown("")
27
+ st.markdown("")
28
+ st.header("Process")
29
+
30
+ col1, col2, col3 = st.beta_columns(3)
31
+ with col1:
32
+ language = st.selectbox("Select language", [i.name for i in Languages])
33
+ with st.beta_expander("Description"):
34
+ st.markdown(
35
+ f"Select a language of text amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}"
36
+ )
37
+ with col2:
38
+ cols_options = [""] + data.columns.tolist()
39
+ label_column = st.selectbox("Select label column name", cols_options, index=0)
40
+ with st.beta_expander("Description"):
41
+ st.markdown("Select the column containing the label")
42
+
43
+ if label_column:
44
+ st.altair_chart(plot_labels_prop(data, label_column), use_container_width=True)
45
+
46
+ with col3:
47
+ text_column = st.selectbox("Select text column name", cols_options, index=0)
48
+ with st.beta_expander("Description"):
49
+ st.markdown("Select the column containing the text")
50
+
51
+ if text_column:
52
+ st.altair_chart(plot_nchars(data, text_column), use_container_width=True)
53
+
54
+ with st.beta_expander("Advanced options"):
55
+ # Lemmatization option
56
+ col1, col2 = st.beta_columns([1, 3])
57
+ with col1:
58
+ lemmatization_when_elem = st.empty()
59
+ with col2:
60
+ st.markdown("Choose lemmatization option")
61
+
62
+ # stopwords option
63
+ col1, col2 = st.beta_columns([1, 3])
64
+ with col1:
65
+ remove_stopwords_elem = st.empty()
66
+ with col2:
67
+ st.markdown("Choose stopword option")
68
+
69
+ # cleaning steps
70
+ col1, col2 = st.beta_columns([1, 3])
71
+ with col1:
72
+ cleaning_steps_elem = st.empty()
73
+ reset_button = st.empty()
74
+ with col2:
75
+ st.markdown("Choose cleaning steps")
76
+
77
+ # implement reset logic
78
+ if reset_button.button("Reset steps"):
79
+ session.run_id += 1
80
+
81
+ steps_options = list(TextPreprocessor._cleaning_options().keys())
82
+ cleaning_steps = cleaning_steps_elem.multiselect(
83
+ "Select text processing steps (ordered)",
84
+ options=steps_options,
85
+ default=steps_options,
86
+ format_func=lambda x: x.replace("_", " ").title(),
87
+ key=session.run_id,
88
+ )
89
+ lemmatization_options = list(TextPreprocessor._lemmatization_options().keys())
90
+ lemmatization_when = lemmatization_when_elem.selectbox(
91
+ "Select when lemmatization happens",
92
+ options=lemmatization_options,
93
+ index=0,
94
+ key=session.run_id,
95
+ )
96
+ remove_stopwords = remove_stopwords_elem.checkbox("Remove stopwords", value=True, key=session.run_id)
97
+
98
+ # Show sample checkbox
99
+ col1, col2 = st.beta_columns([1, 2])
100
+ with col1:
101
+ show_sample = st.checkbox("Show sample of preprocessed text")
102
+
103
+ # initialize text preprocessor
104
+ preprocessor = TextPreprocessor(
105
+ language=language,
106
+ cleaning_steps=cleaning_steps,
107
+ lemmatizer_when=lemmatization_when,
108
+ remove_stop=remove_stopwords,
109
+ )
110
+
111
+ # 3. PROVIDE FEEDBACK ON OPTIONS
112
+ if show_sample and not (label_column and text_column):
113
+ st.warning("Please select `label` and `text` columns")
114
+
115
+ elif show_sample and (label_column and text_column):
116
+ sample_data = data.sample(10)
117
+ sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(sample_data[text_column]).values
118
+ st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])
119
+
120
+ # 4. RUN
121
+ run_button = st.button("Wordify!")
122
+ if run_button and not (label_column and text_column):
123
+ st.warning("Please select `label` and `text` columns")
124
+
125
+ elif run_button and (label_column and text_column) and not session.process:
126
+ # data = data.head()
127
+ data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(data[text_column]).values
128
+
129
+ inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
130
+ session.posdf, session.negdf = wordifier(**inputs)
131
+ st.success("Wordified!")
132
+
133
+ # session.posdf, session.negdf = process(data, text_column, label_column)
134
+ session.process = True
135
+
136
+ # 5. RESULTS
137
+ if session.process and (label_column and text_column):
138
+ st.markdown("")
139
+ st.markdown("")
140
+ st.header("Results")
141
+
142
+ # col1, col2, _ = st.beta_columns(3)
143
+ col1, col2, col3 = st.beta_columns([2, 3, 3])
144
+
145
+ with col1:
146
+ label = st.selectbox("Select label", data[label_column].unique().tolist())
147
+ # # with col2:
148
+ # thres = st.slider(
149
+ # "Select threshold",
150
+ # min_value=0,
151
+ # max_value=100,
152
+ # step=1,
153
+ # format="%f",
154
+ # value=30,
155
+ # )
156
+ show_plots = st.checkbox("Show plots of top 100")
157
+
158
+ with col2:
159
+ st.subheader(f"Words __positively__ identifying label `{label}`")
160
+ st.write(session.posdf[session.posdf[label_column] == label].sort_values("score", ascending=False))
161
+ download_button(session.posdf, "positive_data")
162
+ if show_plots:
163
+ st.altair_chart(plot_score(session.posdf, label_column, label), use_container_width=True)
164
+
165
+ with col3:
166
+ st.subheader(f"Words __negatively__ identifying label `{label}`")
167
+ st.write(session.negdf[session.negdf[label_column] == label].sort_values("score", ascending=False))
168
+ download_button(session.negdf, "negative_data")
169
+ if show_plots:
170
+ st.altair_chart(plot_score(session.negdf, label_column, label), use_container_width=True)
src/session_state.py ADDED
@@ -0,0 +1,117 @@
1
+ """Hack to add per-session state to Streamlit.
2
+
3
+ Usage
4
+ -----
5
+
6
+ >>> import SessionState
7
+ >>>
8
+ >>> session_state = SessionState.get(user_name='', favorite_color='black')
9
+ >>> session_state.user_name
10
+ ''
11
+ >>> session_state.user_name = 'Mary'
12
+ >>> session_state.favorite_color
13
+ 'black'
14
+
15
+ Since you set user_name above, next time your script runs this will be the
16
+ result:
17
+ >>> session_state = get(user_name='', favorite_color='black')
18
+ >>> session_state.user_name
19
+ 'Mary'
20
+
21
+ """
22
+ try:
23
+ import streamlit.ReportThread as ReportThread
24
+ from streamlit.server.Server import Server
25
+ except Exception:
26
+ # Streamlit >= 0.65.0
27
+ import streamlit.report_thread as ReportThread
28
+ from streamlit.server.server import Server
29
+
30
+
31
+ class SessionState(object):
32
+ def __init__(self, **kwargs):
33
+ """A new SessionState object.
34
+
35
+ Parameters
36
+ ----------
37
+ **kwargs : any
38
+ Default values for the session state.
39
+
40
+ Example
41
+ -------
42
+ >>> session_state = SessionState(user_name='', favorite_color='black')
43
+ >>> session_state.user_name
+ ''
+ >>> session_state.user_name = 'Mary'
45
+ >>> session_state.favorite_color
46
+ 'black'
47
+
48
+ """
49
+ for key, val in kwargs.items():
50
+ setattr(self, key, val)
51
+
52
+
53
+ def get(**kwargs):
54
+ """Gets a SessionState object for the current session.
55
+
56
+ Creates a new object if necessary.
57
+
58
+ Parameters
59
+ ----------
60
+ **kwargs : any
61
+ Default values you want to add to the session state, if we're creating a
62
+ new one.
63
+
64
+ Example
65
+ -------
66
+ >>> session_state = get(user_name='', favorite_color='black')
67
+ >>> session_state.user_name
68
+ ''
69
+ >>> session_state.user_name = 'Mary'
70
+ >>> session_state.favorite_color
71
+ 'black'
72
+
73
+ Since you set user_name above, next time your script runs this will be the
74
+ result:
75
+ >>> session_state = get(user_name='', favorite_color='black')
76
+ >>> session_state.user_name
77
+ 'Mary'
78
+
79
+ """
80
+ # Hack to get the session object from Streamlit.
81
+
82
+ ctx = ReportThread.get_report_ctx()
83
+
84
+ this_session = None
85
+
86
+ current_server = Server.get_current()
87
+ if hasattr(current_server, "_session_infos"):
88
+ # Streamlit < 0.56
89
+ session_infos = Server.get_current()._session_infos.values()
90
+ else:
91
+ session_infos = Server.get_current()._session_info_by_id.values()
92
+
93
+ for session_info in session_infos:
94
+ s = session_info.session
95
+ if (
96
+ # Streamlit < 0.54.0
97
+ (hasattr(s, "_main_dg") and s._main_dg == ctx.main_dg)
98
+ or
99
+ # Streamlit >= 0.54.0
100
+ (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
101
+ or
102
+ # Streamlit >= 0.65.2
103
+ (not hasattr(s, "_main_dg") and s._uploaded_file_mgr == ctx.uploaded_file_mgr)
104
+ ):
105
+ this_session = s
106
+
107
+ if this_session is None:
108
+ raise RuntimeError(
109
+ "Oh noes. Couldn't get your Streamlit Session object. " "Are you doing something fancy with threads?"
110
+ )
111
+
112
+ # Got the session object! Now let's attach some state into it.
113
+
114
+ if not hasattr(this_session, "_custom_session_state"):
115
+ this_session._custom_session_state = SessionState(**kwargs)
116
+
117
+ return this_session._custom_session_state
src/utils.py ADDED
@@ -0,0 +1,335 @@
1
+ import base64
2
+ import re
3
+ from collections import OrderedDict
4
+ from typing import Callable, Dict, List
5
+
6
+ import altair as alt
7
+ import numpy as np
8
+ import pandas as pd
9
+ import spacy
10
+ import streamlit as st
11
+ from pandas.core.series import Series
12
+ from PIL import Image
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from sklearn.linear_model import LogisticRegression
15
+ from sklearn.preprocessing import LabelEncoder
16
+ from sklearn.utils import resample
17
+ from stqdm import stqdm
18
+ from textacy.preprocessing import make_pipeline, normalize, remove, replace
19
+
20
+ from .configs import Languages, ModelConfigs, SupportedFiles
21
+
22
+ stqdm.pandas()
23
+
24
+
25
+ @st.cache
26
+ def get_logo(path):
27
+ return Image.open(path)
28
+
29
+
30
+ # @st.cache(suppress_st_warning=True)
31
+ def read_file(uploaded_file) -> pd.DataFrame:
32
+
33
+ file_type = uploaded_file.name.split(".")[-1]
34
+ if file_type in set(i.name for i in SupportedFiles):
35
+ read_f = SupportedFiles[file_type].value[0]
36
+ return read_f(uploaded_file, dtype=str)
37
+
38
+ else:
39
+ st.error("File type not supported")
40
+
41
+
42
+ def download_button(dataframe: pd.DataFrame, name: str):
43
+ csv = dataframe.to_csv(index=False)
44
+ # some strings <-> bytes conversions necessary here
45
+ b64 = base64.b64encode(csv.encode()).decode()
46
+ href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
47
+ st.write(href, unsafe_allow_html=True)
48
+
49
+
50
+ def encode(text: pd.Series, labels: pd.Series):
51
+ tfidf_vectorizer = TfidfVectorizer(
52
+ input="content", # default: file already in memory
53
+ encoding="utf-8", # default
54
+ decode_error="strict", # default
55
+ strip_accents=None, # do nothing
56
+ lowercase=False, # do nothing
57
+ preprocessor=None, # do nothing - default
58
+ tokenizer=None, # default
59
+ stop_words=None, # do nothing
60
+ analyzer="word",
61
+ ngram_range=(1, 3), # maximum 3-ngrams
62
+ min_df=0.001,
63
+ max_df=0.75,
64
+ sublinear_tf=True,
65
+ )
66
+ label_encoder = LabelEncoder()
67
+
68
+ with st.spinner("Encoding text using TF-IDF and Encoding labels"):
69
+ X = tfidf_vectorizer.fit_transform(text.values)
70
+ y = label_encoder.fit_transform(labels.values)
71
+
72
+ return {
73
+ "X": X,
74
+ "y": y,
75
+ "X_names": np.array(tfidf_vectorizer.get_feature_names()),
76
+ "y_names": label_encoder.classes_,
77
+ }
78
+
79
+
80
+ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
81
+
82
+ n_instances, n_features = X.shape
83
+ n_classes = len(y_names)
84
+
85
+ # NOTE: the * 10 / 10 trick is to have "nice" round-ups
86
+ sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
87
+
88
+ sample_size = min(
89
+ # this is the maximum supported
90
+ configs.MAX_SELECTION.value,
91
+ # at minimum you want MIN_SELECTION but in general you want
92
+ # n_instances * sample_fraction
93
+ max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
94
+ # however if the previous one is bigger than the available instances, take
95
+ # the number of available instances
96
+ n_instances,
97
+ )
98
+
99
+ # TODO: might want to try out something to subsample features at each iteration
100
+
101
+ # initialize coefficient matrices
102
+ pos_scores = np.zeros((n_classes, n_features), dtype=int)
103
+ neg_scores = np.zeros((n_classes, n_features), dtype=int)
104
+
105
+ with st.spinner("Wordifying!"):
106
+
107
+ for _ in stqdm(range(configs.NUM_ITERS.value)):
108
+
109
+ # run randomized regression
110
+ clf = LogisticRegression(
111
+ penalty="l1",
112
+ C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
113
+ solver="liblinear",
114
+ multi_class="auto",
115
+ max_iter=500,
116
+ class_weight="balanced",
117
+ )
118
+
119
+ # sample indices to subsample matrix
120
+ selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
121
+
122
+ # fit
123
+ try:
124
+ clf.fit(X[selection], y[selection])
125
+ except ValueError:
126
+ continue
127
+
128
+ # record coefficients
129
+ if n_classes == 2:
130
+ pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
131
+ neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
132
+ pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
133
+ neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
134
+ else:
135
+ pos_scores += clf.coef_ > 0
136
+ neg_scores += clf.coef_ < 0
137
+
138
+ # normalize
139
+ pos_scores = pos_scores / configs.NUM_ITERS.value
140
+ neg_scores = neg_scores / configs.NUM_ITERS.value
141
+
142
+ # get only active features
143
+ pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
144
+ neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
145
+
146
+ # prepare DataFrame
147
+ pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
148
+ neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
149
+
150
+ posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
151
+ negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
152
+
153
+ return posdf, negdf
154
+
155
+
156
+ # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
157
+ # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
158
+ _re_space = re.compile(" {2,}")
159
+
160
+
161
+ def normalize_useless_spaces(t):
162
+ return _re_space.sub(" ", t)
163
+
164
+
165
+ _re_rep = re.compile(r"(\S)(\1{2,})")
166
+
167
+
168
+ def normalize_repeating_chars(t):
169
+ def _replace_rep(m):
170
+ c, cc = m.groups()
171
+ return c
172
+
173
+ return _re_rep.sub(_replace_rep, t)
174
+
175
+
176
+ _re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
177
+
178
+
179
+ def normalize_repeating_words(t):
180
+ def _replace_wrep(m):
181
+ c, cc, e = m.groups()
182
+ return c
183
+
184
+ return _re_wrep.sub(_replace_wrep, t)
185
+
186
+
187
+ class TextPreprocessor:
188
+ def __init__(
189
+ self, language: str, cleaning_steps: List[str], lemmatizer_when: str = "last", remove_stop: bool = True
190
+ ) -> None:
191
+ # prepare lemmatizer
192
+ self.language = language
193
+ self.nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])
194
+ self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
195
+ self.remove_stop = remove_stop
196
+ self._lemmatize = self._get_lemmatizer()
197
+
198
+ # prepare cleaning
199
+ self.cleaning_steps = [
200
+ self._cleaning_options()[step] for step in cleaning_steps if step in self._cleaning_options()
201
+ ]
202
+ self.cleaning_pipeline = make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
203
+
204
+ def _get_lemmatizer(self) -> Callable:
205
+ """Return the correct spacy Doc-level lemmatizer"""
206
+ if self.remove_stop:
207
+
208
+ def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
209
+ """Lemmatizes spacy Doc and removes stopwords"""
210
+ return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
211
+
212
+ else:
213
+
214
+ def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
215
+ """Lemmatizes spacy Doc"""
216
+ return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
217
+
218
+ return lemmatizer
219
+
220
+ @staticmethod
221
+ def _lemmatization_options() -> Dict[str, str]:
222
+ return {
223
+ "Before preprocessing": "first",
224
+ "After preprocessing": "last",
225
+ "Never! Let's do it quick and dirty": None,
226
+ }
227
+
228
+ def lemmatizer(self, series: pd.Series) -> pd.Series:
229
+ """
230
+ Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
231
+ """
232
+ res = []
233
+ pbar = stqdm(total=len(series))
234
+ for doc in self.nlp.pipe(series, batch_size=500):
235
+ res.append(self._lemmatize(doc))
236
+ pbar.update(1)
237
+ pbar.close()
238
+ return pd.Series(res)
239
+
240
+ @staticmethod
241
+ def _cleaning_options():
242
+ """Returns available cleaning steps in order"""
243
+ return OrderedDict(
244
+ [
245
+ ("lower", lambda x: x.lower()),
246
+ ("normalize_unicode", normalize.unicode),
247
+ ("normalize_bullet_points", normalize.bullet_points),
248
+ ("normalize_hyphenated_words", normalize.hyphenated_words),
249
+ ("normalize_quotation_marks", normalize.quotation_marks),
250
+ ("normalize_whitespace", normalize.whitespace),
251
+ ("remove_accents", remove.accents),
252
+ ("remove_brackets", remove.brackets),
253
+ ("remove_html_tags", remove.html_tags),
254
+ ("remove_punctuation", remove.punctuation),
255
+ ("replace_currency_symbols", replace.currency_symbols),
256
+ ("replace_emails", replace.emails),
257
+ ("replace_emojis", replace.emojis),
258
+ ("replace_hashtags", replace.hashtags),
259
+ ("replace_numbers", replace.numbers),
260
+ ("replace_phone_numbers", replace.phone_numbers),
261
+ ("replace_urls", replace.urls),
262
+ ("replace_user_handles", replace.user_handles),
263
+ ("normalize_useless_spaces", normalize_useless_spaces),
264
+ ("normalize_repeating_chars", normalize_repeating_chars),
265
+ ("normalize_repeating_words", normalize_repeating_words),
266
+ ("strip", lambda x: x.strip()),
267
+ ]
268
+ )
269
+
270
+ def fit_transform(self, series: pd.Series) -> Series:
271
+ """Applies text preprocessing"""
272
+
273
+ if self.lemmatizer_when == "first":
274
+ with st.spinner("Lemmatizing"):
275
+ series = self.lemmatizer(series)
276
+
277
+ with st.spinner("Cleaning"):
278
+ series = series.progress_map(self.cleaning_pipeline)
279
+
280
+ if self.lemmatizer_when == "last":
281
+ with st.spinner("Lemmatizing"):
282
+ series = self.lemmatizer(series)
283
+
284
+ return series
285
+
286
+
287
+ def plot_labels_prop(data: pd.DataFrame, label_column: str):
288
+
289
+ source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
290
+
291
+ source["Proportions"] = ((source["Counts"] / source["Counts"].sum()).round(3) * 100).map("{:,.2f}".format) + "%"
292
+
293
+ bars = (
294
+ alt.Chart(source)
295
+ .mark_bar()
296
+ .encode(
297
+ x="Labels:O",
298
+ y="Counts:Q",
299
+ )
300
+ )
301
+
302
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
303
+
304
+ return (bars + text).properties(height=300)
305
+
306
+
307
+ def plot_nchars(data: pd.DataFrame, text_column: str):
308
+ source = data[text_column].str.len().to_frame()
309
+
310
+ plot = (
311
+ alt.Chart(source)
312
+ .mark_bar()
313
+ .encode(
314
+ alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
315
+ alt.Y("count()", axis=alt.Axis(title="")),
316
+ )
317
+ )
318
+
319
+ return plot.properties(height=300)
320
+
321
+
322
+ def plot_score(data: pd.DataFrame, label_col: str, label: str):
323
+
324
+ source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
325
+
326
+ plot = (
327
+ alt.Chart(source)
328
+ .mark_bar()
329
+ .encode(
330
+ y=alt.Y("word:O", sort="-x"),
331
+ x="score:Q",
332
+ )
333
+ )
334
+
335
+ return plot.properties(height=max(30 * source.shape[0], 50))
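To make the subsample-size heuristic used in wordifier concrete, here is a small worked example with hypothetical dataset sizes; the bounds are the ones defined in ModelConfigs:

import numpy as np

MAX_SELECTION, MIN_SELECTION = 100_000, 10_000  # ModelConfigs values


def sample_size(n_instances, n_features):
    # the * 10 / 10 trick rounds the feature/instance ratio up to one decimal
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
    return min(MAX_SELECTION, max(MIN_SELECTION, int(n_instances * sample_fraction)), n_instances)


print(sample_size(50_000, 20_000))  # ratio 0.4 -> 20000 rows per bootstrap sample
print(sample_size(6_269, 30_000))   # ratio rounds up to 4.8 -> capped at the 6269 available rows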