Commit 8744085 by Pietro Lesci: first commit (no parent commits)
Files changed:
- .gitignore (+108)
- .streamlit/config.toml (+4)
- CODEOWNERS (+1)
- Dockerfile (+30)
- LICENSE (+1)
- Makefile (+42)
- app.py (+78)
- assets/logo.png
- assets/page_icon.png
- data/test_de.xlsx
- data/test_en.csv
- data/test_es.xlsx
- data/test_fe.xlsx
- data/test_it.xlsx
- notebooks/wordifier_nb.ipynb (+280)
- pytest.ini (+4)
- requirements.txt (+53)
- src/__init__.py
- src/configs.py (+36)
- src/pages/about.py (+34)
- src/pages/faq.py (+112)
- src/pages/home.py (+170)
- src/session_state.py (+117)
- src/utils.py (+335)
.gitignore ADDED (+108 lines)

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.github

# C extensions
*.so
.vscode
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.idea

# mac
.DS_Store
data/data.csv
.streamlit/config.toml ADDED (+4 lines)

[server]
# Max size, in megabytes, for files uploaded with the file_uploader.
# Default: 200
maxUploadSize = 10
CODEOWNERS ADDED (+1 line)

* @pietrolesci
Dockerfile ADDED (+30 lines)

###############################################################################
# main
###############################################################################

FROM continuumio/miniconda3:4.8.2 AS main

RUN apt-get -y update && \
    apt-get -y install build-essential
RUN conda update -n base -c defaults conda

# chown changes owner from root owner (1000) to the first user inside the env (100)
# COPY --chown=1000:100 requirements.txt /opt/requirements.txt
# RUN conda install --force-reinstall -y -q --name base -c conda-forge --file /opt/requirements.txt
RUN conda install --force-reinstall -y -q --name base pip

COPY . /var/app/
# WORKDIR /var/dev
WORKDIR /var/app
RUN pip install -r requirements.txt
CMD streamlit run ./app.py

###############################################################################
# test
###############################################################################

FROM main AS test
COPY . /var/dev/
WORKDIR /var/dev
# add unit test instruction here: RUN xxxxxx
# add integration test instruction here: RUN xxxxx
LICENSE ADDED (+1 line)

TODO: placeholder
Makefile ADDED (+42 lines)

.PHONY: help build dev integration-test push
.DEFAULT_GOAL := help

# Docker image build info
PROJECT:=wordify
BUILD_TAG?=0.0.1

ALL_IMAGES:=src

help:
	# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
	@echo "python starter project"
	@echo "====================="
	@echo "Replace % with a directory name (e.g., make build/python-example)"
	@echo
	@grep -E '^[a-zA-Z0-9_%/-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

########################################################
## Local development
########################################################

dev: ARGS?=/bin/bash
dev: DARGS?=-v "${CURDIR}":/var/dev
dev: ## run a foreground container
	docker run -it --rm $(DARGS) $(PROJECT) $(ARGS)


notebook: ARGS?=jupyter lab
notebook: DARGS?=-v "${CURDIR}":/var/dev -p 8888:8888 ##notebook shall be run on http://0.0.0.0:8888 by default. Change to a different port (e.g. 8899) if 8888 is used for example 8899:8888
notebook: ## run a foreground container
	docker run -it --rm $(DARGS) $(PROJECT) $(ARGS) \
		--ip=0.0.0.0 \
		--allow-root \
		--NotebookApp.token="" \
		--NotebookApp.password=""

build: DARGS?=
build: ## build the latest image for a project
	docker build $(DARGS) --build-arg BUILD_TAG=${BUILD_TAG} --rm --force-rm -t $(PROJECT):${BUILD_TAG} .

run:
	docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 8501:8501 $(PROJECT):${BUILD_TAG}
app.py ADDED (+78 lines)

import streamlit as st
from src.utils import get_logo
from src import session_state
from src.pages import (
    home,
    faq,
    about,
)
from src.configs import SupportedFiles

# app configs
st.set_page_config(
    page_title="Wordify",
    layout="wide",
    page_icon="./assets/logo.png",
)

# session state
session = session_state.get(process=False, run_id=0, posdf=None, negdf=None)


# ==== SIDEBAR ==== #
# LOGO
client_logo = get_logo("./assets/logo.png")
with st.sidebar.beta_container():
    st.image(client_logo)

# NAVIGATION
PAGES = {
    "Home": home,
    "FAQ": faq,
    "About": about,
}

with st.sidebar.beta_container():
    st.sidebar.header("Navigation")
    selection = st.sidebar.radio("Go to", list(PAGES.keys()))

page = PAGES[selection]

# FILE UPLOADER
with st.sidebar.beta_container():
    st.markdown("")
    st.markdown("")
    st.header("Upload file")
    uploaded_file = st.sidebar.file_uploader("Select file", type=[i.name for i in SupportedFiles])


# FOOTER
with st.sidebar.beta_container():
    st.markdown("")
    st.markdown("")
    st.markdown(
        """
        <span style="font-size: 0.75em">Built with ♥ by [`Pietro Lesci`](https://pietrolesci.github.io/) and [`MilaNLP`](https://twitter.com/MilaNLProc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor)</span>
        """,
        unsafe_allow_html=True,
    )


# ==== MAIN ==== #
with st.beta_container():
    st.title("Wordify")
    st.markdown(
        """
        Wordify makes it easy to identify words that discriminate categories in textual data.

        Let's explain Wordify with an example. Imagine you are thinking about having a glass
        of wine :wine_glass: with your friends :man-man-girl-girl: and you have to buy a bottle.
        You know you like `bold`, `woody` wine but are unsure which one to choose.
        You wonder whether there are some words that describe each type of wine.
        Since you are a researcher :female-scientist: :male-scientist:, you decide to approach
        the problem scientifically :microscope:. That's where Wordify comes to the rescue!
        """
    )


page.write(session, uploaded_file)
assets/logo.png ADDED (binary file)
assets/page_icon.png ADDED (binary file)
data/test_de.xlsx ADDED (binary file, 645 kB)
data/test_en.csv ADDED (diff too large to render)
data/test_es.xlsx ADDED (binary file, 771 kB)
data/test_fe.xlsx ADDED (binary file, 754 kB)
data/test_it.xlsx ADDED (binary file, 662 kB)
notebooks/wordifier_nb.ipynb ADDED (+280 lines; note: the first cell originally used `os` before importing it, fixed below, and a "the the" typo in a comment is corrected)

{
  "metadata": {
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.3"
    },
    "orig_nbformat": 2,
    "kernelspec": {
      "name": "python383jvsc74a57bd01cb9a1c850fd1d16c5b98054247a74b7b7a12849bcfa00436ba202c2a9e2bbb2",
      "display_name": "Python 3.8.3 64-bit ('py38': conda)"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2,
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [],
      "source": [
        "import os, sys\n",
        "nb_dir = os.path.split(os.getcwd())[0]\n",
        "if nb_dir not in sys.path:\n",
        "    sys.path.append(nb_dir)\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "# import modin.pandas as mpd\n",
        "import spacy\n",
        "from src.configs import ModelConfigs, Languages\n",
        "from src.utils import wordifier, TextPreprocessor, encode\n",
        "\n",
        "from textacy.preprocessing import make_pipeline, remove, replace, normalize\n",
        "from tqdm import trange\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.utils import resample\n",
        "import multiprocessing as mp\n",
        "# import dask.dataframe as dask_df\n",
        "from stqdm import stqdm\n",
        "stqdm.pandas()\n",
        "\n",
        "from tqdm import trange\n",
        "\n",
        "import os\n",
        "# os.environ[\"MODIN_ENGINE\"] = \"ray\"  # Modin will use Ray\n",
        "\n",
        "import vaex\n",
        "pd.set_option(\"display.max_colwidth\", None)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [],
      "source": [
        "df = pd.read_excel(\"../data/test_de.xlsx\")\n",
        "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n",
        "language = \"English\"\n",
        "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [],
      "source": [
        "prep = TextPreprocessor(\n",
        "    language=\"English\", \n",
        "    cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),\n",
        "    lemmatizer_when=None,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {},
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "2021-05-10 14:30:04.984 WARNING root: \n",
            "  \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
            "  command:\n",
            "\n",
            "    streamlit run /Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]\n",
            "100%|██████████| 6269/6269 [00:02<00:00, 2793.61it/s]\n"
          ]
        }
      ],
      "source": [
        "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {},
      "outputs": [],
      "source": [
        "X, y, X_names, y_names = encode(df[\"p_text\"], df[\"label\"]).values()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {},
      "outputs": [],
      "source": [
        "clf = LogisticRegression(\n",
        "    penalty=\"l1\",\n",
        "    C=0.05,  # ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
        "    solver=\"saga\",\n",
        "    multi_class=\"auto\",\n",
        "    max_iter=500,\n",
        "    class_weight=\"balanced\",\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {},
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "CPU times: user 1min 23s, sys: 138 ms, total: 1min 23s\n",
            "Wall time: 1min 24s\n",
            "/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:329: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
            "  warnings.warn(\"The max_iter was reached which means \"\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "LogisticRegression(C=0.05, class_weight='balanced', max_iter=500, penalty='l1',\n",
              "                   solver='saga')"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "%%time\n",
        "clf.fit(X, y)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {},
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  6%|▌ | 28/500 [01:01<27:33, 3.50s/it]/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
            "  warnings.warn(\"Liblinear failed to converge, increase \"\n",
            " 31%|███ | 156/500 [06:18<13:54, 2.43s/it]\n"
          ]
        },
        {
          "output_type": "error",
          "ename": "KeyboardInterrupt",
          "evalue": "",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-14-1fef5b7ccf45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1354\u001b[0m \u001b[0;34m\" 'solver' is set to 'liblinear'. Got 'n_jobs'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1355\u001b[0m \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n\u001b[0;32m-> 1356\u001b[0;31m self.coef_, self.intercept_, n_iter_ = _fit_liblinear(\n\u001b[0m\u001b[1;32m 1357\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_intercept\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintercept_scaling\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36m_fit_liblinear\u001b[0;34m(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0msolver_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_liblinear_solver_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmulti_class\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdual\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m raw_coef_, n_iter_ = liblinear.train_wrap(\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_ind\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misspmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msolver_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0mclass_weight_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'i'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
          ]
        }
      ],
      "source": [
        "n_instances, n_features = X.shape\n",
        "n_classes = len(y_names)\n",
        "\n",
        "# NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
        "sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
        "\n",
        "sample_size = min(\n",
        "    # this is the maximum supported\n",
        "    ModelConfigs.MAX_SELECTION.value,\n",
        "    # at minimum you want MIN_SELECTION but in general you want\n",
        "    # n_instances * sample_fraction\n",
        "    max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
        "    # however if the previous one is bigger than the available instances take\n",
        "    # the number of available instances\n",
        "    n_instances,\n",
        ")\n",
        "\n",
        "# TODO: might want to try out something to subsample features at each iteration\n",
        "\n",
        "# initialize coefficient matrices\n",
        "pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
        "neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
        "\n",
        "for _ in trange(ModelConfigs.NUM_ITERS.value):\n",
        "\n",
        "    # run randomized regression\n",
        "    clf = LogisticRegression(\n",
        "        penalty=\"l1\",\n",
        "        C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
        "        solver=\"liblinear\",\n",
        "        multi_class=\"auto\",\n",
        "        max_iter=500,\n",
        "        class_weight=\"balanced\",\n",
        "    )\n",
        "\n",
        "    # sample indices to subsample matrix\n",
        "    selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)\n",
        "\n",
        "    # fit\n",
        "    try:\n",
        "        clf.fit(X[selection], y[selection])\n",
        "    except ValueError:\n",
        "        continue\n",
        "\n",
        "    # record coefficients\n",
        "    if n_classes == 2:\n",
        "        pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
        "        neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
        "        pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
        "        neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
        "    else:\n",
        "        pos_scores += clf.coef_ > 0\n",
        "        neg_scores += clf.coef_ < 0"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# normalize\n",
        "pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value\n",
        "neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value\n",
        "\n",
        "# get only active features\n",
        "pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)\n",
        "neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)\n",
        "\n",
        "# prepare DataFrame\n",
        "pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]\n",
        "neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]\n",
        "\n",
        "posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)\n",
        "negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ]
}
pytest.ini ADDED (+4 lines)

[pytest]
markers =
    cache_tests: mark a test which is about the recurrence computer cache
    seed_tests: mark a test which is about the seed sequence
requirements.txt ADDED (+53 lines)

pytest==5.4.2
pytest-cov==2.8.1
sphinx==3.0.4
black==19.10b0
jupyterlab==2.1.4
pandas==1.2.4
jupytext==1.5.1
nbval==0.9.6
textacy==0.11.0
streamlit==0.81.1
spacy==3.0.6
numpy==1.20.2
scikit-learn==0.24.2
xlrd==2.0.1
openpyxl==3.0.7
stqdm==0.0.3
watchdog==2.1.0
flake8

# english
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm
# italian
https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.0.0/it_core_news_sm-3.0.0.tar.gz#egg=it_core_news_sm
# german
https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm
# spanish
https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.0.0/es_core_news_sm-3.0.0.tar.gz#egg=es_core_news_sm
# greek
https://github.com/explosion/spacy-models/releases/download/el_core_news_sm-3.0.0/el_core_news_sm-3.0.0.tar.gz#egg=el_core_news_sm
# dutch
https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.0.0/nl_core_news_sm-3.0.0.tar.gz#egg=nl_core_news_sm
# portuguese
https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.0.0/pt_core_news_sm-3.0.0.tar.gz#egg=pt_core_news_sm
# french
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.0.0/fr_core_news_sm-3.0.0.tar.gz#egg=fr_core_news_sm
# chinese
# https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0.tar.gz#egg=zh_core_web_sm
# danish
https://github.com/explosion/spacy-models/releases/download/da_core_news_sm-3.0.0/da_core_news_sm-3.0.0.tar.gz#egg=da_core_news_sm
# japanese
# https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.0.0/ja_core_news_sm-3.0.0.tar.gz#egg=ja_core_news_sm
# lithuanian
https://github.com/explosion/spacy-models/releases/download/lt_core_news_sm-3.0.0/lt_core_news_sm-3.0.0.tar.gz#egg=lt_core_news_sm
# norwegian
https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.0.0/nb_core_news_sm-3.0.0.tar.gz#egg=nb_core_news_sm
# polish
https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.0.0/pl_core_news_sm-3.0.0.tar.gz#egg=pl_core_news_sm
# romanian
https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.0.0/ro_core_news_sm-3.0.0.tar.gz#egg=ro_core_news_sm
# russian
https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.0.0/ru_core_news_sm-3.0.0.tar.gz#egg=ru_core_news_sm
# multi-language
https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.0.0/xx_ent_wiki_sm-3.0.0.tar.gz#egg=xx_ent_wiki_sm
src/__init__.py ADDED (empty file)
src/configs.py ADDED (+36 lines; note: the Chinese model name is corrected to `zh_core_web_sm`, matching the model URL in requirements.txt)

from enum import Enum

import pandas as pd


class ModelConfigs(Enum):
    NUM_ITERS = 500
    SELECTION_THRESHOLD = 0.0
    PENALTIES = [10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001]
    MAX_SELECTION = 100_000
    MIN_SELECTION = 10_000


class Languages(Enum):
    English = "en_core_web_sm"
    Italian = "it_core_news_sm"
    German = "de_core_news_sm"
    Spanish = "es_core_news_sm"
    Greek = "el_core_news_sm"
    Dutch = "nl_core_news_sm"
    Portuguese = "pt_core_news_sm"
    French = "fr_core_news_sm"
    Chinese = "zh_core_web_sm"
    Danish = "da_core_news_sm"
    Japanese = "ja_core_news_sm"
    Lithuanian = "lt_core_news_sm"
    Norvegian = "nb_core_news_sm"
    Polish = "pl_core_news_sm"
    Romanian = "ro_core_news_sm"
    Russian = "ru_core_news_sm"
    MultiLanguage = "xx_ent_wiki_sm"


class SupportedFiles(Enum):
    xlsx = (pd.read_excel,)
    csv = (pd.read_csv,)
    parquet = (pd.read_parquet,)
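A note on `SupportedFiles`: each member name doubles as a file extension and the value tuple holds the matching pandas reader, so loading a file reduces to an enum lookup. A minimal sketch of that dispatch outside Streamlit (the `load` helper and the sample path here are illustrative, not part of the repo; `read_file` in src/utils.py below does the same on the uploaded file object):

import pandas as pd

from src.configs import SupportedFiles

def load(path: str) -> pd.DataFrame:
    # the extension indexes the enum; .value[0] is the matching pandas reader
    ext = path.split(".")[-1]
    reader = SupportedFiles[ext].value[0]
    return reader(path)

df = load("data/test_en.csv")  # dispatches to pd.read_csv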
src/pages/about.py ADDED (+34 lines)

import streamlit as st


def write(*args):
    # ==== Contacts ==== #
    with st.beta_container():
        st.markdown("")
        st.markdown("")
        st.header(":rocket:About us")

        st.markdown(
            """
            You can reach out to us via email, phone, or - if you are old-fashioned - via mail
            """
        )
        with st.beta_expander("Contacts"):

            _, col2 = st.beta_columns([0.5, 3])
            col2.markdown(
                """
                :email: [email protected]

                :telephone_receiver: +39 02 5836 2604

                :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
                """
            )

            st.write(
                """
                <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
                """,
                unsafe_allow_html=True,
            )
src/pages/faq.py ADDED (+112 lines)

import streamlit as st
from src.configs import Languages


def write(*args):

    # ==== HOW IT WORKS ==== #
    with st.beta_container():
        st.markdown("")
        st.markdown("")
        st.header("How it works")
        st.subheader("Step 1 - Prepare your data")
        st.markdown(
            """
            Create an Excel or CSV file with two columns for each row:

            - a column with the name or the label identifying a specific object or class (e.g., in our
            wine example above it would be the type of wine or the name of a specific brand). It is
            common practice naming this column `label`

            - a column with the text describing that specific object or class (e.g., in the wine example
            above it could be the description that you find on the rear of the bottle label). It is
            common practice naming this column `text`

            To have reliable results, we suggest providing at least 2000 labelled texts. If you provide
            fewer we will still wordify your file, but the results should then be taken with a grain of
            salt.

            Consider that we also support multi-language texts, therefore you'll be able to
            automatically discriminate between international wines, even if your preferred Italian
            producer does not provide you with a description written in English!
            """
        )

        st.subheader("Step 2 - Upload your file and Wordify!")
        st.markdown(
            """
            Once you have prepared your Excel or CSV file, click the "Browse File" button.
            Browse for your file.
            Choose the language of your texts (select multi-language if your file contains text in
            different languages).
            Push the "Wordify!" button, sit back, and wait for Wordify to do its tricks.

            Depending on the size of your data, the process can take from 1 minute to 5 minutes
            """
        )

    # ==== FAQ ==== #
    with st.beta_container():
        st.markdown("")
        st.markdown("")
        st.header(":question:Frequently Asked Questions")
        with st.beta_expander("What is Wordify?"):
            st.markdown(
                """
                Wordify is a way to find out which terms are most indicative for each of your dependent
                variable values.
                """
            )

        with st.beta_expander("What happens to my data?"):
            st.markdown(
                """
                Nothing. We never store the data you upload on disk: it is only kept in memory for the
                duration of the modeling, and then deleted. We do not retain any copies or traces of
                your data.
                """
            )

        with st.beta_expander("What input formats do you support?"):
            st.markdown(
                """
                The file you upload should be .xlsx, with two columns: the first should be labeled
                'text' and contain all your documents (e.g., tweets, reviews, patents, etc.), one per
                line. The second column should be labeled 'label', and contain the dependent variable
                label associated with each text (e.g., rating, author gender, company, etc.).
                """
            )

        with st.beta_expander("How does it work?"):
            st.markdown(
                """
                It uses a variant of the Stability Selection algorithm
                [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
                to fit hundreds of logistic regression models on random subsets of the data, using
                different L1 penalties to drive as many of the term coefficients to 0. Any terms that
                receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
                indicators.
                """
            )

        with st.beta_expander("How much data do I need?"):
            st.markdown(
                """
                We recommend at least 2000 instances, the more, the better. With fewer instances, the
                results are less replicable and reliable.
                """
            )

        with st.beta_expander("Is there a paper I can cite?"):
            st.markdown(
                """
                Yes please! Reference coming soon...
                """
            )

        with st.beta_expander("What languages are supported?"):
            st.markdown(
                f"""
                Currently we support: {", ".join([i.name for i in Languages])}.
                """
            )
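The "How does it work?" answer above is implemented by `wordifier` in src/utils.py further down this commit. Stripped of the Streamlit plumbing and specialized to a binary label, the core of the stability-selection loop looks roughly like this sketch (the iteration count and penalty grid are illustrative stand-ins for `ModelConfigs`):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

def stability_scores(X, y, n_iters=500, penalties=(1, 0.5, 0.1, 0.05, 0.01)):
    """Fraction of bootstrap runs in which each term gets a positive coefficient."""
    n_instances, n_features = X.shape
    pos_counts = np.zeros(n_features)
    for _ in range(n_iters):
        # L1-penalized logistic regression with a randomly drawn penalty strength
        clf = LogisticRegression(
            penalty="l1",
            C=float(np.random.choice(penalties)),
            solver="liblinear",
            class_weight="balanced",
            max_iter=500,
        )
        # stratified bootstrap subsample of the rows
        idx = resample(np.arange(n_instances), replace=True, stratify=y)
        clf.fit(X[idx], y[idx])
        pos_counts += (clf.coef_[0] > 0)
    return pos_counts / n_iters

# terms scoring >= 0.3 are the "stable indicators" the FAQ mentions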
src/pages/home.py ADDED (+170 lines)

from src.configs import Languages
from src.utils import (
    encode,
    wordifier,
    download_button,
    TextPreprocessor,
    plot_labels_prop,
    plot_nchars,
    plot_score,
    get_logo,
    read_file,
)
import streamlit as st


def write(session, uploaded_file):

    if uploaded_file:

        # 1. READ FILE
        with st.spinner("Reading file"):
            # TODO: write parser function that automatically understands format
            data = read_file(uploaded_file)

        # 2. CREATE UI TO SELECT COLUMNS
        st.markdown("")
        st.markdown("")
        st.header("Process")

        col1, col2, col3 = st.beta_columns(3)
        with col1:
            language = st.selectbox("Select language", [i.name for i in Languages])
            with st.beta_expander("Description"):
                st.markdown(
                    f"Select a language of text amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}"
                )
        with col2:
            cols_options = [""] + data.columns.tolist()
            label_column = st.selectbox("Select label column name", cols_options, index=0)
            with st.beta_expander("Description"):
                st.markdown("Select the column containing the label")

            if label_column:
                st.altair_chart(plot_labels_prop(data, label_column), use_container_width=True)

        with col3:
            text_column = st.selectbox("Select text column name", cols_options, index=0)
            with st.beta_expander("Description"):
                st.markdown("Select the column containing the text")

            if text_column:
                st.altair_chart(plot_nchars(data, text_column), use_container_width=True)

        with st.beta_expander("Advanced options"):
            # Lemmatization option
            col1, col2 = st.beta_columns([1, 3])
            with col1:
                lemmatization_when_elem = st.empty()
            with col2:
                st.markdown("Choose lemmatization option")

            # stopwords option
            col1, col2 = st.beta_columns([1, 3])
            with col1:
                remove_stopwords_elem = st.empty()
            with col2:
                st.markdown("Choose stopword option")

            # cleaning steps
            col1, col2 = st.beta_columns([1, 3])
            with col1:
                cleaning_steps_elem = st.empty()
                reset_button = st.empty()
            with col2:
                st.markdown("Choose cleaning steps")

            # implement reset logic
            if reset_button.button("Reset steps"):
                session.run_id += 1

            steps_options = list(TextPreprocessor._cleaning_options().keys())
            cleaning_steps = cleaning_steps_elem.multiselect(
                "Select text processing steps (ordered)",
                options=steps_options,
                default=steps_options,
                format_func=lambda x: x.replace("_", " ").title(),
                key=session.run_id,
            )
            lemmatization_options = list(TextPreprocessor._lemmatization_options().keys())
            lemmatization_when = lemmatization_when_elem.selectbox(
                "Select when lemmatization happens",
                options=lemmatization_options,
                index=0,
                key=session.run_id,
            )
            remove_stopwords = remove_stopwords_elem.checkbox("Remove stopwords", value=True, key=session.run_id)

        # Show sample checkbox
        col1, col2 = st.beta_columns([1, 2])
        with col1:
            show_sample = st.checkbox("Show sample of preprocessed text")

        # initialize text preprocessor
        preprocessor = TextPreprocessor(
            language=language,
            cleaning_steps=cleaning_steps,
            lemmatizer_when=lemmatization_when,
            remove_stop=remove_stopwords,
        )

        # 3. PROVIDE FEEDBACK ON OPTIONS
        if show_sample and not (label_column and text_column):
            st.warning("Please select `label` and `text` columns")

        elif show_sample and (label_column and text_column):
            sample_data = data.sample(10)
            sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(sample_data[text_column]).values
            st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])

        # 4. RUN
        run_button = st.button("Wordify!")
        if run_button and not (label_column and text_column):
            st.warning("Please select `label` and `text` columns")

        elif run_button and (label_column and text_column) and not session.process:
            # data = data.head()
            data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(data[text_column]).values

            inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
            session.posdf, session.negdf = wordifier(**inputs)
            st.success("Wordified!")

            # session.posdf, session.negdf = process(data, text_column, label_column)
            session.process = True

        # 5. RESULTS
        if session.process and (label_column and text_column):
            st.markdown("")
            st.markdown("")
            st.header("Results")

            # col1, col2, _ = st.beta_columns(3)
            col1, col2, col3 = st.beta_columns([2, 3, 3])

            with col1:
                label = st.selectbox("Select label", data[label_column].unique().tolist())
                # # with col2:
                # thres = st.slider(
                #     "Select threshold",
                #     min_value=0,
                #     max_value=100,
                #     step=1,
                #     format="%f",
                #     value=30,
                # )
                show_plots = st.checkbox("Show plots of top 100")

            with col2:
                st.subheader(f"Words __positively__ identifying label `{label}`")
                st.write(session.posdf[session.posdf[label_column] == label].sort_values("score", ascending=False))
                download_button(session.posdf, "positive_data")
                if show_plots:
                    st.altair_chart(plot_score(session.posdf, label_column, label), use_container_width=True)

            with col3:
                st.subheader(f"Words __negatively__ identifying label `{label}`")
                st.write(session.negdf[session.negdf[label_column] == label].sort_values("score", ascending=False))
                download_button(session.negdf, "negative_data")
                if show_plots:
                    st.altair_chart(plot_score(session.negdf, label_column, label), use_container_width=True)
src/session_state.py ADDED (+117 lines; note: the class docstring's doctest originally showed an assignment echoing a value, fixed below)

"""Hack to add per-session state to Streamlit.

Usage
-----

>>> import SessionState
>>>
>>> session_state = SessionState.get(user_name='', favorite_color='black')
>>> session_state.user_name
''
>>> session_state.user_name = 'Mary'
>>> session_state.favorite_color
'black'

Since you set user_name above, next time your script runs this will be the
result:
>>> session_state = get(user_name='', favorite_color='black')
>>> session_state.user_name
'Mary'

"""
try:
    import streamlit.ReportThread as ReportThread
    from streamlit.server.Server import Server
except Exception:
    # Streamlit >= 0.65.0
    import streamlit.report_thread as ReportThread
    from streamlit.server.server import Server


class SessionState(object):
    def __init__(self, **kwargs):
        """A new SessionState object.

        Parameters
        ----------
        **kwargs : any
            Default values for the session state.

        Example
        -------
        >>> session_state = SessionState(user_name='', favorite_color='black')
        >>> session_state.user_name
        ''
        >>> session_state.user_name = 'Mary'
        >>> session_state.favorite_color
        'black'

        """
        for key, val in kwargs.items():
            setattr(self, key, val)


def get(**kwargs):
    """Gets a SessionState object for the current session.

    Creates a new object if necessary.

    Parameters
    ----------
    **kwargs : any
        Default values you want to add to the session state, if we're creating a
        new one.

    Example
    -------
    >>> session_state = get(user_name='', favorite_color='black')
    >>> session_state.user_name
    ''
    >>> session_state.user_name = 'Mary'
    >>> session_state.favorite_color
    'black'

    Since you set user_name above, next time your script runs this will be the
    result:
    >>> session_state = get(user_name='', favorite_color='black')
    >>> session_state.user_name
    'Mary'

    """
    # Hack to get the session object from Streamlit.

    ctx = ReportThread.get_report_ctx()

    this_session = None

    current_server = Server.get_current()
    if hasattr(current_server, "_session_infos"):
        # Streamlit < 0.56
        session_infos = Server.get_current()._session_infos.values()
    else:
        session_infos = Server.get_current()._session_info_by_id.values()

    for session_info in session_infos:
        s = session_info.session
        if (
            # Streamlit < 0.54.0
            (hasattr(s, "_main_dg") and s._main_dg == ctx.main_dg)
            or
            # Streamlit >= 0.54.0
            (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
            or
            # Streamlit >= 0.65.2
            (not hasattr(s, "_main_dg") and s._uploaded_file_mgr == ctx.uploaded_file_mgr)
        ):
            this_session = s

    if this_session is None:
        raise RuntimeError(
            "Oh noes. Couldn't get your Streamlit Session object. Are you doing something fancy with threads?"
        )

    # Got the session object! Now let's attach some state into it.

    if not hasattr(this_session, "_custom_session_state"):
        this_session._custom_session_state = SessionState(**kwargs)

    return this_session._custom_session_state
src/utils.py ADDED (+335 lines; note: `plot_labels_prop` originally counted the hard-coded "label" column rather than `label_column`, fixed below, and a "the the" comment typo is corrected)

import base64
import re
from collections import OrderedDict
from typing import Callable, Dict, List

import altair as alt
import numpy as np
import pandas as pd
import spacy
import streamlit as st
from pandas.core.series import Series
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from stqdm import stqdm
from textacy.preprocessing import make_pipeline, normalize, remove, replace

from .configs import Languages, ModelConfigs, SupportedFiles

stqdm.pandas()


@st.cache
def get_logo(path):
    return Image.open(path)


# @st.cache(suppress_st_warning=True)
def read_file(uploaded_file) -> pd.DataFrame:

    file_type = uploaded_file.name.split(".")[-1]
    if file_type in set(i.name for i in SupportedFiles):
        read_f = SupportedFiles[file_type].value[0]
        return read_f(uploaded_file, dtype=str)

    else:
        st.error("File type not supported")


def download_button(dataframe: pd.DataFrame, name: str):
    csv = dataframe.to_csv(index=False)
    # some strings <-> bytes conversions necessary here
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
    st.write(href, unsafe_allow_html=True)


def encode(text: pd.Series, labels: pd.Series):
    tfidf_vectorizer = TfidfVectorizer(
        input="content",  # default: file already in memory
        encoding="utf-8",  # default
        decode_error="strict",  # default
        strip_accents=None,  # do nothing
        lowercase=False,  # do nothing
        preprocessor=None,  # do nothing - default
        tokenizer=None,  # default
        stop_words=None,  # do nothing
        analyzer="word",
        ngram_range=(1, 3),  # maximum 3-ngrams
        min_df=0.001,
        max_df=0.75,
        sublinear_tf=True,
    )
    label_encoder = LabelEncoder()

    with st.spinner("Encoding text using TF-IDF and Encoding labels"):
        X = tfidf_vectorizer.fit_transform(text.values)
        y = label_encoder.fit_transform(labels.values)

    return {
        "X": X,
        "y": y,
        "X_names": np.array(tfidf_vectorizer.get_feature_names()),
        "y_names": label_encoder.classes_,
    }


def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):

    n_instances, n_features = X.shape
    n_classes = len(y_names)

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however if the previous one is bigger than the available instances take
        # the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # initialize coefficient matrices
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    with st.spinner("Wordifying!"):

        for _ in stqdm(range(configs.NUM_ITERS.value)):

            # run randomized regression
            clf = LogisticRegression(
                penalty="l1",
                C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
                solver="liblinear",
                multi_class="auto",
                max_iter=500,
                class_weight="balanced",
            )

            # sample indices to subsample matrix
            selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)

            # fit
            try:
                clf.fit(X[selection], y[selection])
            except ValueError:
                continue

            # record coefficients
            if n_classes == 2:
                pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
                neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
                pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
                neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
            else:
                pos_scores += clf.coef_ > 0
                neg_scores += clf.coef_ < 0

    # normalize
    pos_scores = pos_scores / configs.NUM_ITERS.value
    neg_scores = neg_scores / configs.NUM_ITERS.value

    # get only active features
    pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
    neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)

    # prepare DataFrame
    pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
    neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]

    posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
    negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)

    return posdf, negdf


# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
_re_space = re.compile(" {2,}")


def normalize_useless_spaces(t):
    return _re_space.sub(" ", t)


_re_rep = re.compile(r"(\S)(\1{2,})")


def normalize_repeating_chars(t):
    def _replace_rep(m):
        c, cc = m.groups()
        return c

    return _re_rep.sub(_replace_rep, t)


_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")


def normalize_repeating_words(t):
    def _replace_wrep(m):
        c, cc, e = m.groups()
        return c

    return _re_wrep.sub(_replace_wrep, t)


class TextPreprocessor:
    def __init__(
        self, language: str, cleaning_steps: List[str], lemmatizer_when: str = "last", remove_stop: bool = True
    ) -> None:
        # prepare lemmatizer
        self.language = language
        self.nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])
        self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
        self.remove_stop = remove_stop
        self._lemmatize = self._get_lemmatizer()

        # prepare cleaning
        self.cleaning_steps = [
            self._cleaning_options()[step] for step in cleaning_steps if step in self._cleaning_options()
        ]
        self.cleaning_pipeline = make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x

    def _get_lemmatizer(self) -> Callable:
        """Return the correct spacy Doc-level lemmatizer"""
        if self.remove_stop:

            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
                """Lemmatizes spacy Doc and removes stopwords"""
                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])

        else:

            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
                """Lemmatizes spacy Doc"""
                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])

        return lemmatizer

    @staticmethod
    def _lemmatization_options() -> Dict[str, str]:
        return {
            "Before preprocessing": "first",
            "After preprocessing": "last",
            "Never! Let's do it quick and dirty": None,
        }

    def lemmatizer(self, series: pd.Series) -> pd.Series:
        """
        Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
        """
        res = []
        pbar = stqdm(total=len(series))
        for doc in self.nlp.pipe(series, batch_size=500):
            res.append(self._lemmatize(doc))
            pbar.update(1)
        pbar.close()
        return pd.Series(res)

    @staticmethod
    def _cleaning_options():
        """Returns available cleaning steps in order"""
        return OrderedDict(
            [
                ("lower", lambda x: x.lower()),
                ("normalize_unicode", normalize.unicode),
                ("normalize_bullet_points", normalize.bullet_points),
                ("normalize_hyphenated_words", normalize.hyphenated_words),
                ("normalize_quotation_marks", normalize.quotation_marks),
                ("normalize_whitespace", normalize.whitespace),
                ("remove_accents", remove.accents),
                ("remove_brackets", remove.brackets),
                ("remove_html_tags", remove.html_tags),
                ("remove_punctuation", remove.punctuation),
                ("replace_currency_symbols", replace.currency_symbols),
                ("replace_emails", replace.emails),
                ("replace_emojis", replace.emojis),
                ("replace_hashtags", replace.hashtags),
                ("replace_numbers", replace.numbers),
                ("replace_phone_numbers", replace.phone_numbers),
                ("replace_urls", replace.urls),
                ("replace_user_handles", replace.user_handles),
                ("normalize_useless_spaces", normalize_useless_spaces),
                ("normalize_repeating_chars", normalize_repeating_chars),
                ("normalize_repeating_words", normalize_repeating_words),
                ("strip", lambda x: x.strip()),
            ]
        )

    def fit_transform(self, series: pd.Series) -> Series:
        """Applies text preprocessing"""

        if self.lemmatizer_when == "first":
            with st.spinner("Lemmatizing"):
                series = self.lemmatizer(series)

        with st.spinner("Cleaning"):
            series = series.progress_map(self.cleaning_pipeline)

        if self.lemmatizer_when == "last":
            with st.spinner("Lemmatizing"):
                series = self.lemmatizer(series)

        return series


def plot_labels_prop(data: pd.DataFrame, label_column: str):

    source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})

    source["Proportions"] = ((source["Counts"] / source["Counts"].sum()).round(3) * 100).map("{:,.2f}".format) + "%"

    bars = (
        alt.Chart(source)
        .mark_bar()
        .encode(
            x="Labels:O",
            y="Counts:Q",
        )
    )

    text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")

    return (bars + text).properties(height=300)


def plot_nchars(data: pd.DataFrame, text_column: str):
    source = data[text_column].str.len().to_frame()

    plot = (
        alt.Chart(source)
        .mark_bar()
        .encode(
            alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
            alt.Y("count()", axis=alt.Axis(title="")),
        )
    )

    return plot.properties(height=300)


def plot_score(data: pd.DataFrame, label_col: str, label: str):

    source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)

    plot = (
        alt.Chart(source)
        .mark_bar()
        .encode(
            y=alt.Y("word:O", sort="-x"),
            x="score:Q",
        )
    )

    return plot.properties(height=max(30 * source.shape[0], 50))
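Taken together, the pieces above form the pipeline that the notebook exercises outside the UI. A minimal script mirroring that flow (column names follow the `text`/`label` convention from the FAQ; Streamlit prints a harmless warning when these helpers run outside `streamlit run`, as the notebook output shows):

import pandas as pd

from src.utils import TextPreprocessor, encode, wordifier

# expects the two-column format described in src/pages/faq.py
df = pd.read_csv("data/test_en.csv", dtype=str)

prep = TextPreprocessor(
    language="English",
    cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),
)
df["p_text"] = prep.fit_transform(df["text"]).values

inputs = encode(df["p_text"], df["label"])  # {"X", "y", "X_names", "y_names"}
posdf, negdf = wordifier(**inputs)

print(posdf.head())  # top positively indicative terms per label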