Pietro Lesci committed on
Commit 8744085 · 0 Parent(s)

first commit
.gitignore ADDED
@@ -0,0 +1,108 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ .github
+
+ # C extensions
+ *.so
+ .vscode
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # dotenv
+ .env
+
+ # virtualenv
+ .venv
+ venv/
+ ENV/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .idea
+
+ # mac
+ .DS_Store
+ data/data.csv
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
+ [server]
+ # Max size, in megabytes, for files uploaded with the file_uploader.
+ # Default: 200
+ maxUploadSize = 10
CODEOWNERS ADDED
@@ -0,0 +1 @@
+ * @pietrolesci
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ ###############################################################################
+ # main
+ ###############################################################################
+
+ FROM continuumio/miniconda3:4.8.2 AS main
+
+ RUN apt-get -y update && \
+ apt-get -y install build-essential
+ RUN conda update -n base -c defaults conda
+
+ # chown changes owner from root owner (1000) to the first user inside the env (100)
+ # COPY --chown=1000:100 requirements.txt /opt/requirements.txt
+ # RUN conda install --force-reinstall -y -q --name base -c conda-forge --file /opt/requirements.txt
+ RUN conda install --force-reinstall -y -q --name base pip
+
+ COPY . /var/app/
+ # WORKDIR /var/dev
+ WORKDIR /var/app
+ RUN pip install -r requirements.txt
+ CMD streamlit run ./app.py
+
+ ###############################################################################
+ # test
+ ###############################################################################
+
+ FROM main AS test
+ COPY . /var/dev/
+ WORKDIR /var/dev
+ # add unit test instruction here: RUN xxxxxx
+ # add integration test instruction here: RUN xxxxx
LICENSE ADDED
@@ -0,0 +1 @@
+ TODO: placeholder
Makefile ADDED
@@ -0,0 +1,42 @@
+ .PHONY: help build dev integration-test push
+ .DEFAULT_GOAL := help
+
+ # Docker image build info
+ PROJECT:=wordify
+ BUILD_TAG?=0.0.1
+
+ ALL_IMAGES:=src
+
+ help:
+ # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+ @echo "python starter project"
+ @echo "====================="
+ @echo "Replace % with a directory name (e.g., make build/python-example)"
+ @echo
+ @grep -E '^[a-zA-Z0-9_%/-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+ ########################################################
+ ## Local development
+ ########################################################
+
+ dev: ARGS?=/bin/bash
+ dev: DARGS?=-v "${CURDIR}":/var/dev
+ dev: ## run a foreground container
+ docker run -it --rm $(DARGS) $(PROJECT) $(ARGS)
+
+
+ notebook: ARGS?=jupyter lab
+ notebook: DARGS?=-v "${CURDIR}":/var/dev -p 8888:8888 ## the notebook runs on http://0.0.0.0:8888 by default; map a different host port (e.g. 8899:8888) if 8888 is already in use
+ notebook: ## run a foreground container
+ docker run -it --rm $(DARGS) $(PROJECT) $(ARGS) \
+ --ip=0.0.0.0 \
+ --allow-root \
+ --NotebookApp.token="" \
+ --NotebookApp.password=""
+
+ build: DARGS?=
+ build: ## build the latest image for a project
+ docker build $(DARGS) --build-arg BUILD_TAG=${BUILD_TAG} --rm --force-rm -t $(PROJECT):${BUILD_TAG} .
+
+ run:
+ docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 8501:8501 $(PROJECT):${BUILD_TAG}
app.py ADDED
@@ -0,0 +1,78 @@
+ import streamlit as st
+ from src.utils import get_logo
+ from src import session_state
+ from src.pages import (
+     home,
+     faq,
+     about,
+ )
+ from src.configs import SupportedFiles
+
+ # app configs
+ st.set_page_config(
+     page_title="Wordify",
+     layout="wide",
+     page_icon="./assets/logo.png",
+ )
+
+ # session state
+ session = session_state.get(process=False, run_id=0, posdf=None, negdf=None)
+
+
+ # ==== SIDEBAR ==== #
+ # LOGO
+ client_logo = get_logo("./assets/logo.png")
+ with st.sidebar.beta_container():
+     st.image(client_logo)
+
+ # NAVIGATION
+ PAGES = {
+     "Home": home,
+     "FAQ": faq,
+     "About": about,
+ }
+
+ with st.sidebar.beta_container():
+     st.sidebar.header("Navigation")
+     selection = st.sidebar.radio("Go to", list(PAGES.keys()))
+
+ page = PAGES[selection]
+
+ # FILE UPLOADER
+ with st.sidebar.beta_container():
+     st.markdown("")
+     st.markdown("")
+     st.header("Upload file")
+     uploaded_file = st.sidebar.file_uploader("Select file", type=[i.name for i in SupportedFiles])
+
+
+ # FOOTER
+ with st.sidebar.beta_container():
+     st.markdown("")
+     st.markdown("")
+     st.markdown(
+         """
+         <span style="font-size: 0.75em">Built with &hearts; by [`Pietro Lesci`](https://pietrolesci.github.io/) and [`MilaNLP`](https://twitter.com/MilaNLProc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor)</span>
+         """,
+         unsafe_allow_html=True,
+     )
+
+
+ # ==== MAIN ==== #
+ with st.beta_container():
+     st.title("Wordify")
+     st.markdown(
+         """
+         Wordify makes it easy to identify words that discriminate categories in textual data.
+
+         Let's explain Wordify with an example. Imagine you are thinking about having a glass
+         of wine :wine_glass: with your friends :man-man-girl-girl: and you have to buy a bottle.
+         You know you like `bold`, `woody` wine but are unsure which one to choose.
+         You wonder whether there are some words that describe each type of wine.
+         Since you are a researcher :female-scientist: :male-scientist:, you decide to approach
+         the problem scientifically :microscope:. That's where Wordify comes to the rescue!
+         """
+     )
+
+
+ page.write(session, uploaded_file)
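Note on the navigation pattern above: each entry in the PAGES dict maps a sidebar label to a module that exposes a write function, which app.py calls with the shared session state and the uploaded file (about.py and faq.py simply accept *args). A minimal sketch of what an additional page module could look like under this convention; the module name and content are hypothetical and not part of this commit:

import streamlit as st


def write(session, uploaded_file):
    # every page receives the shared session state and the uploaded file,
    # even when it does not need them
    st.header("Contact")
    st.markdown("Drop us a line!")

Registering such a page would only require importing it in app.py and adding an entry like "Contact": contact to PAGES.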
assets/logo.png ADDED
assets/page_icon.png ADDED
data/test_de.xlsx ADDED
Binary file (645 kB).
data/test_en.csv ADDED
The diff for this file is too large to render.
data/test_es.xlsx ADDED
Binary file (771 kB).
data/test_fe.xlsx ADDED
Binary file (754 kB).
data/test_it.xlsx ADDED
Binary file (662 kB).
notebooks/wordifier_nb.ipynb ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "metadata": {
3
+ "language_info": {
4
+ "codemirror_mode": {
5
+ "name": "ipython",
6
+ "version": 3
7
+ },
8
+ "file_extension": ".py",
9
+ "mimetype": "text/x-python",
10
+ "name": "python",
11
+ "nbconvert_exporter": "python",
12
+ "pygments_lexer": "ipython3",
13
+ "version": "3.8.3"
14
+ },
15
+ "orig_nbformat": 2,
16
+ "kernelspec": {
17
+ "name": "python383jvsc74a57bd01cb9a1c850fd1d16c5b98054247a74b7b7a12849bcfa00436ba202c2a9e2bbb2",
18
+ "display_name": "Python 3.8.3 64-bit ('py38': conda)"
19
+ }
20
+ },
21
+ "nbformat": 4,
22
+ "nbformat_minor": 2,
23
+ "cells": [
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 1,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import os\n",
+ "import sys\n",
31
+ "nb_dir = os.path.split(os.getcwd())[0]\n",
32
+ "if nb_dir not in sys.path:\n",
33
+ " sys.path.append(nb_dir)\n",
34
+ "\n",
35
+ "import numpy as np\n",
36
+ "import pandas as pd\n",
37
+ "# import modin.pandas as mpd\n",
38
+ "import spacy\n",
39
+ "from src.configs import ModelConfigs, Languages\n",
40
+ "from src.utils import wordifier, TextPreprocessor, encode\n",
41
+ "\n",
42
+ "from textacy.preprocessing import make_pipeline, remove, replace, normalize\n",
43
+ "from tqdm import trange\n",
44
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
45
+ "from sklearn.linear_model import LogisticRegression\n",
46
+ "from sklearn.preprocessing import LabelEncoder\n",
47
+ "from sklearn.utils import resample\n",
48
+ "import multiprocessing as mp\n",
49
+ "# import dask.dataframe as dask_df\n",
50
+ "from stqdm import stqdm\n",
51
+ "stqdm.pandas()\n",
52
+ "\n",
53
+ "from tqdm import trange\n",
54
+ "\n",
55
+ "import os\n",
56
+ "# os.environ[\"MODIN_ENGINE\"] = \"ray\" # Modin will use Ray\n",
57
+ "\n",
58
+ "import vaex\n",
59
+ "pd.set_option(\"display.max_colwidth\", None)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 2,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "df = pd.read_excel(\"../data/test_de.xlsx\")\n",
69
+ "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n",
70
+ "language = \"English\"\n",
71
+ "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 3,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "prep = TextPreprocessor(\n",
81
+ " language=\"English\", \n",
82
+ " cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),\n",
83
+ " lemmatizer_when=None,\n",
84
+ ")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 4,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "output_type": "stream",
94
+ "name": "stderr",
95
+ "text": [
96
+ "2021-05-10 14:30:04.984 WARNING root: \n",
97
+ " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
98
+ " command:\n",
99
+ "\n",
100
+ " streamlit run /Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]\n",
101
+ "100%|██████████| 6269/6269 [00:02<00:00, 2793.61it/s]\n"
102
+ ]
103
+ }
104
+ ],
105
+ "source": [
106
+ "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 6,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "X, y, X_names, y_names = encode(df[\"p_text\"], df[\"label\"]).values()"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 11,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "clf = LogisticRegression(\n",
125
+ " penalty=\"l1\",\n",
126
+ " C=0.05,#ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
127
+ " solver=\"saga\",\n",
128
+ " multi_class=\"auto\",\n",
129
+ " max_iter=500,\n",
130
+ " class_weight=\"balanced\",\n",
131
+ ")"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 12,
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "output_type": "stream",
141
+ "name": "stdout",
142
+ "text": [
143
+ "CPU times: user 1min 23s, sys: 138 ms, total: 1min 23s\n",
144
+ "Wall time: 1min 24s\n",
145
+ "/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:329: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
146
+ " warnings.warn(\"The max_iter was reached which means \"\n"
147
+ ]
148
+ },
149
+ {
150
+ "output_type": "execute_result",
151
+ "data": {
152
+ "text/plain": [
153
+ "LogisticRegression(C=0.05, class_weight='balanced', max_iter=500, penalty='l1',\n",
154
+ " solver='saga')"
155
+ ]
156
+ },
157
+ "metadata": {},
158
+ "execution_count": 12
159
+ }
160
+ ],
161
+ "source": [
162
+ "%%time\n",
163
+ "clf.fit(X, y)"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 14,
169
+ "metadata": {},
170
+ "outputs": [
171
+ {
172
+ "output_type": "stream",
173
+ "name": "stderr",
174
+ "text": [
175
+ " 6%|▌ | 28/500 [01:01<27:33, 3.50s/it]/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
176
+ " warnings.warn(\"Liblinear failed to converge, increase \"\n",
177
+ " 31%|███ | 156/500 [06:18<13:54, 2.43s/it]\n"
178
+ ]
179
+ },
180
+ {
181
+ "output_type": "error",
182
+ "ename": "KeyboardInterrupt",
183
+ "evalue": "",
184
+ "traceback": [
185
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
186
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
187
+ "\u001b[0;32m<ipython-input-14-1fef5b7ccf45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
188
+ "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1354\u001b[0m \u001b[0;34m\" 'solver' is set to 'liblinear'. Got 'n_jobs'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1355\u001b[0m \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n\u001b[0;32m-> 1356\u001b[0;31m self.coef_, self.intercept_, n_iter_ = _fit_liblinear(\n\u001b[0m\u001b[1;32m 1357\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_intercept\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintercept_scaling\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
189
+ "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36m_fit_liblinear\u001b[0;34m(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0msolver_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_liblinear_solver_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmulti_class\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdual\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m raw_coef_, n_iter_ = liblinear.train_wrap(\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_ind\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misspmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msolver_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0mclass_weight_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'i'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
190
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
191
+ ]
192
+ }
193
+ ],
194
+ "source": [
195
+ "n_instances, n_features = X.shape\n",
196
+ "n_classes = len(y_names)\n",
197
+ "\n",
198
+ "# NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
199
+ "sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
200
+ "\n",
201
+ "sample_size = min(\n",
202
+ " # this is the maximum supported\n",
203
+ " ModelConfigs.MAX_SELECTION.value,\n",
204
+ " # at minimum you want MIN_SELECTION but in general you want\n",
205
+ " # n_instances * sample_fraction\n",
206
+ " max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
207
+ "    # however if the previous one is bigger than the available instances, take\n",
208
+ " # the number of available instances\n",
209
+ " n_instances,\n",
210
+ ")\n",
211
+ "\n",
212
+ "# TODO: might want to try out something to subsample features at each iteration\n",
213
+ "\n",
214
+ "# initialize coefficient matrices\n",
215
+ "pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
216
+ "neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
217
+ "\n",
218
+ "for _ in trange(ModelConfigs.NUM_ITERS.value):\n",
219
+ "\n",
220
+ " # run randomized regression\n",
221
+ " clf = LogisticRegression(\n",
222
+ " penalty=\"l1\",\n",
223
+ " C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
224
+ " solver=\"liblinear\",\n",
225
+ " multi_class=\"auto\",\n",
226
+ " max_iter=500,\n",
227
+ " class_weight=\"balanced\",\n",
228
+ " )\n",
229
+ "\n",
230
+ " # sample indices to subsample matrix\n",
231
+ " selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)\n",
232
+ "\n",
233
+ " # fit\n",
234
+ " try:\n",
235
+ " clf.fit(X[selection], y[selection])\n",
236
+ " except ValueError:\n",
237
+ " continue\n",
238
+ "\n",
239
+ " # record coefficients\n",
240
+ " if n_classes == 2:\n",
241
+ " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
242
+ " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
243
+ " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
244
+ " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
245
+ " else:\n",
246
+ " pos_scores += clf.coef_ > 0\n",
247
+ " neg_scores += clf.coef_ < 0"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "# normalize\n",
257
+ "pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value\n",
258
+ "neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value\n",
259
+ "\n",
260
+ "# get only active features\n",
261
+ "pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)\n",
262
+ "neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)\n",
263
+ "\n",
264
+ "# prepare DataFrame\n",
265
+ "pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]\n",
266
+ "neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]\n",
267
+ "\n",
268
+ "posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)\n",
269
+ "negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": null,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": []
278
+ }
279
+ ]
280
+ }
pytest.ini ADDED
@@ -0,0 +1,4 @@
+ [pytest]
+ markers =
+     cache_tests: mark a test which is about the recurrence computer cache
+     seed_tests: mark a test which is about the seed sequence
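The two markers registered above keep pytest from warning about unknown markers and allow selecting subsets of tests from the command line. An illustrative sketch of how they would be used; the test module and assertions are hypothetical and not part of this commit:

import pytest


@pytest.mark.cache_tests
def test_cache_is_reused():
    assert True  # placeholder


@pytest.mark.seed_tests
def test_seed_sequence_is_deterministic():
    assert True  # placeholder

Running only one group would then be, e.g., `pytest -m cache_tests`.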
requirements.txt ADDED
@@ -0,0 +1,53 @@
+ pytest==5.4.2
+ pytest-cov==2.8.1
+ sphinx==3.0.4
+ black==19.10b0
+ jupyterlab==2.1.4
+ pandas==1.2.4
+ jupytext==1.5.1
+ nbval==0.9.6
+ textacy==0.11.0
+ streamlit==0.81.1
+ spacy==3.0.6
+ numpy==1.20.2
+ scikit-learn==0.24.2
+ xlrd==2.0.1
+ openpyxl==3.0.7
+ stqdm==0.0.3
+ watchdog==2.1.0
+ flake8
+
+ # english
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm
+ # italian
+ https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.0.0/it_core_news_sm-3.0.0.tar.gz#egg=it_core_news_sm
+ # german
+ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm
+ # spanish
+ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.0.0/es_core_news_sm-3.0.0.tar.gz#egg=es_core_news_sm
+ # greek
+ https://github.com/explosion/spacy-models/releases/download/el_core_news_sm-3.0.0/el_core_news_sm-3.0.0.tar.gz#egg=el_core_news_sm
+ # dutch
+ https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.0.0/nl_core_news_sm-3.0.0.tar.gz#egg=nl_core_news_sm
+ # portuguese
+ https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.0.0/pt_core_news_sm-3.0.0.tar.gz#egg=pt_core_news_sm
+ # french
+ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.0.0/fr_core_news_sm-3.0.0.tar.gz#egg=fr_core_news_sm
+ # chinese
+ # https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0.tar.gz#egg=zh_core_web_sm
+ # danish
+ https://github.com/explosion/spacy-models/releases/download/da_core_news_sm-3.0.0/da_core_news_sm-3.0.0.tar.gz#egg=da_core_news_sm
+ # japanese
+ # https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.0.0/ja_core_news_sm-3.0.0.tar.gz#egg=ja_core_news_sm
+ # lithuanian
+ https://github.com/explosion/spacy-models/releases/download/lt_core_news_sm-3.0.0/lt_core_news_sm-3.0.0.tar.gz#egg=lt_core_news_sm
+ # norwegian
+ https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.0.0/nb_core_news_sm-3.0.0.tar.gz#egg=nb_core_news_sm
+ # polish
+ https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.0.0/pl_core_news_sm-3.0.0.tar.gz#egg=pl_core_news_sm
+ # romanian
+ https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.0.0/ro_core_news_sm-3.0.0.tar.gz#egg=ro_core_news_sm
+ # russian
+ https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.0.0/ru_core_news_sm-3.0.0.tar.gz#egg=ru_core_news_sm
+ # multi-language
+ https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.0.0/xx_ent_wiki_sm-3.0.0.tar.gz#egg=xx_ent_wiki_sm
src/__init__.py ADDED
File without changes
src/configs.py ADDED
@@ -0,0 +1,36 @@
+ from enum import Enum
+ import pandas as pd
+
+
+ class ModelConfigs(Enum):
+     NUM_ITERS = 500
+     SELECTION_THRESHOLD = 0.0
+     PENALTIES = [10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001]
+     MAX_SELECTION = 100_000
+     MIN_SELECTION = 10_000
+
+
+ class Languages(Enum):
+     English = "en_core_web_sm"
+     Italian = "it_core_news_sm"
+     German = "de_core_news_sm"
+     Spanish = "es_core_news_sm"
+     Greek = "el_core_news_sm"
+     Dutch = "nl_core_news_sm"
+     Portuguese = "pt_core_news_sm"
+     French = "fr_core_news_sm"
+     Chinese = "zh_core_web_sm"
+     Danish = "da_core_news_sm"
+     Japanese = "ja_core_news_sm"
+     Lithuanian = "lt_core_news_sm"
+     Norwegian = "nb_core_news_sm"
+     Polish = "pl_core_news_sm"
+     Romanian = "ro_core_news_sm"
+     Russian = "ru_core_news_sm"
+     MultiLanguage = "xx_ent_wiki_sm"
+
+
+ class SupportedFiles(Enum):
+     xlsx = (pd.read_excel,)
+     csv = (pd.read_csv,)
+     parquet = (pd.read_parquet,)
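A note on SupportedFiles: the pandas readers are wrapped in one-element tuples because a bare function assigned inside an Enum body is treated as a method rather than as a member; the tuple keeps it as a plain value. A small sketch of the intended dispatch, mirroring read_file in src/utils.py (the file name is hypothetical):

from src.configs import SupportedFiles

file_name = "reviews.csv"  # hypothetical upload
ext = file_name.split(".")[-1]
read_f = SupportedFiles[ext].value[0]  # unwrap the one-element tuple -> pd.read_csv
df = read_f(file_name, dtype=str)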
src/pages/about.py ADDED
@@ -0,0 +1,34 @@
+ import streamlit as st
+
+
+ def write(*args):
+     # ==== Contacts ==== #
+     with st.beta_container():
+         st.markdown("")
+         st.markdown("")
+         st.header(":rocket: About us")
+
+         st.markdown(
+             """
+             You can reach out to us via email, phone, or - if you are old-fashioned - via mail
+             """
+         )
+         with st.beta_expander("Contacts"):
+
+             _, col2 = st.beta_columns([0.5, 3])
+             col2.markdown(
+                 """
+                 :email: [email protected]
+
+                 :telephone_receiver: +39 02 5836 2604
+
+                 :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
+                 """
+             )
+
+             st.write(
+                 """
+                 <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
+                 """,
+                 unsafe_allow_html=True,
+             )
src/pages/faq.py ADDED
@@ -0,0 +1,112 @@
+ import streamlit as st
+ from src.configs import Languages
+
+
+ def write(*args):
+
+     # ==== HOW IT WORKS ==== #
+     with st.beta_container():
+         st.markdown("")
+         st.markdown("")
+         st.header("How it works")
+         st.subheader("Step 1 - Prepare your data")
+         st.markdown(
+             """
+             Create an Excel or CSV file with two columns for each row:
+
+             - a column with the name or the label identifying a specific object or class (e.g., in our
+             wine example above it would be the type of wine or the name of a specific brand). It is
+             common practice to name this column `label`
+
+             - a column with the text describing that specific object or class (e.g., in the wine example
+             above it could be the description that you find on the rear of the bottle label). It is
+             common practice to name this column `text`
+
+             To have reliable results, we suggest providing at least 2000 labelled texts. If you provide
+             fewer, we will still wordify your file, but the results should then be taken with a grain of
+             salt.
+
+             Consider that we also support multi-language texts, therefore you'll be able to
+             automatically discriminate between international wines, even if your preferred Italian
+             producer does not provide you with a description written in English!
+             """
+         )
+
+         st.subheader("Step 2 - Upload your file and Wordify!")
+         st.markdown(
+             """
+             Once you have prepared your Excel or CSV file, click the "Browse File" button.
+             Browse for your file.
+             Choose the language of your texts (select multi-language if your file contains text in
+             different languages).
+             Push the "Wordify!" button, sit back, and wait for Wordify to do its tricks.
+
+             Depending on the size of your data, the process can take from 1 minute to 5 minutes.
+             """
+         )
+
+     # ==== FAQ ==== #
+     with st.beta_container():
+         st.markdown("")
+         st.markdown("")
+         st.header(":question: Frequently Asked Questions")
+         with st.beta_expander("What is Wordify?"):
+             st.markdown(
+                 """
+                 Wordify is a way to find out which terms are most indicative for each of your dependent
+                 variable values.
+                 """
+             )
+
+         with st.beta_expander("What happens to my data?"):
+             st.markdown(
+                 """
+                 Nothing. We never store the data you upload on disk: it is only kept in memory for the
+                 duration of the modeling, and then deleted. We do not retain any copies or traces of
+                 your data.
+                 """
+             )
+
+         with st.beta_expander("What input formats do you support?"):
+             st.markdown(
+                 """
+                 The file you upload should be .xlsx or .csv, with two columns: the first should be labeled
+                 'text' and contain all your documents (e.g., tweets, reviews, patents, etc.), one per
+                 line. The second column should be labeled 'label', and contain the dependent variable
+                 label associated with each text (e.g., rating, author gender, company, etc.).
+                 """
+             )
+
+         with st.beta_expander("How does it work?"):
+             st.markdown(
+                 """
+                 It uses a variant of the Stability Selection algorithm
+                 [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
+                 to fit hundreds of logistic regression models on random subsets of the data, using
+                 different L1 penalties to drive as many of the term coefficients to 0. Any terms that
+                 receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
+                 indicators.
+                 """
+             )
+
+         with st.beta_expander("How much data do I need?"):
+             st.markdown(
+                 """
+                 We recommend at least 2000 instances, the more, the better. With fewer instances, the
+                 results are less replicable and reliable.
+                 """
+             )
+
+         with st.beta_expander("Is there a paper I can cite?"):
+             st.markdown(
+                 """
+                 Yes please! Reference coming soon...
+                 """
+             )
+
+         with st.beta_expander("What languages are supported?"):
+             st.markdown(
+                 f"""
+                 Currently we support: {", ".join([i.name for i in Languages])}.
+                 """
+             )
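The "How does it work?" answer above corresponds to the wordifier function in src/utils.py. A condensed, illustrative sketch of the stability-selection idea it describes (random L1 penalties, bootstrap subsamples, counting how often each term keeps a non-zero coefficient), assuming an already vectorized X and encoded y:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample


def stability_scores(X, y, penalties=(1.0, 0.5, 0.1, 0.05), n_iters=100):
    """Fraction of runs in which each feature receives a non-zero coefficient."""
    n_instances, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_iters):
        clf = LogisticRegression(
            penalty="l1",
            C=float(np.random.choice(penalties)),  # random L1 strength per run
            solver="liblinear",
            class_weight="balanced",
        )
        idx = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=n_instances)
        clf.fit(X[idx], y[idx])
        counts += (np.abs(clf.coef_) > 0).any(axis=0)  # was the term selected in this run?
    return counts / n_iters  # terms above ~0.3 are the "stable" indicators

The actual wordifier additionally keeps separate positive and negative counts per class, subsamples according to the heuristic in ModelConfigs, and normalizes by NUM_ITERS.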
src/pages/home.py ADDED
@@ -0,0 +1,170 @@
1
+ from src.configs import Languages
2
+ from src.utils import (
3
+ encode,
4
+ wordifier,
5
+ download_button,
6
+ TextPreprocessor,
7
+ plot_labels_prop,
8
+ plot_nchars,
9
+ plot_score,
10
+ get_logo,
11
+ read_file,
12
+ )
13
+ import streamlit as st
14
+
15
+
16
+ def write(session, uploaded_file):
17
+
18
+ if uploaded_file:
19
+
20
+ # 1. READ FILE
21
+ with st.spinner("Reading file"):
22
+ # TODO: write parser function that automatically understands format
23
+ data = read_file(uploaded_file)
24
+
25
+ # 2. CREATE UI TO SELECT COLUMNS
26
+ st.markdown("")
27
+ st.markdown("")
28
+ st.header("Process")
29
+
30
+ col1, col2, col3 = st.beta_columns(3)
31
+ with col1:
32
+ language = st.selectbox("Select language", [i.name for i in Languages])
33
+ with st.beta_expander("Description"):
34
+ st.markdown(
35
+ f"Select a language of text amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}"
36
+ )
37
+ with col2:
38
+ cols_options = [""] + data.columns.tolist()
39
+ label_column = st.selectbox("Select label column name", cols_options, index=0)
40
+ with st.beta_expander("Description"):
41
+ st.markdown("Select the column containing the label")
42
+
43
+ if label_column:
44
+ st.altair_chart(plot_labels_prop(data, label_column), use_container_width=True)
45
+
46
+ with col3:
47
+ text_column = st.selectbox("Select text column name", cols_options, index=0)
48
+ with st.beta_expander("Description"):
49
+ st.markdown("Select the column containing the text")
50
+
51
+ if text_column:
52
+ st.altair_chart(plot_nchars(data, text_column), use_container_width=True)
53
+
54
+ with st.beta_expander("Advanced options"):
55
+ # Lemmatization option
56
+ col1, col2 = st.beta_columns([1, 3])
57
+ with col1:
58
+ lemmatization_when_elem = st.empty()
59
+ with col2:
60
+ st.markdown("Choose lemmatization option")
61
+
62
+ # stopwords option
63
+ col1, col2 = st.beta_columns([1, 3])
64
+ with col1:
65
+ remove_stopwords_elem = st.empty()
66
+ with col2:
67
+ st.markdown("Choose stopword option")
68
+
69
+ # cleaning steps
70
+ col1, col2 = st.beta_columns([1, 3])
71
+ with col1:
72
+ cleaning_steps_elem = st.empty()
73
+ reset_button = st.empty()
74
+ with col2:
75
+ st.markdown("Choose cleaning steps")
76
+
77
+ # implement reset logic
78
+ if reset_button.button("Reset steps"):
79
+ session.run_id += 1
80
+
81
+ steps_options = list(TextPreprocessor._cleaning_options().keys())
82
+ cleaning_steps = cleaning_steps_elem.multiselect(
83
+ "Select text processing steps (ordered)",
84
+ options=steps_options,
85
+ default=steps_options,
86
+ format_func=lambda x: x.replace("_", " ").title(),
87
+ key=session.run_id,
88
+ )
89
+ lemmatization_options = list(TextPreprocessor._lemmatization_options().keys())
90
+ lemmatization_when = lemmatization_when_elem.selectbox(
91
+ "Select when lemmatization happens",
92
+ options=lemmatization_options,
93
+ index=0,
94
+ key=session.run_id,
95
+ )
96
+ remove_stopwords = remove_stopwords_elem.checkbox("Remove stopwords", value=True, key=session.run_id)
97
+
98
+ # Show sample checkbox
99
+ col1, col2 = st.beta_columns([1, 2])
100
+ with col1:
101
+ show_sample = st.checkbox("Show sample of preprocessed text")
102
+
103
+ # initialize text preprocessor
104
+ preprocessor = TextPreprocessor(
105
+ language=language,
106
+ cleaning_steps=cleaning_steps,
107
+ lemmatizer_when=lemmatization_when,
108
+ remove_stop=remove_stopwords,
109
+ )
110
+
111
+ # 3. PROVIDE FEEDBACK ON OPTIONS
112
+ if show_sample and not (label_column and text_column):
113
+ st.warning("Please select `label` and `text` columns")
114
+
115
+ elif show_sample and (label_column and text_column):
116
+ sample_data = data.sample(10)
117
+ sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(sample_data[text_column]).values
118
+ st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])
119
+
120
+ # 4. RUN
121
+ run_button = st.button("Wordify!")
122
+ if run_button and not (label_column and text_column):
123
+ st.warning("Please select `label` and `text` columns")
124
+
125
+ elif run_button and (label_column and text_column) and not session.process:
126
+ # data = data.head()
127
+ data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(data[text_column]).values
128
+
129
+ inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
130
+ session.posdf, session.negdf = wordifier(**inputs)
131
+ st.success("Wordified!")
132
+
133
+ # session.posdf, session.negdf = process(data, text_column, label_column)
134
+ session.process = True
135
+
136
+ # 5. RESULTS
137
+ if session.process and (label_column and text_column):
138
+ st.markdown("")
139
+ st.markdown("")
140
+ st.header("Results")
141
+
142
+ # col1, col2, _ = st.beta_columns(3)
143
+ col1, col2, col3 = st.beta_columns([2, 3, 3])
144
+
145
+ with col1:
146
+ label = st.selectbox("Select label", data[label_column].unique().tolist())
147
+ # # with col2:
148
+ # thres = st.slider(
149
+ # "Select threshold",
150
+ # min_value=0,
151
+ # max_value=100,
152
+ # step=1,
153
+ # format="%f",
154
+ # value=30,
155
+ # )
156
+ show_plots = st.checkbox("Show plots of top 100")
157
+
158
+ with col2:
159
+ st.subheader(f"Words __positively__ identifying label `{label}`")
160
+ st.write(session.posdf[session.posdf[label_column] == label].sort_values("score", ascending=False))
161
+ download_button(session.posdf, "positive_data")
162
+ if show_plots:
163
+ st.altair_chart(plot_score(session.posdf, label_column, label), use_container_width=True)
164
+
165
+ with col3:
166
+ st.subheader(f"Words __negatively__ identifying label `{label}`")
167
+ st.write(session.negdf[session.negdf[label_column] == label].sort_values("score", ascending=False))
168
+ download_button(session.negdf, "negative_data")
169
+ if show_plots:
170
+ st.altair_chart(plot_score(session.negdf, label_column, label), use_container_width=True)
src/session_state.py ADDED
@@ -0,0 +1,117 @@
1
+ """Hack to add per-session state to Streamlit.
2
+
3
+ Usage
4
+ -----
5
+
6
+ >>> import SessionState
7
+ >>>
8
+ >>> session_state = SessionState.get(user_name='', favorite_color='black')
9
+ >>> session_state.user_name
10
+ ''
11
+ >>> session_state.user_name = 'Mary'
12
+ >>> session_state.favorite_color
13
+ 'black'
14
+
15
+ Since you set user_name above, next time your script runs this will be the
16
+ result:
17
+ >>> session_state = get(user_name='', favorite_color='black')
18
+ >>> session_state.user_name
19
+ 'Mary'
20
+
21
+ """
22
+ try:
23
+ import streamlit.ReportThread as ReportThread
24
+ from streamlit.server.Server import Server
25
+ except Exception:
26
+ # Streamlit >= 0.65.0
27
+ import streamlit.report_thread as ReportThread
28
+ from streamlit.server.server import Server
29
+
30
+
31
+ class SessionState(object):
32
+ def __init__(self, **kwargs):
33
+ """A new SessionState object.
34
+
35
+ Parameters
36
+ ----------
37
+ **kwargs : any
38
+ Default values for the session state.
39
+
40
+ Example
41
+ -------
42
+ >>> session_state = SessionState(user_name='', favorite_color='black')
43
+ >>> session_state.user_name
+ ''
+ >>> session_state.user_name = 'Mary'
45
+ >>> session_state.favorite_color
46
+ 'black'
47
+
48
+ """
49
+ for key, val in kwargs.items():
50
+ setattr(self, key, val)
51
+
52
+
53
+ def get(**kwargs):
54
+ """Gets a SessionState object for the current session.
55
+
56
+ Creates a new object if necessary.
57
+
58
+ Parameters
59
+ ----------
60
+ **kwargs : any
61
+ Default values you want to add to the session state, if we're creating a
62
+ new one.
63
+
64
+ Example
65
+ -------
66
+ >>> session_state = get(user_name='', favorite_color='black')
67
+ >>> session_state.user_name
68
+ ''
69
+ >>> session_state.user_name = 'Mary'
70
+ >>> session_state.favorite_color
71
+ 'black'
72
+
73
+ Since you set user_name above, next time your script runs this will be the
74
+ result:
75
+ >>> session_state = get(user_name='', favorite_color='black')
76
+ >>> session_state.user_name
77
+ 'Mary'
78
+
79
+ """
80
+ # Hack to get the session object from Streamlit.
81
+
82
+ ctx = ReportThread.get_report_ctx()
83
+
84
+ this_session = None
85
+
86
+ current_server = Server.get_current()
87
+ if hasattr(current_server, "_session_infos"):
88
+ # Streamlit < 0.56
89
+ session_infos = Server.get_current()._session_infos.values()
90
+ else:
91
+ session_infos = Server.get_current()._session_info_by_id.values()
92
+
93
+ for session_info in session_infos:
94
+ s = session_info.session
95
+ if (
96
+ # Streamlit < 0.54.0
97
+ (hasattr(s, "_main_dg") and s._main_dg == ctx.main_dg)
98
+ or
99
+ # Streamlit >= 0.54.0
100
+ (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
101
+ or
102
+ # Streamlit >= 0.65.2
103
+ (not hasattr(s, "_main_dg") and s._uploaded_file_mgr == ctx.uploaded_file_mgr)
104
+ ):
105
+ this_session = s
106
+
107
+ if this_session is None:
108
+ raise RuntimeError(
109
+ "Oh noes. Couldn't get your Streamlit Session object. " "Are you doing something fancy with threads?"
110
+ )
111
+
112
+ # Got the session object! Now let's attach some state into it.
113
+
114
+ if not hasattr(this_session, "_custom_session_state"):
115
+ this_session._custom_session_state = SessionState(**kwargs)
116
+
117
+ return this_session._custom_session_state
src/utils.py ADDED
@@ -0,0 +1,335 @@
1
+ import base64
2
+ import re
3
+ from collections import OrderedDict
4
+ from typing import Callable, Dict, List
5
+
6
+ import altair as alt
7
+ import numpy as np
8
+ import pandas as pd
9
+ import spacy
10
+ import streamlit as st
11
+ from pandas.core.series import Series
12
+ from PIL import Image
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from sklearn.linear_model import LogisticRegression
15
+ from sklearn.preprocessing import LabelEncoder
16
+ from sklearn.utils import resample
17
+ from stqdm import stqdm
18
+ from textacy.preprocessing import make_pipeline, normalize, remove, replace
19
+
20
+ from .configs import Languages, ModelConfigs, SupportedFiles
21
+
22
+ stqdm.pandas()
23
+
24
+
25
+ @st.cache
26
+ def get_logo(path):
27
+ return Image.open(path)
28
+
29
+
30
+ # @st.cache(suppress_st_warning=True)
31
+ def read_file(uploaded_file) -> pd.DataFrame:
32
+
33
+ file_type = uploaded_file.name.split(".")[-1]
34
+ if file_type in set(i.name for i in SupportedFiles):
35
+ read_f = SupportedFiles[file_type].value[0]
36
+ return read_f(uploaded_file, dtype=str)
37
+
38
+ else:
39
+ st.error("File type not supported")
40
+
41
+
42
+ def download_button(dataframe: pd.DataFrame, name: str):
43
+ csv = dataframe.to_csv(index=False)
44
+ # some strings <-> bytes conversions necessary here
45
+ b64 = base64.b64encode(csv.encode()).decode()
46
+ href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
47
+ st.write(href, unsafe_allow_html=True)
48
+
49
+
50
+ def encode(text: pd.Series, labels: pd.Series):
51
+ tfidf_vectorizer = TfidfVectorizer(
52
+ input="content", # default: file already in memory
53
+ encoding="utf-8", # default
54
+ decode_error="strict", # default
55
+ strip_accents=None, # do nothing
56
+ lowercase=False, # do nothing
57
+ preprocessor=None, # do nothing - default
58
+ tokenizer=None, # default
59
+ stop_words=None, # do nothing
60
+ analyzer="word",
61
+ ngram_range=(1, 3), # maximum 3-ngrams
62
+ min_df=0.001,
63
+ max_df=0.75,
64
+ sublinear_tf=True,
65
+ )
66
+ label_encoder = LabelEncoder()
67
+
68
+ with st.spinner("Encoding text using TF-IDF and Encoding labels"):
69
+ X = tfidf_vectorizer.fit_transform(text.values)
70
+ y = label_encoder.fit_transform(labels.values)
71
+
72
+ return {
73
+ "X": X,
74
+ "y": y,
75
+ "X_names": np.array(tfidf_vectorizer.get_feature_names()),
76
+ "y_names": label_encoder.classes_,
77
+ }
78
+
79
+
80
+ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
81
+
82
+ n_instances, n_features = X.shape
83
+ n_classes = len(y_names)
84
+
85
+ # NOTE: the * 10 / 10 trick is to have "nice" round-ups
86
+ sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
87
+
88
+ sample_size = min(
89
+ # this is the maximum supported
90
+ configs.MAX_SELECTION.value,
91
+ # at minimum you want MIN_SELECTION but in general you want
92
+ # n_instances * sample_fraction
93
+ max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
94
+ # however if the previous one is bigger than the available instances, take
95
+ # the number of available instances
96
+ n_instances,
97
+ )
98
+
99
+ # TODO: might want to try out something to subsample features at each iteration
100
+
101
+ # initialize coefficient matrices
102
+ pos_scores = np.zeros((n_classes, n_features), dtype=int)
103
+ neg_scores = np.zeros((n_classes, n_features), dtype=int)
104
+
105
+ with st.spinner("Wordifying!"):
106
+
107
+ for _ in stqdm(range(configs.NUM_ITERS.value)):
108
+
109
+ # run randomized regression
110
+ clf = LogisticRegression(
111
+ penalty="l1",
112
+ C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
113
+ solver="liblinear",
114
+ multi_class="auto",
115
+ max_iter=500,
116
+ class_weight="balanced",
117
+ )
118
+
119
+ # sample indices to subsample matrix
120
+ selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
121
+
122
+ # fit
123
+ try:
124
+ clf.fit(X[selection], y[selection])
125
+ except ValueError:
126
+ continue
127
+
128
+ # record coefficients
129
+ if n_classes == 2:
130
+ pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
131
+ neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
132
+ pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
133
+ neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
134
+ else:
135
+ pos_scores += clf.coef_ > 0
136
+ neg_scores += clf.coef_ < 0
137
+
138
+ # normalize
139
+ pos_scores = pos_scores / configs.NUM_ITERS.value
140
+ neg_scores = neg_scores / configs.NUM_ITERS.value
141
+
142
+ # get only active features
143
+ pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
144
+ neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
145
+
146
+ # prepare DataFrame
147
+ pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
148
+ neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
149
+
150
+ posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
151
+ negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
152
+
153
+ return posdf, negdf
154
+
155
+
156
+ # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
157
+ # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
158
+ _re_space = re.compile(" {2,}")
159
+
160
+
161
+ def normalize_useless_spaces(t):
162
+ return _re_space.sub(" ", t)
163
+
164
+
165
+ _re_rep = re.compile(r"(\S)(\1{2,})")
166
+
167
+
168
+ def normalize_repeating_chars(t):
169
+ def _replace_rep(m):
170
+ c, cc = m.groups()
171
+ return c
172
+
173
+ return _re_rep.sub(_replace_rep, t)
174
+
175
+
176
+ _re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
177
+
178
+
179
+ def normalize_repeating_words(t):
180
+ def _replace_wrep(m):
181
+ c, cc, e = m.groups()
182
+ return c
183
+
184
+ return _re_wrep.sub(_replace_wrep, t)
185
+
186
+
187
+ class TextPreprocessor:
188
+ def __init__(
189
+ self, language: str, cleaning_steps: List[str], lemmatizer_when: str = "last", remove_stop: bool = True
190
+ ) -> None:
191
+ # prepare lemmatizer
192
+ self.language = language
193
+ self.nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])
194
+ self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
195
+ self.remove_stop = remove_stop
196
+ self._lemmatize = self._get_lemmatizer()
197
+
198
+ # prepare cleaning
199
+ self.cleaning_steps = [
200
+ self._cleaning_options()[step] for step in cleaning_steps if step in self._cleaning_options()
201
+ ]
202
+ self.cleaning_pipeline = make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
203
+
204
+ def _get_lemmatizer(self) -> Callable:
205
+ """Return the correct spacy Doc-level lemmatizer"""
206
+ if self.remove_stop:
207
+
208
+ def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
209
+ """Lemmatizes spacy Doc and removes stopwords"""
210
+ return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
211
+
212
+ else:
213
+
214
+ def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
215
+ """Lemmatizes spacy Doc"""
216
+ return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
217
+
218
+ return lemmatizer
219
+
220
+ @staticmethod
221
+ def _lemmatization_options() -> Dict[str, str]:
222
+ return {
223
+ "Before preprocessing": "first",
224
+ "After preprocessing": "last",
225
+ "Never! Let's do it quick and dirty": None,
226
+ }
227
+
228
+ def lemmatizer(self, series: pd.Series) -> pd.Series:
229
+ """
230
+ Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
231
+ """
232
+ res = []
233
+ pbar = stqdm(total=len(series))
234
+ for doc in self.nlp.pipe(series, batch_size=500):
235
+ res.append(self._lemmatize(doc))
236
+ pbar.update(1)
237
+ pbar.close()
238
+ return pd.Series(res)
239
+
240
+ @staticmethod
241
+ def _cleaning_options():
242
+ """Returns available cleaning steps in order"""
243
+ return OrderedDict(
244
+ [
245
+ ("lower", lambda x: x.lower()),
246
+ ("normalize_unicode", normalize.unicode),
247
+ ("normalize_bullet_points", normalize.bullet_points),
248
+ ("normalize_hyphenated_words", normalize.hyphenated_words),
249
+ ("normalize_quotation_marks", normalize.quotation_marks),
250
+ ("normalize_whitespace", normalize.whitespace),
251
+ ("remove_accents", remove.accents),
252
+ ("remove_brackets", remove.brackets),
253
+ ("remove_html_tags", remove.html_tags),
254
+ ("remove_punctuation", remove.punctuation),
255
+ ("replace_currency_symbols", replace.currency_symbols),
256
+ ("replace_emails", replace.emails),
257
+ ("replace_emojis", replace.emojis),
258
+ ("replace_hashtags", replace.hashtags),
259
+ ("replace_numbers", replace.numbers),
260
+ ("replace_phone_numbers", replace.phone_numbers),
261
+ ("replace_urls", replace.urls),
262
+ ("replace_user_handles", replace.user_handles),
263
+ ("normalize_useless_spaces", normalize_useless_spaces),
264
+ ("normalize_repeating_chars", normalize_repeating_chars),
265
+ ("normalize_repeating_words", normalize_repeating_words),
266
+ ("strip", lambda x: x.strip()),
267
+ ]
268
+ )
269
+
270
+ def fit_transform(self, series: pd.Series) -> Series:
271
+ """Applies text preprocessing"""
272
+
273
+ if self.lemmatizer_when == "first":
274
+ with st.spinner("Lemmatizing"):
275
+ series = self.lemmatizer(series)
276
+
277
+ with st.spinner("Cleaning"):
278
+ series = series.progress_map(self.cleaning_pipeline)
279
+
280
+ if self.lemmatizer_when == "last":
281
+ with st.spinner("Lemmatizing"):
282
+ series = self.lemmatizer(series)
283
+
284
+ return series
285
+
286
+
287
+ def plot_labels_prop(data: pd.DataFrame, label_column: str):
288
+
289
+ source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
290
+
291
+ source["Proportions"] = ((source["Counts"] / source["Counts"].sum()).round(3) * 100).map("{:,.2f}".format) + "%"
292
+
293
+ bars = (
294
+ alt.Chart(source)
295
+ .mark_bar()
296
+ .encode(
297
+ x="Labels:O",
298
+ y="Counts:Q",
299
+ )
300
+ )
301
+
302
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
303
+
304
+ return (bars + text).properties(height=300)
305
+
306
+
307
+ def plot_nchars(data: pd.DataFrame, text_column: str):
308
+ source = data[text_column].str.len().to_frame()
309
+
310
+ plot = (
311
+ alt.Chart(source)
312
+ .mark_bar()
313
+ .encode(
314
+ alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
315
+ alt.Y("count()", axis=alt.Axis(title="")),
316
+ )
317
+ )
318
+
319
+ return plot.properties(height=300)
320
+
321
+
322
+ def plot_score(data: pd.DataFrame, label_col: str, label: str):
323
+
324
+ source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
325
+
326
+ plot = (
327
+ alt.Chart(source)
328
+ .mark_bar()
329
+ .encode(
330
+ y=alt.Y("word:O", sort="-x"),
331
+ x="score:Q",
332
+ )
333
+ )
334
+
335
+ return plot.properties(height=max(30 * source.shape[0], 50))
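To make the subsample-size heuristic used in wordifier concrete, here is a small worked example with hypothetical dataset sizes; the bounds are the ones defined in ModelConfigs:

import numpy as np

MAX_SELECTION, MIN_SELECTION = 100_000, 10_000  # ModelConfigs values


def sample_size(n_instances, n_features):
    # the * 10 / 10 trick rounds the feature/instance ratio up to one decimal
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
    return min(MAX_SELECTION, max(MIN_SELECTION, int(n_instances * sample_fraction)), n_instances)


print(sample_size(50_000, 20_000))  # ratio 0.4 -> 20000 rows per bootstrap sample
print(sample_size(6_269, 30_000))   # ratio rounds up to 4.8 -> capped at the 6269 available rows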