Spaces:

sklearn-docs
/

text-feature-extraction-evaluation

Sleeping

App Files Files Community

text-feature-extraction-evaluation / app.py

dominguesm

App gradio

d79693f about 2 years ago

raw

history blame

7.34 kB

	import math

	import gradio as gr
	import numpy as np
	import pandas as pd
	import plotly.express as px
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import RandomizedSearchCV
	from sklearn.naive_bayes import ComplementNB
	from sklearn.pipeline import Pipeline

	# Newsgroup names offered as choices in the "Categories" dropdown of the UI;
	# the user's selection is passed to train_model() as `categories`.
	CATEGORIES = [
	"alt.atheism",
	"comp.graphics",
	"comp.os.ms-windows.misc",
	"comp.sys.ibm.pc.hardware",
	"comp.sys.mac.hardware",
	"comp.windows.x",
	"misc.forsale",
	"rec.autos",
	"rec.motorcycles",
	"rec.sport.baseball",
	"rec.sport.hockey",
	"sci.crypt",
	"sci.electronics",
	"sci.med",
	"sci.space",
	"soc.religion.christian",
	"talk.politics.guns",
	"talk.politics.mideast",
	"talk.politics.misc",
	"talk.religion.misc",
	]


	# Hyperparameter search space handed to RandomizedSearchCV in train_model().
	# Keys use the "<step>__<parameter>" form, where the step names ("vect",
	# "clf") match the names given to the Pipeline steps in train_model().
	PARAMETER_GRID = {
	"vect__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
	"vect__min_df": (1, 3, 5, 10),
	"vect__ngram_range": ((1, 1), (1, 2)), # unigrams or bigrams
	"vect__norm": ("l1", "l2"),
	# 13 values spaced evenly on a log scale from 1e-6 to 1e6
	"clf__alpha": np.logspace(-6, 6, 13),
	}


	def shorten_param(param_name):
	    """Return the part of ``param_name`` that follows its last ``"__"``.

	    A name that contains no ``"__"`` separator is returned unchanged.
	    """
	    # rpartition yields ("", "", param_name) when the separator is absent,
	    # so the last element is the right answer in both cases.
	    _prefix, _separator, short_name = param_name.rpartition("__")
	    return short_name


	def train_model(categories):
	    """Run a randomized hyperparameter search on 20 newsgroups and plot it.

	    A ``TfidfVectorizer`` + ``ComplementNB`` pipeline is tuned with
	    ``RandomizedSearchCV`` over ``PARAMETER_GRID`` on the "train" subset,
	    then scored on the "test" subset.

	    Parameters
	    ----------
	    categories : list of str or None
	        Newsgroup names forwarded to ``fetch_20newsgroups`` for both the
	        train and the test subset.

	    Returns
	    -------
	    tuple
	        ``(fig, fig2, best_parameters, test_accuracy)`` where ``fig`` is the
	        scatter plot of score time against CV score, ``fig2`` is the
	        parallel-coordinates plot of the search results,
	        ``best_parameters`` is the ``get_params()`` dict of the best
	        estimator and ``test_accuracy`` is its score on the test subset.

	    Raises
	    ------
	    gr.Error
	        If ``categories`` is an empty selection.
	    """
	    # An empty selection leaves nothing to train on; fail early with a
	    # message the UI can show instead of an obscure downstream error.
	    if categories is not None and not categories:
	        raise gr.Error("Select at least one category before training.")

	    # Both subsets must be loaded with the same filtering options.
	    fetch_options = {
	        "categories": categories,
	        "shuffle": True,
	        "random_state": 42,
	        "remove": ("headers", "footers", "quotes"),
	    }
	    data_train = fetch_20newsgroups(subset="train", **fetch_options)
	    data_test = fetch_20newsgroups(subset="test", **fetch_options)

	    # Step names must match the prefixes used in PARAMETER_GRID.
	    pipeline = Pipeline(
	        [
	            ("vect", TfidfVectorizer()),
	            ("clf", ComplementNB()),
	        ]
	    )

	    random_search = RandomizedSearchCV(
	        estimator=pipeline,
	        param_distributions=PARAMETER_GRID,
	        n_iter=40,
	        random_state=0,
	        n_jobs=2,
	        verbose=1,
	    )

	    random_search.fit(data_train.data, data_train.target)
	    # NOTE(review): get_params() returns every parameter of the fitted
	    # pipeline, not only the searched ones; `random_search.best_params_`
	    # would be the narrower alternative - confirm which is wanted.
	    best_parameters = random_search.best_estimator_.get_params()

	    test_accuracy = random_search.score(data_test.data, data_test.target)

	    # Strip the "<step>__" prefixes from the result columns so they read
	    # as plain parameter names in the plots.
	    cv_results = pd.DataFrame(random_search.cv_results_)
	    cv_results = cv_results.rename(shorten_param, axis=1)

	    param_names = [shorten_param(name) for name in PARAMETER_GRID]
	    labels = {
	        "mean_score_time": "CV Score time (s)",
	        "mean_test_score": "CV score (accuracy)",
	    }
	    fig = px.scatter(
	        cv_results,
	        x="mean_score_time",
	        y="mean_test_score",
	        error_x="std_score_time",
	        error_y="std_test_score",
	        hover_data=param_names,
	        labels=labels,
	    )
	    fig.update_layout(
	        title={
	            "text": "trade-off between scoring time and mean test score",
	            "y": 0.95,
	            "x": 0.5,
	            "xanchor": "center",
	            "yanchor": "top",
	        }
	    )

	    column_results = param_names + ["mean_test_score", "mean_score_time"]

	    # Every column starts with an identity transform; the non-numeric or
	    # wide-ranged ones are overridden below.
	    transform_funcs = dict.fromkeys(column_results, lambda x: x)
	    # Using a logarithmic scale for alpha
	    transform_funcs["alpha"] = math.log10
	    # L1 norms are mapped to index 1, and L2 norms to index 2
	    transform_funcs["norm"] = lambda x: 2 if x == "l2" else 1
	    # Unigrams are mapped to index 1 and bigrams to index 2
	    transform_funcs["ngram_range"] = lambda x: x[1]

	    fig2 = px.parallel_coordinates(
	        cv_results[column_results].apply(transform_funcs),
	        color="mean_test_score",
	        color_continuous_scale=px.colors.sequential.Viridis_r,
	        labels=labels,
	    )
	    fig2.update_layout(
	        title={
	            "text": "Parallel coordinates plot of text classifier pipeline",
	            "y": 0.99,
	            "x": 0.5,
	            "xanchor": "center",
	            "yanchor": "top",
	        }
	    )

	    return fig, fig2, best_parameters, test_accuracy


	# Introductory text about the dataset; the fragments are joined with single
	# spaces and rendered as one Markdown paragraph in the page header.
	DESCRIPTION_PART1 = [
	"The dataset used in this example is",
	"[The 20 newsgroups text dataset](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset)",
	"which will be automatically downloaded, cached and reused for the document classification example.",
	]

	# Introductory text about the tuning approach; joined and rendered the same
	# way as DESCRIPTION_PART1.
	DESCRIPTION_PART2 = [
	"In this example, we tune the hyperparameters of",
	"a particular classifier using a",
	"[RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV).",
	"For a demo on the performance of some other classifiers, see the",
	"[Classification of text documents using sparse features](https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py) notebook.",
	]

	# Attribution line, rendered as Markdown below the two description paragraphs.
	AUTHOR = """
	Created by [@dominguesm](https://huggingface.co/dominguesm) based on [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html)
	"""


	# Gradio UI: header text, category picker, a static description of the
	# search space and pipeline, a "Train" button, and the result widgets.
	# NOTE(review): the nesting of the `with` blocks below cannot be read from
	# this listing (all statements sit at one indentation level); restore it
	# from the original app.py before running.
	with gr.Blocks(theme=gr.themes.Soft()) as app:
	with gr.Row():
	with gr.Column():
	gr.Markdown("# Sample pipeline for text feature extraction and evaluation")
	gr.Markdown(" ".join(DESCRIPTION_PART1))
	gr.Markdown(" ".join(DESCRIPTION_PART2))
	gr.Markdown(AUTHOR)

	with gr.Row():
	with gr.Column():
	gr.Markdown("""## CATEGORY SELECTION""")
	# Multi-select over CATEGORIES, preselected with two newsgroups and
	# configured with max_choices=2.
	drop_categories = gr.Dropdown(
	CATEGORIES,
	value=["alt.atheism", "talk.religion.misc"],
	multiselect=True,
	label="Categories",
	info="Select the categories you want to train on.",
	max_choices=2,
	interactive=True,
	)
	with gr.Row():
	with gr.Column():
	# Static, hand-written copy of PARAMETER_GRID and of the pipeline shown for
	# display only; it is not generated from the constants, so it has to be
	# kept in sync with them manually.
	gr.Markdown(
	"""
	## PARAMETERS GRID
	```python
	{
	'clf__alpha': array(
	[1.e-06, 1.e-05, 1.e-04,...]
	),
	'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
	'vect__min_df': (1, 3, 5, 10),
	'vect__ngram_range': ((1, 1), (1, 2)),
	'vect__norm': ('l1', 'l2')
	}
	```
	## MODEL PIPELINE
	```python
	pipeline = Pipeline(
	[
	("vect", TfidfVectorizer()),
	("clf", ComplementNB()),
	]
	)
	```
	"""
	)
	with gr.Row():
	with gr.Column():
	gr.Markdown("""## TRAINING""")
	with gr.Row():
	# NOTE(review): `.style()` was deprecated in later Gradio 3.x releases and
	# removed in Gradio 4 - confirm against the Gradio version this app pins.
	brn_train = gr.Button("Train").style(container=False)

	gr.Markdown("## RESULTS")
	with gr.Row():
	best_parameters = gr.Textbox(label="Best parameters")
	test_accuracy = gr.Textbox(label="Test accuracy")

	plot_trade = gr.Plot(label="")
	plot_coordinates = gr.Plot(label="")

	# Clicking "Train" calls train_model with the selected categories; the
	# output components are listed in the same order as train_model's return
	# tuple (scatter figure, parallel-coordinates figure, parameters, accuracy).
	brn_train.click(
	train_model,
	[drop_categories],
	[plot_trade, plot_coordinates, best_parameters, test_accuracy],
	)

	# Start the Gradio server for the Blocks app defined above.
	app.launch()