|
import io |
|
import datetime |
|
from tqdm import tqdm |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
import re |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
|
|
|
|
import spacy |
|
|
|
|
|
from pysentimiento import create_analyzer |
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
from PIL import Image |
|
|
|
import uuid |
|
import streamlit as st |
|
|
|
# One-time NLTK corpus downloads (no-ops if the data is already cached locally).
nltk.download('stopwords')

nltk.download('punkt')
|
|
|
|
|
|
|
def get_sentiment(df,column):
    """Run Spanish sentiment analysis over one DataFrame column.

    Predicts sentiment for every entry of ``df[column]`` with pysentimiento
    and appends four columns in place: 'Polaridad' (predicted label) and
    'sent_NEU' / 'sent_NEG' / 'sent_POS' (class probabilities).

    Returns the same (mutated) DataFrame.
    """
    analyzer = create_analyzer(task="sentiment", lang="es")

    predictions = []
    with tqdm(total=len(df), desc="Analyzing Comments") as progress:
        for text in df[column]:
            predictions.append(analyzer.predict(text))
            progress.update(1)

    df['Polaridad'] = [pred.output for pred in predictions]
    df['sent_NEU'] = [pred.probas.get('NEU', None) for pred in predictions]
    df['sent_NEG'] = [pred.probas.get('NEG', None) for pred in predictions]
    df['sent_POS'] = [pred.probas.get('POS', None) for pred in predictions]
    return df
|
|
|
|
|
def get_emotions(df,column):
    """Run Spanish emotion detection over one DataFrame column.

    Predicts an emotion for every entry of ``df[column]`` with pysentimiento
    and appends columns in place: 'Emocion' (predicted label) plus one
    'emo_<name>' probability column per emotion class.

    Returns the same (mutated) DataFrame.
    """
    analyzer = create_analyzer(task="emotion", lang="es")

    predictions = []
    with tqdm(total=len(df), desc="Analyzing Comments") as progress:
        for text in df[column]:
            predictions.append(analyzer.predict(text))
            progress.update(1)

    df['Emocion'] = [pred.output for pred in predictions]
    # Column order matters for the output CSV: keep the original sequence.
    for emotion in ('anger', 'sadness', 'surprise', 'disgust', 'joy', 'fear', 'others'):
        df[f'emo_{emotion}'] = [pred.probas.get(emotion, None) for pred in predictions]
    return df
|
|
|
class ProcesamientoLenguaje:
    """Spanish text-processing helpers built on spaCy (es_core_news_md)."""

    # Universal POS tags kept by postags_and_stopwords: noun, adjective,
    # proper noun, verb, and 'X' (out-of-vocabulary / unclassified tokens).
    # BUGFIX: the original default used 'VB', which is a Penn Treebank tag
    # (token.tag_); spaCy's token.pos_ emits Universal POS tags, so verbs
    # were silently never kept even though the docstring promised them.
    # A tuple also avoids the mutable-default-argument pitfall.
    DEFAULT_POSTAGS = ('NOUN', 'ADJ', 'PROPN', 'VERB', 'X')

    def __init__(self):
        # parser/ner are disabled: only tokenization + POS tagging is needed.
        self.nlp = spacy.load('es_core_news_md', disable=["parser", "ner"])

    def postags_and_stopwords(self, texts, allowed_postags=DEFAULT_POSTAGS):
        '''Tokenize and POS-tag *texts* in a spaCy pipeline, then keep only
        tokens longer than 2 characters that are not stop words and whose
        POS tag is in *allowed_postags* (noun, adjective, proper noun, verb,
        and anything spaCy could not classify: OOV words, unrecognized
        proper nouns, etc.).

        Returns the surviving tokens joined into one space-separated string.
        '''
        kept = [
            token.text
            for token in self.nlp(texts)
            if token.pos_ in allowed_postags
            and token.text not in stop_words  # module-level Spanish stop words
            and len(token.text) > 2
        ]
        return ' '.join(kept)

    def cleaner(self, word):
        '''Remove URLs, emoticons, mentions, punctuation and other noise
        from a text string.

        Returns the cleaned string.
        '''
        word = re.sub(r'https?\S+', '', word)                     # URLs
        word = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', "", word)  # ASCII emoticons
        word = re.sub(r'ee.uu', 'eeuu', word, flags=re.IGNORECASE)  # normalize "EE.UU"
        word = re.sub(r'\#\.', '', word)
        word = re.sub(r'\n', ' ', word)                           # newlines -> spaces
        word = re.sub(r',', '', word)
        word = re.sub(r'\-', ' ', word)
        word = re.sub(r'\.{3}', ' ', word)                        # ellipses
        word = re.sub(r'a{2,}', 'a', word)                        # collapse letter runs
        word = re.sub(r'é{2,}', 'é', word)
        word = re.sub(r'i{2,}', 'i', word)
        word = re.sub(r'ja{2,}', 'ja', word)                      # laughter ("jaa...")
        word = re.sub(r'[^\w\s@ñ]', '', word, flags=re.UNICODE)   # drop symbols
        word = re.sub(r'\b@\w+\b', '', word)                      # @mentions
        word = re.sub(r'\b\w{1,2}\b', '', word)                   # 1-2 char words
        return word
|
|
|
def grafico_pie(df, column_name='Polaridad'):
    """Save a pie chart of the value counts of *column_name* to a uniquely
    named JPEG file and return that file's path."""
    output_path = f"{uuid.uuid4()}_sentimiento.jpg"
    counts = df[column_name].value_counts()

    plt.figure(figsize=(8, 6))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    plt.title("Distribución de Polaridad")
    plt.savefig(output_path, bbox_inches="tight")
    plt.close()
    return output_path
|
|
|
def grafico_barras(df, column_name='Emocion'):
    """Save an annotated count bar chart of *column_name* to a uniquely
    named JPEG file and return that file's path."""
    output_path = f"{uuid.uuid4()}_sentimiento.jpg"

    plt.figure(figsize=(8, 6))
    axis = sns.countplot(x=column_name, data=df)
    # Write each bar's count just above the bar.
    for bar in axis.patches:
        label = format(bar.get_height())
        center = (bar.get_x() + bar.get_width() / 2., bar.get_height())
        axis.annotate(label, center, ha='center', va='center',
                      xytext=(0, 10), textcoords='offset points')
    plt.xlabel("Emocion")
    plt.ylabel("Cantidad")
    plt.title("Histograma de Emocion")
    plt.savefig(output_path, bbox_inches="tight")
    plt.close()
    return output_path
|
|
|
# Module-level NLP helper shared by procesar_csv.
pln = ProcesamientoLenguaje()

# Spanish stop words; read as a global inside
# ProcesamientoLenguaje.postags_and_stopwords and in procesar_csv.
stop_words = stopwords.words('spanish')
|
|
|
|
|
def procesar_csv(file):
    """Process an uploaded CSV of dated Spanish comments.

    Expects a ';'-delimited CSV with at least the columns 'Fecha'
    (dd/mm/yy) and 'Comentario'. Adds sentiment and emotion columns, plus
    a cleaned, stop-word-free, POS-filtered 'Comentario_clean' column, and
    writes the full result to a uniquely named CSV file on disk.

    Returns a tuple (first 10 rows of the processed DataFrame, output CSV
    path), or an error-message string when *file* is None.
    """
    if file is None:
        return "No se ha cargado ningún archivo."

    # The uploaded file yields raw bytes; decode as UTF-8 before parsing.
    csv_bytes = file.getvalue()
    csv_read_buffer = io.StringIO(csv_bytes.decode('utf-8'))
    df = pd.read_csv(csv_read_buffer, delimiter=';')
    print(df)

    df['Fecha'] = pd.to_datetime(df['Fecha'], format='%d/%m/%y')
    df = get_sentiment(df, "Comentario")
    df = get_emotions(df, "Comentario")

    # Clean raw text, drop stop words, then keep only the allowed POS tags.
    df['Comentario_clean'] = df['Comentario'].apply(pln.cleaner)
    df['Comentario_clean'] = df['Comentario_clean'].apply(
        lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words))
    df['Comentario_clean'] = df['Comentario_clean'].apply(pln.postags_and_stopwords)

    output_file = f"{uuid.uuid4()}_processed_output.csv"
    df.to_csv(output_file, index=False)

    # BUGFIX: the original also called grafico_pie/grafico_barras here and
    # discarded the returned paths — the UI layer regenerates the charts
    # itself, so those calls only left orphaned image files on disk.
    return df.head(10), output_file
|
|
|
|
|
|
|
# --- Streamlit UI: upload a CSV, process it, show previews and charts. ---
st.title("Cargar y visualizar CSV")

st.write("Sube un archivo CSV para ver los primeros registros. El archivo CSV debe tener los campos Fecha y Comentario.")


file = st.file_uploader("Archivo CSV", type=["csv"])


if file is not None:

    # procesar_csv returns (df.head(10), path-to-full-processed-CSV).
    df, processed_file_path = procesar_csv(file)


    st.write("Vista previa del archivo procesado:")

    st.dataframe(df)


    # NOTE(review): `df` here is only the first 10 rows returned by
    # procesar_csv, so these charts reflect just those rows, not the whole
    # dataset — confirm whether that is intended.
    torta_path = grafico_pie(df)

    barras_path = grafico_barras(df)


    st.image(torta_path, caption="Gráfico de torta")

    st.image(barras_path, caption="Gráfico de barras")


    # Offer the full processed CSV (written by procesar_csv) for download.
    with open(processed_file_path, 'rb') as f:

        st.download_button(

            label="Descargar CSV procesado",

            data=f,

            file_name=f"{uuid.uuid4()}_processed_output.csv",

            mime="text/csv"

        )