|
import io |
|
import datetime |
|
from tqdm import tqdm |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
import re |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
|
|
|
|
import spacy |
|
|
|
|
|
from pysentimiento import create_analyzer |
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
from PIL import Image |
|
|
|
import uuid |
|
import streamlit as st |
|
|
|
# One-time NLTK corpus downloads (no-ops if the data is already cached locally).
nltk.download('stopwords')

nltk.download('punkt')
|
|
|
|
|
|
|
def get_sentiment(df,column):
    """Run Spanish sentiment analysis over one DataFrame column.

    Predicts sentiment for every entry of ``df[column]`` with pysentimiento
    and appends four columns in place: 'Polaridad' (predicted label) and
    'sent_NEU' / 'sent_NEG' / 'sent_POS' (class probabilities).

    Returns the same (mutated) DataFrame.
    """
    analyzer = create_analyzer(task="sentiment", lang="es")

    predictions = []
    with tqdm(total=len(df), desc="Analyzing Comments") as progress:
        for text in df[column]:
            predictions.append(analyzer.predict(text))
            progress.update(1)

    df['Polaridad'] = [pred.output for pred in predictions]
    df['sent_NEU'] = [pred.probas.get('NEU', None) for pred in predictions]
    df['sent_NEG'] = [pred.probas.get('NEG', None) for pred in predictions]
    df['sent_POS'] = [pred.probas.get('POS', None) for pred in predictions]
    return df
|
|
|
|
|
def get_emotions(df,column):
    """Run Spanish emotion detection over one DataFrame column.

    Predicts an emotion for every entry of ``df[column]`` with pysentimiento
    and appends columns in place: 'Emocion' (predicted label) plus one
    'emo_<name>' probability column per emotion class.

    Returns the same (mutated) DataFrame.
    """
    analyzer = create_analyzer(task="emotion", lang="es")

    predictions = []
    with tqdm(total=len(df), desc="Analyzing Comments") as progress:
        for text in df[column]:
            predictions.append(analyzer.predict(text))
            progress.update(1)

    df['Emocion'] = [pred.output for pred in predictions]
    # Column order matters for the output CSV: keep the original sequence.
    for emotion in ('anger', 'sadness', 'surprise', 'disgust', 'joy', 'fear', 'others'):
        df[f'emo_{emotion}'] = [pred.probas.get(emotion, None) for pred in predictions]
    return df
|
|
|
class ProcesamientoLenguaje:
    """Spanish text-processing helpers built on spaCy (es_core_news_md)."""

    # Universal POS tags kept by postags_and_stopwords: noun, adjective,
    # proper noun, verb, and 'X' (out-of-vocabulary / unclassified tokens).
    # BUGFIX: the original default used 'VB', which is a Penn Treebank tag
    # (token.tag_); spaCy's token.pos_ emits Universal POS tags, so verbs
    # were silently never kept even though the docstring promised them.
    # A tuple also avoids the mutable-default-argument pitfall.
    DEFAULT_POSTAGS = ('NOUN', 'ADJ', 'PROPN', 'VERB', 'X')

    def __init__(self):
        # parser/ner are disabled: only tokenization + POS tagging is needed.
        self.nlp = spacy.load('es_core_news_md', disable=["parser", "ner"])

    def postags_and_stopwords(self, texts, allowed_postags=DEFAULT_POSTAGS):
        '''Tokenize and POS-tag *texts* in a spaCy pipeline, then keep only
        tokens longer than 2 characters that are not stop words and whose
        POS tag is in *allowed_postags* (noun, adjective, proper noun, verb,
        and anything spaCy could not classify: OOV words, unrecognized
        proper nouns, etc.).

        Returns the surviving tokens joined into one space-separated string.
        '''
        kept = [
            token.text
            for token in self.nlp(texts)
            if token.pos_ in allowed_postags
            and token.text not in stop_words  # module-level Spanish stop words
            and len(token.text) > 2
        ]
        return ' '.join(kept)

    def cleaner(self, word):
        '''Remove URLs, emoticons, mentions, punctuation and other noise
        from a text string.

        Returns the cleaned string.
        '''
        word = re.sub(r'https?\S+', '', word)                     # URLs
        word = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', "", word)  # ASCII emoticons
        word = re.sub(r'ee.uu', 'eeuu', word, flags=re.IGNORECASE)  # normalize "EE.UU"
        word = re.sub(r'\#\.', '', word)
        word = re.sub(r'\n', ' ', word)                           # newlines -> spaces
        word = re.sub(r',', '', word)
        word = re.sub(r'\-', ' ', word)
        word = re.sub(r'\.{3}', ' ', word)                        # ellipses
        word = re.sub(r'a{2,}', 'a', word)                        # collapse letter runs
        word = re.sub(r'é{2,}', 'é', word)
        word = re.sub(r'i{2,}', 'i', word)
        word = re.sub(r'ja{2,}', 'ja', word)                      # laughter ("jaa...")
        word = re.sub(r'[^\w\s@ñ]', '', word, flags=re.UNICODE)   # drop symbols
        word = re.sub(r'\b@\w+\b', '', word)                      # @mentions
        word = re.sub(r'\b\w{1,2}\b', '', word)                   # 1-2 char words
        return word
|
|
|
def grafico_pie(df, column_name='Polaridad'):
    """Save a pie chart of the value counts of *column_name* to a uniquely
    named JPEG file and return that file's path."""
    output_path = f"{uuid.uuid4()}_sentimiento.jpg"
    counts = df[column_name].value_counts()

    plt.figure(figsize=(8, 6))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    plt.title("Distribución de Polaridad")
    plt.savefig(output_path, bbox_inches="tight")
    plt.close()
    return output_path
|
|
|
def grafico_barras(df, column_name='Emocion'):
    """Save an annotated count bar chart of *column_name* to a uniquely
    named JPEG file and return that file's path."""
    output_path = f"{uuid.uuid4()}_sentimiento.jpg"

    plt.figure(figsize=(8, 6))
    axis = sns.countplot(x=column_name, data=df)
    # Write each bar's count just above the bar.
    for bar in axis.patches:
        label = format(bar.get_height())
        center = (bar.get_x() + bar.get_width() / 2., bar.get_height())
        axis.annotate(label, center, ha='center', va='center',
                      xytext=(0, 10), textcoords='offset points')
    plt.xlabel("Emocion")
    plt.ylabel("Cantidad")
    plt.title("Histograma de Emocion")
    plt.savefig(output_path, bbox_inches="tight")
    plt.close()
    return output_path
|
|
|
# Module-level NLP helper shared by procesar_csv.
pln = ProcesamientoLenguaje()

# Spanish stop words; read as a global inside
# ProcesamientoLenguaje.postags_and_stopwords and in procesar_csv.
stop_words = stopwords.words('spanish')
|
|
|
|
|
def procesar_csv(file):
    """Process an uploaded CSV of dated Spanish comments.

    Expects a ';'-delimited CSV with at least the columns 'Fecha'
    (dd/mm/yy) and 'Comentario'. Adds sentiment and emotion columns, plus
    a cleaned, stop-word-free, POS-filtered 'Comentario_clean' column, and
    writes the full result to a uniquely named CSV file on disk.

    Returns a tuple (first 10 rows of the processed DataFrame, output CSV
    path), or an error-message string when *file* is None.
    """
    if file is None:
        return "No se ha cargado ningún archivo."

    # The uploaded file yields raw bytes; decode as UTF-8 before parsing.
    csv_bytes = file.getvalue()
    csv_read_buffer = io.StringIO(csv_bytes.decode('utf-8'))
    df = pd.read_csv(csv_read_buffer, delimiter=';')
    print(df)

    df['Fecha'] = pd.to_datetime(df['Fecha'], format='%d/%m/%y')
    df = get_sentiment(df, "Comentario")
    df = get_emotions(df, "Comentario")

    # Clean raw text, drop stop words, then keep only the allowed POS tags.
    df['Comentario_clean'] = df['Comentario'].apply(pln.cleaner)
    df['Comentario_clean'] = df['Comentario_clean'].apply(
        lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words))
    df['Comentario_clean'] = df['Comentario_clean'].apply(pln.postags_and_stopwords)

    output_file = f"{uuid.uuid4()}_processed_output.csv"
    df.to_csv(output_file, index=False)

    # BUGFIX: the original also called grafico_pie/grafico_barras here and
    # discarded the returned paths — the UI layer regenerates the charts
    # itself, so those calls only left orphaned image files on disk.
    return df.head(10), output_file
|
|
|
|
|
|
|
# --- Streamlit UI: upload a CSV, process it, show previews and charts. ---
st.title("Cargar y visualizar CSV")

st.write("Sube un archivo CSV para ver los primeros registros. El archivo CSV debe tener los campos Fecha y Comentario.")


file = st.file_uploader("Archivo CSV", type=["csv"])


if file is not None:

    # procesar_csv returns (df.head(10), path-to-full-processed-CSV).
    df, processed_file_path = procesar_csv(file)


    st.write("Vista previa del archivo procesado:")

    st.dataframe(df)


    # NOTE(review): `df` here is only the first 10 rows returned by
    # procesar_csv, so these charts reflect just those rows, not the whole
    # dataset — confirm whether that is intended.
    torta_path = grafico_pie(df)

    barras_path = grafico_barras(df)


    st.image(torta_path, caption="Gráfico de torta")

    st.image(barras_path, caption="Gráfico de barras")


    # Offer the full processed CSV (written by procesar_csv) for download.
    with open(processed_file_path, 'rb') as f:

        st.download_button(

            label="Descargar CSV procesado",

            data=f,

            file_name=f"{uuid.uuid4()}_processed_output.csv",

            mime="text/csv"

        )