Spaces:

fcernafukuzaki
/

pysentimiento_streamlit

Sleeping

File size: 8,992 Bytes

import io
import datetime
from tqdm import tqdm

# Pandas
import pandas as pd

# Expresiones regulares
import re

# Matplotlib, Seaborn y Plotly
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK
import nltk
from nltk.corpus import stopwords

# spaCy
import spacy

# PySentimiento y Transformers
from pysentimiento import create_analyzer
from sentence_transformers import SentenceTransformer

# Word cloud
from PIL import Image

import uuid
import streamlit as st

# Download the NLTK 'stopwords' and 'punkt' resources when the module is loaded.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)


### Refactored: the label (output) and the probabilities (probas) used to be obtained by running the analysis twice; both now come from a single pass.
def get_sentiment(df, column):
    """Annotate ``df`` in place with sentiment predictions for ``df[column]``.

    Each value of the column is passed to a Spanish ("es") sentiment analyzer
    created with ``create_analyzer``. The predicted label goes to the
    ``Polaridad`` column and the NEU / NEG / POS probabilities go to
    ``sent_NEU``, ``sent_NEG`` and ``sent_POS`` (``None`` when a label is
    missing from the prediction's ``probas``).

    Returns the same DataFrame object that was passed in.
    """
    sentiment_model = create_analyzer(task="sentiment", lang="es")

    # Iterating through tqdm advances the progress bar once per comment.
    predictions = [
        sentiment_model.predict(text)
        for text in tqdm(df[column], total=len(df), desc="Analyzing Comments")
    ]

    # Build every new column first, then attach them in a fixed order.
    new_columns = {'Polaridad': [prediction.output for prediction in predictions]}
    for label in ('NEU', 'NEG', 'POS'):
        new_columns[f'sent_{label}'] = [
            prediction.probas.get(label) for prediction in predictions
        ]

    for column_name, values in new_columns.items():
        df[column_name] = values
    return df

### Refactored: the label (output) and the probabilities (probas) used to be obtained by running the analysis twice; both now come from a single pass.
def get_emotions(df, column):
    """Annotate ``df`` in place with emotion predictions for ``df[column]``.

    Each value of the column is passed to a Spanish ("es") emotion analyzer
    created with ``create_analyzer``. The predicted label goes to the
    ``Emocion`` column and the per-emotion probabilities go to ``emo_anger``,
    ``emo_sadness``, ``emo_surprise``, ``emo_disgust``, ``emo_joy``,
    ``emo_fear`` and ``emo_others`` (``None`` when an emotion is missing from
    the prediction's ``probas``).

    Returns the same DataFrame object that was passed in.
    """
    emotion_model = create_analyzer(task="emotion", lang="es")

    # Iterating through tqdm advances the progress bar once per comment.
    predictions = [
        emotion_model.predict(text)
        for text in tqdm(df[column], total=len(df), desc="Analyzing Comments")
    ]

    # Build every new column first, then attach them in a fixed order.
    emotion_labels = ('anger', 'sadness', 'surprise', 'disgust', 'joy', 'fear', 'others')
    new_columns = {'Emocion': [prediction.output for prediction in predictions]}
    for label in emotion_labels:
        new_columns[f'emo_{label}'] = [
            prediction.probas.get(label) for prediction in predictions
        ]

    for column_name, values in new_columns.items():
        df[column_name] = values
    return df

class ProcesamientoLenguaje:
    """Spanish text pre-processing helpers: regex clean-up and spaCy POS filtering."""

    # Ordered (compiled pattern, replacement) pairs applied by ``cleaner``.
    # The order is significant: punctuation is stripped only after URLs,
    # emoticons and ellipses have been handled, and short words are dropped last.
    _CLEANING_RULES = (
        (re.compile(r'https?\S+'), ''),  # URLs
        (re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'), ''),  # emoticons such as :) ;-( =D :P
        (re.compile(r'ee.uu', flags=re.IGNORECASE), 'eeuu'),  # "EE.UU", "ee uu", ... -> "eeuu"
        (re.compile(r'\#\.'), ''),  # the literal sequence "#."
        (re.compile(r'\n'), ' '),  # line breaks become spaces
        (re.compile(r','), ''),  # commas
        (re.compile(r'\-'), ' '),  # hyphens become spaces
        (re.compile(r'\.{3}'), ' '),  # ellipsis written as three dots
        (re.compile(r'a{2,}'), 'a'),  # elongated "a" (aaaaaah, holaaaaaa)
        (re.compile(r'é{2,}'), 'é'),  # elongated "é" (volvééééé)
        (re.compile(r'i{2,}'), 'i'),  # elongated "i"
        # Elongated laughs ("jaaaaaa") need no rule of their own: the "a" rule
        # above has already collapsed them to "ja" by this point.
        (re.compile(r'[^\w\s@ñ]', flags=re.UNICODE), ''),  # symbols, keeping "@" for the next rule
        # No leading \b here: there is no word boundary between whitespace (or
        # the start of the text) and "@", so r'\b@\w+' would skip real mentions.
        (re.compile(r'@\w+'), ''),  # @user mentions
        (re.compile(r'\b\w{1,2}\b'), ''),  # words of one or two characters
    )

    def __init__(self):
        """Load the ``es_core_news_md`` spaCy pipeline with parser and NER disabled."""
        self.nlp = spacy.load('es_core_news_md', disable=["parser", "ner"])

    def postags_and_stopwords(self, texts, allowed_postags=('NOUN', 'ADJ', 'PROPN', 'VERB', 'X')):
        """Keep only the content words of a text, based on spaCy POS tags.

        The text is run through the spaCy pipeline and a token is kept when
        its coarse POS tag (``token.pos_``) is in ``allowed_postags``, its text
        is not in the module-level ``stop_words`` collection and it is longer
        than two characters. The defaults keep nouns, adjectives, proper
        nouns, verbs and tokens tagged ``X`` (uncategorised).

        Args:
            texts: The text to process (a single string).
            allowed_postags: Collection of coarse POS tags to keep. ``VERB``
                is the tag spaCy assigns to verbs in ``token.pos_``.

        Returns:
            The kept token texts joined by single spaces.
        """
        kept_tokens = [
            token.text
            for token in self.nlp(texts)
            if token.pos_ in allowed_postags
            and token.text not in stop_words
            and len(token.text) > 2
        ]
        return ' '.join(kept_tokens)

    def cleaner(self, word):
        """Strip URLs, emoticons, mentions, symbols and very short words from a text.

        Applies every rule of ``_CLEANING_RULES`` in order. Whitespace left
        behind by removed fragments is not collapsed.

        Args:
            word: The text to clean.

        Returns:
            The cleaned string.
        """
        for pattern, replacement in self._CLEANING_RULES:
            word = pattern.sub(replacement, word)
        return word

def grafico_pie(df, column_name='Polaridad'):
    """Save a pie chart of the value frequencies of ``df[column_name]``.

    The chart is written to a JPG file with a random UUID-based name
    (relative path) and the figure is closed afterwards.

    Returns the path of the saved image.
    """
    output_path = f"{uuid.uuid4()}_sentimiento.jpg"

    figure, axes = plt.subplots(figsize=(8, 6))
    counts = df[column_name].value_counts()
    axes.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    axes.set_title("Distribución de Polaridad")

    figure.savefig(output_path, bbox_inches="tight")
    plt.close(figure)
    return output_path

def grafico_barras(df, column_name='Emocion'):
    """Save a count bar chart of the values of ``df[column_name]``.

    Each bar is annotated with its height. The chart is written to a JPG file
    with a random UUID-based name (relative path) and the figure is closed
    afterwards.

    Returns the path of the saved image.
    """
    output_path = f"{uuid.uuid4()}_sentimiento.jpg"

    figure, axes = plt.subplots(figsize=(8, 6))
    sns.countplot(x=column_name, data=df, ax=axes)

    # Write each bar's count just above its top edge.
    for bar in axes.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2.
        axes.annotate(
            f"{height}",
            (x_center, height),
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points',
        )

    axes.set_xlabel("Emocion")
    axes.set_ylabel("Cantidad")
    axes.set_title("Histograma de Emocion")

    figure.savefig(output_path, bbox_inches="tight")
    plt.close(figure)
    return output_path

# Spanish stop-word list from NLTK; read as a module-level global by the
# text-cleaning code in this file.
stop_words = stopwords.words('spanish')

# Shared text-processing helper; constructing it loads the spaCy pipeline.
pln = ProcesamientoLenguaje()

# Reads the uploaded CSV file, runs the sentiment/emotion analysis and the text cleaning, and writes the processed CSV.
def procesar_csv(file):
    """Run the analysis pipeline over an uploaded CSV file.

    The CSV must be semicolon-delimited and contain the columns ``Fecha``
    (parsed with the ``%d/%m/%y`` format) and ``Comentario``. Sentiment and
    emotion columns are added, a cleaned version of each comment is stored in
    ``Comentario_clean`` and the full result is written to a CSV file with a
    random UUID-based name.

    Args:
        file: Uploaded file object whose ``getvalue()`` returns the raw CSV
            bytes, or ``None``.

    Returns:
        The message ``"No se ha cargado ningún archivo."`` when ``file`` is
        ``None``; otherwise a tuple ``(preview, output_file)`` with the first
        10 processed rows and the path of the CSV holding all processed rows.
    """
    if file is None:
        return "No se ha cargado ningún archivo."

    # 'utf-8-sig' decodes plain UTF-8 and also strips a leading byte-order
    # mark, which would otherwise end up glued to the first column name.
    csv_text = file.getvalue().decode('utf-8-sig')
    df = pd.read_csv(io.StringIO(csv_text), delimiter=';')
    df['Fecha'] = pd.to_datetime(df['Fecha'], format='%d/%m/%y')
    df = get_sentiment(df, "Comentario")
    df = get_emotions(df, "Comentario")

    # Build the lookup set once instead of scanning the list for every word.
    stop_word_set = set(stop_words)

    def _drop_stop_words(text):
        # Case-insensitive stop-word removal on whitespace-separated words.
        return ' '.join(word for word in text.split() if word.lower() not in stop_word_set)

    df['Comentario_clean'] = (
        df['Comentario']
        .apply(pln.cleaner)
        .apply(_drop_stop_words)
        .apply(pln.postags_and_stopwords)
    )

    output_file = f"{uuid.uuid4()}_processed_output.csv"
    df.to_csv(output_file, index=False)

    # No charts are rendered here: their paths are not part of the return
    # value, so generating them would only leave orphaned image files on disk.
    return df.head(10), output_file  # preview rows plus the full processed CSV


# Streamlit app setup
st.title("Cargar y visualizar CSV")
st.write("Sube un archivo CSV para ver los primeros registros. El archivo CSV debe tener los campos Fecha y Comentario.")

# File upload widget
file = st.file_uploader("Archivo CSV", type=["csv"])

if file is not None:
    # Process the upload; `df` is only a 10-row preview, the complete result
    # is in the CSV file at `processed_file_path`.
    df, processed_file_path = procesar_csv(file)

    # Show the preview of the processed data
    st.write("Vista previa del archivo procesado:")
    st.dataframe(df)

    # Build the charts from the complete processed data rather than from the
    # preview, so the distributions reflect every comment in the upload.
    full_df = pd.read_csv(processed_file_path)
    torta_path = grafico_pie(full_df)
    barras_path = grafico_barras(full_df)

    # Display the charts
    st.image(torta_path, caption="Gráfico de torta")
    st.image(barras_path, caption="Gráfico de barras")

    # Offer the processed CSV for download
    with open(processed_file_path, 'rb') as f:
        st.download_button(
            label="Descargar CSV procesado",
            data=f,
            file_name=f"{uuid.uuid4()}_processed_output.csv",
            mime="text/csv"
        )