# Hugging Face Spaces status: Runtime error
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import plotly.express as px | |
# Cap on how many CSV rows are loaded into the DataFrame.
num_rows = 50

# Rows that pandas cannot parse are skipped instead of aborting the load.
df = pd.read_csv(
    'emails_cleaned.csv',
    nrows=num_rows,
    on_bad_lines='skip',
)
def get_message(Series: pd.Series, header_lines: int = 15):
    """Return the body text of every raw message in ``Series``.

    Each message is split on newlines, the first ``header_lines`` lines are
    dropped, and what remains is re-joined and stripped of surrounding
    whitespace.

    Args:
        Series: Raw message strings, one per row.
        header_lines: Number of leading lines to discard from each message.
            Defaults to 15, the value previously hard-coded here.

    Returns:
        An object-dtype ``pd.Series`` of body strings that shares the index
        of ``Series``. A message with ``header_lines`` lines or fewer yields
        an empty string.
    """
    bodies = [
        # Re-join with "\n" rather than "": joining with an empty string
        # fuses the last word of one line with the first word of the next.
        "\n".join(raw.split("\n")[header_lines:]).strip()
        for raw in Series
    ]
    # Build the result in one go with an explicit dtype instead of filling a
    # data-less placeholder Series cell by cell.
    return pd.Series(bodies, index=Series.index, dtype=object)
def get_date(Series: pd.Series):
    """Parse the date header of every raw message in ``Series``.

    The second line of each message (index 1 after splitting on newlines) is
    taken, stripped, and has the literal ``'Date: '`` removed before the whole
    column is handed to ``pd.to_datetime``.

    Args:
        Series: Raw message strings, one per row.

    Returns:
        The result of ``pd.to_datetime`` on the extracted strings, sharing
        the index of ``Series``. A message with fewer than two lines
        contributes an empty string to the conversion.
    """
    date_strings = []
    for raw in Series:
        lines = raw.split('\n')
        # Messages without a second line contribute an empty string, the
        # same value the former delete-based slicing produced.
        date_line = lines[1] if len(lines) > 1 else ''
        date_strings.append(date_line.strip().replace('Date: ', ''))
    print('Done parsing, converting to datetime format..')
    # Explicit object dtype: the values are strings until converted below.
    return pd.to_datetime(
        pd.Series(date_strings, index=Series.index, dtype=object)
    )
def get_sender_and_receiver(Series: pd.Series):
    """Extract sender and recipient header values from every raw message.

    Each message is split on newlines and four fixed line positions are read:
    line 2 (``'From: '``), line 3 (``'To: '``), line 10 (``'X-cc: '``) and
    line 11 (``'X-bcc: '``). The listed prefix text is removed from the line.

    Args:
        Series: Raw message strings, one per row.

    Returns:
        A 4-tuple ``(sender, recipient1, recipient2, recipient3)`` of
        object-dtype ``pd.Series``, each sharing the index of ``Series``.

    Raises:
        IndexError: If a message has fewer than 12 lines.
    """
    # (line position, prefix text to remove) for each returned Series, in order.
    header_fields = (
        (2, 'From: '),
        (3, 'To: '),
        (10, 'X-cc: '),
        (11, 'X-bcc: '),
    )
    columns = [[] for _ in header_fields]
    for raw in Series:
        lines = raw.split('\n')
        for values, (line_no, prefix) in zip(columns, header_fields):
            values.append(lines[line_no].replace(prefix, ''))

    # Values are collected positionally and attached to the caller's index at
    # the end. Writing `series[row] = ...` with the enumerate counter treated
    # the counter as an index *label*, which corrupts the result whenever the
    # index is not 0..n-1.
    sender, recipient1, recipient2, recipient3 = (
        pd.Series(values, index=Series.index, dtype=object)
        for values in columns
    )
    return sender, recipient1, recipient2, recipient3
def get_subject(Series: pd.Series):
    """Extract the subject header value from every raw message.

    Each message is split on newlines; line 4 is taken and the literal
    ``'Subject: '`` is removed from it.

    Args:
        Series: Raw message strings, one per row.

    Returns:
        An object-dtype ``pd.Series`` of subject strings sharing the index
        of ``Series``.

    Raises:
        IndexError: If a message has fewer than 5 lines.
    """
    subjects = [
        raw.split('\n')[4].replace('Subject: ', '')
        for raw in Series
    ]
    # Attach the caller's index once, positionally. Assigning `result[row]`
    # with the enumerate counter used it as a label and broke on any index
    # other than 0..n-1.
    return pd.Series(subjects, index=Series.index, dtype=object)
def get_folder(Series: pd.Series):
    """Extract the folder header value from every raw message.

    Each message is split on newlines; line 12 is taken and the literal
    ``'X-Folder: '`` is removed from it.

    Args:
        Series: Raw message strings, one per row.

    Returns:
        An object-dtype ``pd.Series`` of folder strings sharing the index
        of ``Series``.

    Raises:
        IndexError: If a message has fewer than 13 lines.
    """
    folders = [
        raw.split('\n')[12].replace('X-Folder: ', '')
        for raw in Series
    ]
    # Attach the caller's index once, positionally. Assigning `result[row]`
    # with the enumerate counter used it as a label and broke on any index
    # other than 0..n-1.
    return pd.Series(folders, index=Series.index, dtype=object)
# Derive one column per parsed header field from the raw 'message' column.
df['text'] = get_message(df['message'])
(
    df['sender'],
    df['recipient1'],
    df['recipient2'],
    df['recipient3'],
) = get_sender_and_receiver(df['message'])
df['Subject'] = get_subject(df['message'])
df['folder'] = get_folder(df['message'])
df['date'] = get_date(df['message'])

# The raw columns are dropped once the parsed columns exist.
df = df.drop(columns=['message', 'file'])
import chromadb
from chromadb.utils import embedding_functions

# Chroma client plus a collection that embeds documents with a
# sentence-transformers model.
chroma_client = chromadb.Client()
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="paraphrase-MiniLM-L3-v2"
)
collection_minilm = chroma_client.create_collection(
    name="emails_minilm",
    embedding_function=sentence_transformer_ef,
)

# Add the rows one at a time; the DataFrame index label doubles as the
# document id and is printed as a progress indicator.
for i in df.index:
    print(i)
    collection_minilm.add(
        documents=df.loc[i, 'text'],
        metadatas=[
            {
                "sender": df.loc[i, 'sender'],
                "recipient1": df.loc[i, 'recipient1'],
                "recipient2": df.loc[i, 'recipient2'],
                "recipient3": df.loc[i, 'recipient3'],
                "subject": df.loc[i, 'Subject'],
                "folder": df.loc[i, 'folder'],
                "date": str(df.loc[i, 'date']),
            }
        ],
        ids=str(i),
    )

# Sanity-check query against the freshly filled collection.
results = collection_minilm.query(
    query_texts=["this is a document"],
    n_results=2,
    include=['distances', 'metadatas', 'documents'],
)
# Bare expression: has no effect when the file runs as a plain script.
results
import gradio as gr | |
import ast | |
def create_output(dictionary, number):
    """Render the top ``number`` hits of a query result as display text.

    For each selected id the record is fetched again from
    ``collection_minilm`` and formatted as a block showing its ``subject``
    metadata entry and its document text.

    Args:
        dictionary: Query result mapping with an ``'ids'`` entry. The entry
            may be a list of id lists (only the first list is used) or a
            flat list of ids.
        number: Maximum number of hits to render. Floats are truncated to
            int; values larger than the number of available ids are clamped
            and negative values render nothing.

    Returns:
        The formatted blocks concatenated into one string; an empty string
        when there is nothing to render.
    """
    raw_ids = dictionary['ids']
    # Read the ids straight from the result structure. The old approach
    # stringified the list and re-split it, which breaks on ids containing
    # quotes or ", ".
    if raw_ids and isinstance(raw_ids[0], (list, tuple)):
        hit_ids = list(raw_ids[0])
    else:
        hit_ids = list(raw_ids)

    # int(): the count may arrive as a float (e.g. from a UI number field),
    # which range()/slicing reject. Clamp so that asking for more hits than
    # exist no longer raises IndexError.
    limit = max(0, min(int(number), len(hit_ids)))

    sections = []
    for hit_id in hit_ids[:limit]:
        record = collection_minilm.get(ids=[hit_id])
        # Use the returned objects directly rather than round-tripping them
        # through str() and ast.literal_eval().
        subject = record["metadatas"][0]['subject']
        body = record["documents"][0]
        sections.append(
            "---------------\n"
            f"SUBJECT: {subject}\n"
            "MESSAGE: \n"
            f"{body}\n"
            "---------------"
        )
    return "".join(sections)
def query_chromadb_advanced(question, numberOfResults):
    """Query ``collection_minilm`` and return the hits as display text.

    Args:
        question: Query text passed to the collection as ``query_texts``.
        numberOfResults: How many hits to request and render. Floats are
            truncated to int.

    Returns:
        The string produced by ``create_output`` for the query result.
    """
    # The UI's "number" input may deliver a float, while the result count is
    # used as an integer downstream, so normalise it once here.
    n_results = int(numberOfResults)
    results = collection_minilm.query(
        query_texts=question,
        n_results=n_results,
    )
    return create_output(results, n_results)
# One direct call to the query function before the UI is built.
result_advance = query_chromadb_advanced("bank", 4)

# Gradio UI: a text box and a number field in, plain text out.
iface = gr.Interface(
    fn=query_chromadb_advanced,
    inputs=["text", "number"],
    outputs="text",
    title="Email Dataset Interface",
    description="Insert the question or the key word to find the topic correlated in the dataset",
)
iface.launch(share=True)