Spaces:
Runtime error
Runtime error
# Source: https://huggingface.co/spaces/micknikolic/enron
# Imports.
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.chains import RetrievalQA | |
from langchain import OpenAI, VectorDBQA | |
from langchain.document_loaders import DirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
import os | |
import nltk | |
import pytesseract | |
import pandas as pd | |
pd.set_option('display.max_columns',None, | |
'display.max_rows',None, | |
'display.max_colwidth',None | |
) | |
import numpy as np | |
import os | |
import re | |
import io | |
import gradio | |
import warnings | |
warnings.filterwarnings('ignore') | |
# Application code.
# Data loading.
# Only a subset of the Enron dataset is used: working with the full set of
# 500k+ observations locally would be computationally very expensive.
data = pd.read_csv('subset_enron.csv', encoding='utf-8')
data = data.sample(frac=0.01, random_state=12)  # author's shape note: (5174, 2)

# Text pre-processing.
# Matches one or two literal backslashes followed by the letter "n", i.e.
# newline escapes that survived as plain text in the message bodies.
_ESCAPED_NEWLINE = re.compile(r'\\{1,2}n')


def _strip_escaped_newlines(message):
    """Return *message* with every escaped-newline sequence deleted."""
    return _ESCAPED_NEWLINE.sub('', message)


cleaned_message = data["message"].apply(_strip_escaped_newlines)
content = cleaned_message.tolist()
class Document:
    """Minimal text container exposing ``page_content`` and ``metadata``.

    Args:
        page_content: the text held by this document.
        metadata: optional mapping stored alongside the text; a fresh empty
            dict is created when ``None`` is given.
    """

    def __init__(self, page_content, metadata=None):
        if metadata is None:
            # Build the dict per instance so documents never share one.
            metadata = {}
        self.page_content = page_content
        self.metadata = metadata
# Wrap every cleaned message in a Document and split it into chunks
# (chunk_size=1000, no overlap between chunks).
documents = [Document(message) for message in content]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in a Chroma store.
# The key is read from the "openai_api_key" environment variable.
_openai_api_key = os.getenv("openai_api_key")
openAI_embeddings = OpenAIEmbeddings(openai_api_key=_openai_api_key)
vStore = Chroma.from_documents(documents=texts, embedding=openAI_embeddings)

# Retrieval QA
_llm = OpenAI(
    openai_api_key=_openai_api_key,
    temperature=0.2,
    top_p=0.2,
    max_tokens=2000,
)
model_retrieval = RetrievalQA.from_chain_type(
    llm=_llm,
    chain_type="stuff",
    retriever=vStore.as_retriever(),
)
# Gradio-based app built on top of the retrieval QA model.
def get_answer(question):
    """
    Return the retrieval chain's answer to a given question.

    Args:
        question (string): end-user's input.

    Returns:
        the model's answer based on the enron emails dataset, or a short
        prompt asking for input when the question is missing, empty or
        whitespace-only.
    """
    # A blank query has nothing to retrieve against; answer locally instead
    # of sending it through the retrieval chain.
    if question is None or not question.strip():
        return "Please enter a question."
    return model_retrieval.run(question)
# Assemble the UI pieces first, then wire them to get_answer and start the app.
_question_box = gradio.Textbox(label="Enter your question here")
_answer_box = gradio.Textbox(label="Answer")
_example_questions = [
    "Who are the senders of these emails?",
    "What's at the center of these emails?",
]

iface = gradio.Interface(
    fn=get_answer,
    inputs=_question_box,
    outputs=[_answer_box],
    title="Retrieval QA for the subset of the Enron dataset",
    examples=_example_questions,
)
iface.launch()