|
|
|
|
|
|
|
|
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.vectorstores import Chroma |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.chains import RetrievalQA |
|
from langchain import OpenAI, VectorDBQA |
|
from langchain.document_loaders import DirectoryLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
import os |
|
import nltk |
|
import pytesseract |
|
|
|
import pandas as pd |
|
# Lift pandas' display limits so DataFrames are printed in full: every
# column, every row, and the complete text of each cell. ``None`` disables
# the corresponding limit. The options are passed as alternating
# (option name, value) pairs in a single call.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', None,
)
|
import numpy as np |
|
|
|
import os |
|
import re |
|
import io |
|
|
|
import gradio |
|
|
|
import warnings |
|
# Suppress every warning for the remainder of the process.
# NOTE(review): this blanket filter also hides deprecation notices raised by
# the imported libraries -- consider narrowing it with ``category=``.
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the e-mail subset from a CSV file in the working directory, then keep
# a 1% random sample of its rows. ``random_state=12`` fixes the seed so the
# same rows are drawn on every run.
# NOTE(review): the small fraction presumably keeps the embedding step fast
# and inexpensive -- confirm before changing it.
data = pd.read_csv('subset_enron.csv', encoding='utf-8')
data = data.sample(frac=0.01, random_state=12)
|
|
|
|
|
|
|
# Remove escaped newline sequences stored as literal text in the "message"
# column: the pattern matches one or two backslashes followed by the letter
# ``n``. Rows with a missing message are dropped first and the remaining
# values are coerced to ``str`` so that a NaN cell cannot break the cleaning
# step or reach the document splitter further down.
# NOTE(review): the sequences are replaced with an empty string, so the text
# on both sides of a removed sequence is joined without a separator.
cleaned_message = (
    data["message"]
    .dropna()
    .astype(str)
    .str.replace(r'\\{1,2}n', '', regex=True)
)

# Plain Python list of cleaned message strings, one entry per kept e-mail.
content = cleaned_message.tolist()
|
class Document:
    """Minimal text container exposing ``page_content`` and ``metadata``.

    Attributes:
        page_content: The text held by this document.
        metadata: Dictionary with extra information about the text.
    """

    def __init__(self, page_content, metadata=None):
        """Store the text together with its optional metadata.

        Args:
            page_content: Text of the document.
            metadata: Optional dictionary describing the text. When it is
                ``None`` (the default) a new empty dict is created for this
                instance, so instances never share one mutable default.
        """
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

    def __repr__(self):
        """Return a debugging representation showing both attributes."""
        return (
            f"{type(self).__name__}("
            f"page_content={self.page_content!r}, "
            f"metadata={self.metadata!r})"
        )
|
|
|
# Wrap each cleaned message in a Document (with empty metadata).
documents = [Document(message) for message in content]

# Split the documents into chunks configured with a size of 1000 and no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
|
|
|
# Embedding client; the API key is read from the ``openai_api_key``
# environment variable (``os.getenv`` returns ``None`` when it is unset).
openAI_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("openai_api_key"))

# Build the Chroma vector store from the chunked texts using those embeddings.
vStore = Chroma.from_documents(documents=texts, embedding=openAI_embeddings)
|
|
|
|
|
|
|
# Retrieval-QA chain: the retriever built on the vector store supplies the
# context and the OpenAI LLM produces the answer. The API key comes from the
# ``openai_api_key`` environment variable.
# NOTE(review): the low ``temperature`` / ``top_p`` values presumably aim at
# more deterministic answers -- confirm before tuning.
model_retrieval = RetrievalQA.from_chain_type(
    llm=OpenAI(
        openai_api_key=os.getenv("openai_api_key"),
        temperature=0.2,
        top_p=0.2,
        max_tokens=2000,
    ),
    chain_type="stuff",
    retriever=vStore.as_retriever(),
)
|
|
|
|
|
|
|
def get_answer(question: str) -> str:
    """
    Returns the answer on a given question.

    Args:
        question (string): end-user's input.

    Returns:
        the model's answer based on the enron emails dataset, or a short
        prompt asking for input when the question is empty or contains only
        whitespace.
    """
    # Guard: do not send an empty query to the retrieval chain.
    if not question or not question.strip():
        return "Please enter a question."

    return model_retrieval.run(question)
|
|
|
# Gradio UI: one text box for the question, one text box for the answer.
# ``get_answer`` is called with the contents of the input box, and the two
# example questions are offered as ready-made inputs.
iface = gradio.Interface(
    fn=get_answer,
    inputs=gradio.Textbox(label="Enter your question here"),
    outputs=[
        gradio.Textbox(label="Answer"),
    ],
    title="Retrieval QA for the subset of the Enron dataset",
    examples=[
        "Who are the receivers of the emails from this corpus of emails?",
        "What's at the center of these emails?",
    ],
)

# Start serving the interface.
iface.launch()