Spaces:

dobinyim
/

aie3-midterm

Paused

App Files Files Community

aie3-midterm / midterm.py

dobinyim

Upload midterm.py

73bc015 verified about 1 year ago

raw

history blame

3.59 kB

	import os
	from dotenv import load_dotenv
	import openai
	import chainlit as cl
	from langchain_community.document_loaders import PyMuPDFLoader
	from operator import itemgetter
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.schema.output_parser import StrOutputParser
	from langchain.schema.runnable import RunnablePassthrough
	from langchain.schema.runnable.config import RunnableConfig
	from langchain import hub
	from langchain_community.vectorstores import Qdrant
	from langchain.prompts import ChatPromptTemplate
	from langchain_openai import OpenAIEmbeddings
	from langchain_openai import ChatOpenAI
	import json

	# Module-level setup: everything below runs once, at import time, and the
	# resulting `retriever`, `rag_prompt` and `qa_llm` objects are shared by the
	# chat handlers defined further down.

	# Load variables from a .env file (if any) into the process environment so
	# the lookup below can find the key during local runs.
	load_dotenv()

	# Subscript access: raises KeyError immediately if OPENAI_API_KEY is unset.
	OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
	openai.api_key = OPENAI_API_KEY

	# Load PDF
	# The path is relative, so it is resolved against the working directory.
	loader = PyMuPDFLoader("./AirBnB10Q.pdf")
	documents = loader.load()

	# Split Document
	# Small chunks with some overlap; sizes are presumably measured in
	# characters (the splitter's default length function) - TODO confirm.
	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=300,
	chunk_overlap=50
	)
	# `documents` is rebound from the loaded documents to the split chunks.
	documents = text_splitter.split_documents(documents)

	# Load OpenAI Embeddings
	embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

	# Load Qdrant Vector Store
	# location=":memory:" keeps the collection in process memory, so the chunks
	# are embedded and indexed again on every start of the app.
	qdrant_vector_store = Qdrant.from_documents(
	documents,
	embeddings,
	location=":memory:",
	collection_name="AirBnB10Q"
	)
	# Retriever with default search settings; used as the "context" source in
	# the chain built in start_chat().
	retriever = qdrant_vector_store.as_retriever()

	# Pull LangChain QA Prompt Template
	# NOTE(review): `retrieval_qa_prompt` is not referenced anywhere in the
	# visible code - the chain uses `rag_prompt` below. The pull presumably
	# needs network access at startup; confirm nothing else relies on it before
	# removing.
	retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

	# Prompt text actually used by the chain. The {context} and {query}
	# placeholders match the keys of the dict that start_chat() feeds into
	# `rag_prompt`.
	template = """You are a helpful assistant. Use only the context available in the file. The file is a PDF and is a company filing submitted to the SEC.
	Pages include company information and detailed reports about financial performance. Pages contain tables, where some key information is found.
	Table columns include name, title, and shares owned. Do not make up any information that is not in the file.
	Use the context provided in the file to answer the questions. Explain your answer by describing how you arrived at the answer:

	Context:
	{context}

	Question:
	{query}
	"""

	rag_prompt = ChatPromptTemplate.from_template(template)
	# temperature=0 requests the least-random sampling from the chat model.
	qa_llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

	@cl.on_chat_start
	async def start_chat():
	    """
	    Build the LCEL RAG chain and store it in the Chainlit user session.

	    Registered with ``cl.on_chat_start``, so it runs at the start of every
	    user session. The chain is saved under the ``"lcel_rag_chain"`` session
	    key, which is where the message handler looks it up.
	    """
	    # The leading dict fans the input out into the two prompt variables:
	    #   "context" - the input's "query" value piped through the retriever
	    #   "query"   - the input's "query" value passed along unchanged
	    # The filled prompt then goes to the chat model. No output parser is
	    # appended, so the chain yields the model's message objects rather than
	    # plain strings.
	    lcel_rag_chain = (
	        {"context": itemgetter("query") | retriever, "query": itemgetter("query")}
	        | rag_prompt
	        | qa_llm
	    )

	    cl.user_session.set("lcel_rag_chain", lcel_rag_chain)

	@cl.on_message
	async def main(message: cl.Message):
	    """
	    Answer one incoming user message with the session's RAG chain.

	    Registered with ``cl.on_message``. The chain stored in the user session
	    under "lcel_rag_chain" is fetched, run on the message text, and its
	    output is streamed into a single reply that is sent once the stream ends.
	    """
	    rag_chain = cl.user_session.get("lcel_rag_chain")

	    # Start from an empty reply and fill it in piece by piece.
	    reply = cl.Message(content="")

	    answer_stream = rag_chain.astream(
	        {"query": message.content},
	        config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
	    )

	    async for piece in answer_stream:
	        # A streamed piece may be a mapping with a 'content' key, an object
	        # exposing `.content`, or a bare string; anything else is skipped.
	        if isinstance(piece, dict) and 'content' in piece:
	            text = piece['content']
	        elif hasattr(piece, 'content'):
	            text = piece.content
	        elif isinstance(piece, str):
	            text = piece
	        else:
	            continue
	        await reply.stream_token(text)

	    await reply.send()