Spaces:

jet-taekyo
/

AIE4_MVP

Build error

App Files Files Community

jet-taekyo committed on Oct 6, 2024

Commit

b1ea4f6

1 Parent(s): 2299b1b

requirements

Browse files

Files changed (1) hide show

app.py +17 -10

app.py CHANGED Viewed

@@ -19,6 +19,7 @@ from langchain.schema import StrOutputParser
 from langchain_core.documents import Document
 from typing import cast
 from dotenv import load_dotenv
 ### Environment Variables ###
 load_dotenv('.env')
@@ -53,9 +54,18 @@ RAG_PROMPT = ChatPromptTemplate([('human', RAG_SYSTEM_MSG_TEMPLATE)])
 #😉 retriever
-async def get_retriever(filename: str, chunks: list[Document]):
-    client = QdrantClient(":memory:")
     core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
     cached_embedder = CacheBackedEmbeddings.from_bytes_store(
         underlying_embeddings = core_embeddings,
@@ -63,8 +73,8 @@ async def get_retriever(filename: str, chunks: list[Document]):
         namespace=core_embeddings.model
     )
-    collection_name = f"pdf_to_parse_{filename}"
     if collection_name not in (x.name for x in client.get_collections().collections):
         client.create_collection(
             collection_name=collection_name,
@@ -119,16 +129,13 @@ async def on_chat_start():
     file = files[0]
-    msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
-    documents = PyMuPDFLoader(file.path).load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    chunks = await text_splitter.atransform_documents(documents)
     # get rag chain
-    retriever, already_exist = await get_retriever(file.name.split('pdf')[0], chunks)
     rag_chain = get_rag(retriever)
     # Let the user know that the system is ready

 from langchain_core.documents import Document
 from typing import cast
 from dotenv import load_dotenv
+import tempfile
 ### Environment Variables ###
 load_dotenv('.env')
 #😉 retriever
+async def get_retriever(file: AskFileResponse):
+    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.pdf') as temp_file:
+        temp_file_path = temp_file.name
+    with open(temp_file_path, 'wb') as f:
+        f.write(file.content)
+    documents = PyMuPDFLoader(temp_file_path).load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    chunks = await text_splitter.atransform_documents(documents)
+    client = QdrantClient(":memory:")
     core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
     cached_embedder = CacheBackedEmbeddings.from_bytes_store(
         underlying_embeddings = core_embeddings,
         namespace=core_embeddings.model
     )
+    collection_name = f"pdf_to_parse_{clean_text(file.name)}"
     if collection_name not in (x.name for x in client.get_collections().collections):
         client.create_collection(
             collection_name=collection_name,
     file = files[0]
+    msg = cl.Message(content=f"Processing `{file.name}`...", disable_human_feedback=True)
     await msg.send()
     # get rag chain
+    retriever, already_exist = await get_retriever(file)
+    # retriever, already_exist = await get_retriever(file.name.split('pdf')[0], chunks)
     rag_chain = get_rag(retriever)
     # Let the user know that the system is ready