Spaces:
Build error
Build error
Commit
·
b1ea4f6
1
Parent(s):
2299b1b
requirements
Browse files
app.py
CHANGED
@@ -19,6 +19,7 @@ from langchain.schema import StrOutputParser
|
|
19 |
from langchain_core.documents import Document
|
20 |
from typing import cast
|
21 |
from dotenv import load_dotenv
|
|
|
22 |
|
23 |
### Environment Variables ###
|
24 |
load_dotenv('.env')
|
@@ -53,9 +54,18 @@ RAG_PROMPT = ChatPromptTemplate([('human', RAG_SYSTEM_MSG_TEMPLATE)])
|
|
53 |
|
54 |
|
55 |
#π retriever
|
56 |
-
async def get_retriever(
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
|
|
59 |
core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
60 |
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
|
61 |
underlying_embeddings = core_embeddings,
|
@@ -63,8 +73,8 @@ async def get_retriever(filename: str, chunks: list[Document]):
|
|
63 |
namespace=core_embeddings.model
|
64 |
)
|
65 |
|
66 |
-
|
67 |
-
collection_name = f"pdf_to_parse_{
|
68 |
if collection_name not in (x.name for x in client.get_collections().collections):
|
69 |
client.create_collection(
|
70 |
collection_name=collection_name,
|
@@ -119,16 +129,13 @@ async def on_chat_start():
|
|
119 |
|
120 |
|
121 |
file = files[0]
|
122 |
-
msg = cl.Message(content=f"Processing `{file.name}`...")
|
123 |
await msg.send()
|
124 |
|
125 |
|
126 |
-
documents = PyMuPDFLoader(file.path).load()
|
127 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
128 |
-
chunks = await text_splitter.atransform_documents(documents)
|
129 |
-
|
130 |
# get rag chain
|
131 |
-
retriever, already_exist = await get_retriever(file
|
|
|
132 |
rag_chain = get_rag(retriever)
|
133 |
|
134 |
# Let the user know that the system is ready
|
|
|
19 |
from langchain_core.documents import Document
|
20 |
from typing import cast
|
21 |
from dotenv import load_dotenv
|
22 |
+
import tempfile
|
23 |
|
24 |
### Environment Variables ###
|
25 |
load_dotenv('.env')
|
|
|
54 |
|
55 |
|
56 |
#π retriever
|
57 |
+
async def get_retriever(file: AskFileResponse):
|
58 |
+
|
59 |
+
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.pdf') as temp_file:
|
60 |
+
temp_file_path = temp_file.name
|
61 |
+
with open(temp_file_path, 'wb') as f:
|
62 |
+
f.write(file.content)
|
63 |
+
documents = PyMuPDFLoader(temp_file_path).load()
|
64 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
65 |
+
chunks = await text_splitter.atransform_documents(documents)
|
66 |
+
|
67 |
|
68 |
+
client = QdrantClient(":memory:")
|
69 |
core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
70 |
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
|
71 |
underlying_embeddings = core_embeddings,
|
|
|
73 |
namespace=core_embeddings.model
|
74 |
)
|
75 |
|
76 |
+
|
77 |
+
collection_name = f"pdf_to_parse_{clean_text(file.name)}"
|
78 |
if collection_name not in (x.name for x in client.get_collections().collections):
|
79 |
client.create_collection(
|
80 |
collection_name=collection_name,
|
|
|
129 |
|
130 |
|
131 |
file = files[0]
|
132 |
+
msg = cl.Message(content=f"Processing `{file.name}`...", disable_human_feedback=True)
|
133 |
await msg.send()
|
134 |
|
135 |
|
|
|
|
|
|
|
|
|
136 |
# get rag chain
|
137 |
+
retriever, already_exist = await get_retriever(file)
|
138 |
+
# retriever, already_exist = await get_retriever(file.name.split('pdf')[0], chunks)
|
139 |
rag_chain = get_rag(retriever)
|
140 |
|
141 |
# Let the user know that the system is ready
|