jet-taekyo committed on
Commit
b1ea4f6
·
1 Parent(s): 2299b1b

requirements

Browse files
Files changed (1) hide show
  1. app.py +17 -10
app.py CHANGED
@@ -19,6 +19,7 @@ from langchain.schema import StrOutputParser
19
  from langchain_core.documents import Document
20
  from typing import cast
21
  from dotenv import load_dotenv
 
22
 
23
  ### Environment Variables ###
24
  load_dotenv('.env')
@@ -53,9 +54,18 @@ RAG_PROMPT = ChatPromptTemplate([('human', RAG_SYSTEM_MSG_TEMPLATE)])
53
 
54
 
55
  #😉 retriever
56
- async def get_retriever(filename: str, chunks: list[Document]):
57
- client = QdrantClient(":memory:")
 
 
 
 
 
 
 
 
58
 
 
59
  core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
60
  cached_embedder = CacheBackedEmbeddings.from_bytes_store(
61
  underlying_embeddings = core_embeddings,
@@ -63,8 +73,8 @@ async def get_retriever(filename: str, chunks: list[Document]):
63
  namespace=core_embeddings.model
64
  )
65
 
66
-
67
- collection_name = f"pdf_to_parse_{filename}"
68
  if collection_name not in (x.name for x in client.get_collections().collections):
69
  client.create_collection(
70
  collection_name=collection_name,
@@ -119,16 +129,13 @@ async def on_chat_start():
119
 
120
 
121
  file = files[0]
122
- msg = cl.Message(content=f"Processing `{file.name}`...")
123
  await msg.send()
124
 
125
 
126
- documents = PyMuPDFLoader(file.path).load()
127
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
128
- chunks = await text_splitter.atransform_documents(documents)
129
-
130
  # get rag chain
131
- retriever, already_exist = await get_retriever(file.name.split('pdf')[0], chunks)
 
132
  rag_chain = get_rag(retriever)
133
 
134
  # Let the user know that the system is ready
 
19
  from langchain_core.documents import Document
20
  from typing import cast
21
  from dotenv import load_dotenv
22
+ import tempfile
23
 
24
  ### Environment Variables ###
25
  load_dotenv('.env')
 
54
 
55
 
56
  #😉 retriever
57
+ async def get_retriever(file: AskFileResponse):
58
+
59
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.pdf') as temp_file:
60
+ temp_file_path = temp_file.name
61
+ with open(temp_file_path, 'wb') as f:
62
+ f.write(file.content)
63
+ documents = PyMuPDFLoader(temp_file_path).load()
64
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
65
+ chunks = await text_splitter.atransform_documents(documents)
66
+
67
 
68
+ client = QdrantClient(":memory:")
69
  core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
70
  cached_embedder = CacheBackedEmbeddings.from_bytes_store(
71
  underlying_embeddings = core_embeddings,
 
73
  namespace=core_embeddings.model
74
  )
75
 
76
+
77
+ collection_name = f"pdf_to_parse_{clean_text(file.name)}"
78
  if collection_name not in (x.name for x in client.get_collections().collections):
79
  client.create_collection(
80
  collection_name=collection_name,
 
129
 
130
 
131
  file = files[0]
132
+ msg = cl.Message(content=f"Processing `{file.name}`...", disable_human_feedback=True)
133
  await msg.send()
134
 
135
 
 
 
 
 
136
  # get rag chain
137
+ retriever, already_exist = await get_retriever(file)
138
+ # retriever, already_exist = await get_retriever(file.name.split('pdf')[0], chunks)
139
  rag_chain = get_rag(retriever)
140
 
141
  # Let the user know that the system is ready