jeevan committed on
Commit
637aeec
·
1 Parent(s): 249d2c8
RagPipeline.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from aimakerspace.openai_utils.prompts import (
2
+ UserRolePrompt,
3
+ SystemRolePrompt,
4
+ AssistantRolePrompt,
5
+ )
6
+ from aimakerspace.vectordatabase import VectorDatabase
7
+ from aimakerspace.openai_utils.chatmodel import ChatOpenAI
8
+
9
class RetrievalAugmentedQAPipeline:
    """Answer questions using passages retrieved from a vector store.

    For every query the pipeline asks the retriever for the closest stored
    passages, injects them into the user prompt as context, and streams the
    chat model's reply.
    """

    def __init__(
        self,
        system_role_prompt: SystemRolePrompt,
        user_role_prompt: UserRolePrompt,
        # Annotate with the class itself. Writing ``ChatOpenAI()`` here would
        # construct a client every time this ``def`` is evaluated.
        llm: ChatOpenAI,
        vector_db_retriever: VectorDatabase,
    ) -> None:
        """Store the collaborators used by ``arun_pipeline``.

        Args:
            system_role_prompt: Template whose ``create_message()`` produces
                the system message.
            user_role_prompt: Template whose
                ``create_message(question=..., context=...)`` produces the
                user message.
            llm: Chat model exposing an async-iterable ``astream(messages)``.
            vector_db_retriever: Store exposing
                ``search_by_text(query, k=...)``.
        """
        self.system_role_prompt = system_role_prompt
        self.user_role_prompt = user_role_prompt
        self.llm = llm
        self.vector_db_retriever = vector_db_retriever

    async def arun_pipeline(self, user_query: str):
        """Retrieve context for ``user_query`` and start a streamed answer.

        Args:
            user_query: The question to answer.

        Returns:
            A dict with two keys: ``"response"`` is an async generator that
            yields the model's output chunks as they arrive, and
            ``"context"`` is the retriever's result list, unchanged.
        """
        context_list = self.vector_db_retriever.search_by_text(user_query, k=4)

        # Walk every hit, not the fields of the first one. Each hit is
        # expected to be a (text, score) pair, so element 0 is the passage.
        context_prompt = "".join(context[0] + "\n" for context in context_list)

        formatted_system_prompt = self.system_role_prompt.create_message()
        formatted_user_prompt = self.user_role_prompt.create_message(
            question=user_query, context=context_prompt
        )

        async def generate_response():
            # Relay chunks lazily so the caller can stream them onward.
            async for chunk in self.llm.astream(
                [formatted_system_prompt, formatted_user_prompt]
            ):
                yield chunk

        return {"response": generate_response(), "context": context_list}
aimakerspace/openai_utils/embedding.py CHANGED
@@ -28,13 +28,6 @@ class EmbeddingModel:
28
 
29
  return [embeddings.embedding for embeddings in embedding_response.data]
30
 
31
- async def async_get_embeddings_openai(self, list_of_text: List[str]) :
32
- embedding_response = await self.async_client.embeddings.create(
33
- input=list_of_text, model=self.embeddings_model_name, dimensions=self.dimensions
34
- )
35
-
36
- return embedding_response
37
-
38
  async def async_get_embedding(self, text: str) -> List[float]:
39
  embedding = await self.async_client.embeddings.create(
40
  input=text, model=self.embeddings_model_name, dimensions=self.dimensions
 
28
 
29
  return [embeddings.embedding for embeddings in embedding_response.data]
30
 
 
 
 
 
 
 
 
31
  async def async_get_embedding(self, text: str) -> List[float]:
32
  embedding = await self.async_client.embeddings.create(
33
  input=text, model=self.embeddings_model_name, dimensions=self.dimensions
aimakerspace/text_utils.py CHANGED
@@ -45,7 +45,7 @@ class PdfFileLoader:
45
  def load(self):
46
  if os.path.isdir(self.path):
47
  self.load_directory()
48
- elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
49
  self.load_file()
50
  else:
51
  raise ValueError(
 
45
  def load(self):
46
  if os.path.isdir(self.path):
47
  self.load_directory()
48
+ elif self.path.endswith(".pdf"):
49
  self.load_file()
50
  else:
51
  raise ValueError(
aimakerspace/vectordatabase.py CHANGED
@@ -1,5 +1,6 @@
1
  from enum import Enum
2
  import numpy as np
 
3
  from collections import defaultdict
4
  from typing import List, Tuple, Callable
5
  from aimakerspace.openai_utils.embedding import EmbeddingModel
@@ -73,9 +74,31 @@ class VectorDatabase:
73
  self.vectors = defaultdict(np.array)
74
  if vector_db_options == VectorDatabaseOptions.QDRANT:
75
  self.qdrant_client = QdrantClient(":memory:")
 
 
 
 
 
 
 
 
76
 
77
  def insert(self, key: str, vector: np.array) -> None:
78
- self.vectors[key] = vector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  def search(
81
  self,
@@ -83,11 +106,18 @@ class VectorDatabase:
83
  k: int,
84
  distance_measure: Callable = cosine_similarity,
85
  ) -> List[Tuple[str, float]]:
86
- scores = [
87
- (key, distance_measure(query_vector, vector))
88
- for key, vector in self.vectors.items()
89
- ]
90
- return sorted(scores, key=lambda x: x[1], reverse=True)[:k]
 
 
 
 
 
 
 
91
 
92
  def search_by_text(
93
  self,
@@ -97,39 +127,17 @@ class VectorDatabase:
97
  return_as_text: bool = False,
98
  ) -> List[Tuple[str, float]]:
99
  query_vector = self.embedding_model.get_embedding(query_text)
100
- if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
101
- results = self.search(query_vector, k, distance_measure)
102
- return [result[0] for result in results] if return_as_text else results
103
- if self.vector_db_options == VectorDatabaseOptions.QDRANT:
104
- search_result = self.qdrant_client.search(collection_name,query_vector=query_vector)
105
- return [(point.payload["text"],point.score) for point in search_result]
106
 
107
  def retrieve_from_key(self, key: str) -> np.array:
108
  return self.vectors.get(key, None)
109
 
110
  async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
111
- if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
112
- embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
113
- for text, embedding in zip(list_of_text, embeddings):
114
- self.insert(text, np.array(embedding))
115
- if self.vector_db_options == VectorDatabaseOptions.QDRANT:
116
- embeddings_response = await self.embedding_model.async_get_embeddings_openai(list_of_text)
117
- points = [
118
- PointStruct(
119
- id=idx,
120
- vector=data.embedding,
121
- payload={"text": text},
122
- )
123
- for idx, (data, text) in enumerate(zip(embeddings_response.data, list_of_text))
124
- ]
125
- self.qdrant_client.create_collection(
126
- collection_name,
127
- vectors_config=VectorParams(
128
- size=self.embedding_model.dimensions,
129
- distance=Distance.COSINE,
130
- ),
131
- )
132
- self.qdrant_client.upsert(collection_name, points)
133
  return self
134
 
135
 
 
1
  from enum import Enum
2
  import numpy as np
3
+ import uuid
4
  from collections import defaultdict
5
  from typing import List, Tuple, Callable
6
  from aimakerspace.openai_utils.embedding import EmbeddingModel
 
74
  self.vectors = defaultdict(np.array)
75
  if vector_db_options == VectorDatabaseOptions.QDRANT:
76
  self.qdrant_client = QdrantClient(":memory:")
77
+ vector_params = VectorParams(
78
+ size=embedding_model.dimensions, # vector size
79
+ distance="Cosine" # distance metric
80
+ )
81
+ self.qdrant_client.recreate_collection(
82
+ collection_name=collection_name,
83
+ vectors_config={"default": vector_params},
84
+ )
85
 
86
  def insert(self, key: str, vector: np.array) -> None:
87
+ idx = str(uuid.uuid4())
88
+ payload = {"text": key}
89
+
90
+ point = PointStruct(
91
+ id=idx,
92
+ vector={"default": vector.tolist()},
93
+ payload=payload
94
+ )
95
+ # Insert the vector into Qdrant with the associated document
96
+ self.qdrant_client.upsert(
97
+ collection_name=collection_name,
98
+ points=[point]
99
+ )
100
+ print(f"Inserted vector with ID {idx}: {vector}")
101
+
102
 
103
  def search(
104
  self,
 
106
  k: int,
107
  distance_measure: Callable = cosine_similarity,
108
  ) -> List[Tuple[str, float]]:
109
+ # if isinstance(query_vector, list):
110
+ # query_vector = np.array(query_vector)
111
+ print(f"Searching in collection: {collection_name} with vector: {query_vector}")
112
+ collection_info = self.qdrant_client.get_collection(collection_name)
113
+ print(f"Collection info: {collection_info}")
114
+
115
+ search_results = self.qdrant_client.search(
116
+ collection_name=collection_name,
117
+ query_vector=query_vector,
118
+ limit=k
119
+ )
120
+ return [(result.payload['text'], result.score) for result in search_results]
121
 
122
  def search_by_text(
123
  self,
 
127
  return_as_text: bool = False,
128
  ) -> List[Tuple[str, float]]:
129
  query_vector = self.embedding_model.get_embedding(query_text)
130
+ results = self.search(query_vector, k, distance_measure)
131
+ return [result[0] for result in results] if return_as_text else results
132
+
 
 
 
133
 
134
  def retrieve_from_key(self, key: str) -> np.array:
135
  return self.vectors.get(key, None)
136
 
137
  async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
138
+ embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
139
+ for text, embedding in zip(list_of_text, embeddings):
140
+ self.insert(text, np.array(embedding))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  return self
142
 
143
 
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from openai import AsyncOpenAI
 
3
  from typing import List
4
  from chainlit.types import AskFileResponse
5
  from chainlit.cli import run_chainlit
@@ -57,25 +58,28 @@ def process_text_file(file: AskFileResponse) -> List[str]:
57
  import tempfile
58
 
59
  with tempfile.NamedTemporaryFile(
60
- mode="w", delete=False, suffix=".txt"
61
  ) as temp_file:
62
  temp_file_path = temp_file.name
63
-
64
- with open(file.path, "r", encoding="utf-8") as f:
65
- text = f.read()
66
-
67
- with open(temp_file_path, "w") as f:
68
- f.write(text)
69
 
70
  text_loader = TextFileLoader(temp_file_path)
71
  documents = text_loader.load_documents()
72
  texts = []
73
  for doc in documents:
74
- texts.append(text_splitter.split_text(doc))
75
  return texts
76
 
77
  def process_pdf_file(file: AskFileResponse) -> List[str]:
78
- pdf_loader = PdfFileLoader(file.path)
 
 
 
 
 
 
 
 
79
  texts = pdf_loader.load_documents() # Also handles splitting the text in this case pages
80
  return texts
81
 
@@ -108,9 +112,9 @@ async def on_chat_start():
108
  texts : List[str] = []
109
  for file in files:
110
  if file.type == "application/pdf":
111
- texts.extend(process_pdf_file(file))
112
  if file.type == "text/plain":
113
- texts.extend(process_text_file(file))
114
 
115
  # await send_new_message(content=f"Processing `{file.name}`...")
116
  msg = cl.Message(content=f"Processing `{file.name}`...")
 
1
  import os
2
  from openai import AsyncOpenAI
3
+ from RagPipeline import RetrievalAugmentedQAPipeline
4
  from typing import List
5
  from chainlit.types import AskFileResponse
6
  from chainlit.cli import run_chainlit
 
58
  import tempfile
59
 
60
  with tempfile.NamedTemporaryFile(
61
+ mode="wb", delete=False, suffix=".txt"
62
  ) as temp_file:
63
  temp_file_path = temp_file.name
64
+ temp_file.write(file.content)
 
 
 
 
 
65
 
66
  text_loader = TextFileLoader(temp_file_path)
67
  documents = text_loader.load_documents()
68
  texts = []
69
  for doc in documents:
70
+ texts += text_splitter.split_text(doc)
71
  return texts
72
 
73
  def process_pdf_file(file: AskFileResponse) -> List[str]:
74
+ import tempfile
75
+ with tempfile.NamedTemporaryFile(
76
+ mode="wb", delete=False, suffix=".pdf"
77
+ ) as temp_file:
78
+ temp_file_path = temp_file.name
79
+ temp_file.write(file.content)
80
+
81
+
82
+ pdf_loader = PdfFileLoader(temp_file_path)
83
  texts = pdf_loader.load_documents() # Also handles splitting the text in this case pages
84
  return texts
85
 
 
112
  texts : List[str] = []
113
  for file in files:
114
  if file.type == "application/pdf":
115
+ texts += process_pdf_file(file)
116
  if file.type == "text/plain":
117
+ texts += process_text_file(file)
118
 
119
  # await send_new_message(content=f"Processing `{file.name}`...")
120
  msg = cl.Message(content=f"Processing `{file.name}`...")
requirements copy.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ numpy
2
+ chainlit==0.7.700
3
+ openai
4
+ langchain-text-splitters
5
+ pypdf
6
+ langchain-community
7
+ qdrant-client
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  numpy==1.26.4
2
  chainlit==0.7.700 # 1.1.402
3
- openai==1.3.5
4
  qdrant-client==1.11.0
5
  langchain-text-splitters
6
  langchain-community
 
1
  numpy==1.26.4
2
  chainlit==0.7.700 # 1.1.402
3
+ openai
4
  qdrant-client==1.11.0
5
  langchain-text-splitters
6
  langchain-community