Spaces:

syedmudassir16 committed on
Commit
d734b57
1 Parent(s): 0217d37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -77
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import os
2
- import multiprocessing
3
- import concurrent.futures
4
  from langchain.document_loaders import TextLoader, DirectoryLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain.vectorstores import FAISS
@@ -10,90 +8,51 @@ import torch
10
  import numpy as np
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
12
  from datetime import datetime
13
- import json
14
  import gradio as gr
15
  import re
16
  from threading import Thread
17
 
18
- from llama_index.core import VectorStoreIndex, Document
19
- from llama_index.core.tools import QueryEngineTool, ToolMetadata
20
- from llama_index.agent.openai import OpenAIAgent
21
- from llama_index.llms.openai import OpenAI
22
- from llama_index.embeddings.openai import OpenAIEmbedding
23
-
24
  class MultiDocumentAgentSystem:
25
- def __init__(self, documents_dict, llm, embed_model):
26
- self.llm = llm
27
- self.embed_model = embed_model
28
- self.document_agents = {}
29
- self.create_document_agents(documents_dict)
30
- self.top_agent = self.create_top_agent()
31
-
32
- def create_document_agents(self, documents_dict):
33
- for doc_name, doc_content in documents_dict.items():
34
- vector_index = VectorStoreIndex.from_documents([Document(text=doc_content)])
35
- summary_index = VectorStoreIndex.from_documents([Document(text=doc_content)])
36
-
37
- vector_query_engine = vector_index.as_query_engine(similarity_top_k=2)
38
- summary_query_engine = summary_index.as_query_engine()
39
-
40
- query_engine_tools = [
41
- QueryEngineTool(
42
- query_engine=vector_query_engine,
43
- metadata=ToolMetadata(
44
- name=f"vector_tool_{doc_name}",
45
- description=f"Useful for specific questions about {doc_name}",
46
- ),
47
- ),
48
- QueryEngineTool(
49
- query_engine=summary_query_engine,
50
- metadata=ToolMetadata(
51
- name=f"summary_tool_{doc_name}",
52
- description=f"Useful for summarizing content about {doc_name}",
53
- ),
54
- ),
55
- ]
56
-
57
- self.document_agents[doc_name] = OpenAIAgent.from_tools(
58
- query_engine_tools,
59
- llm=self.llm,
60
- verbose=True,
61
- system_prompt=f"You are an agent designed to answer queries about {doc_name}.",
62
- )
63
-
64
- def create_top_agent(self):
65
- all_tools = []
66
- for doc_name, agent in self.document_agents.items():
67
- doc_tool = QueryEngineTool(
68
- query_engine=agent,
69
- metadata=ToolMetadata(
70
- name=f"tool_{doc_name}",
71
- description=f"Use this tool for questions about {doc_name}",
72
- ),
73
- )
74
- all_tools.append(doc_tool)
75
-
76
- obj_index = VectorStoreIndex.from_objects(all_tools, embed_model=self.embed_model)
77
-
78
- return OpenAIAgent.from_tools(
79
- all_tools,
80
- llm=self.llm,
81
- verbose=True,
82
- system_prompt="You are an agent designed to answer queries about multiple documents.",
83
- tool_retriever=obj_index.as_retriever(similarity_top_k=3),
84
- )
85
 
86
  def query(self, user_input):
87
- return self.top_agent.chat(user_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  class DocumentRetrievalAndGeneration:
90
  def __init__(self, embedding_model_name, lm_model_id, data_folder):
91
  self.documents_dict = self.load_documents(data_folder)
92
  self.embeddings = SentenceTransformer(embedding_model_name)
93
  self.tokenizer, self.model = self.initialize_llm(lm_model_id)
94
- self.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
95
- self.embed_model = OpenAIEmbedding()
96
- self.multi_doc_system = MultiDocumentAgentSystem(self.documents_dict, self.llm, self.embed_model)
97
 
98
  def load_documents(self, folder_path):
99
  documents_dict = {}
@@ -102,7 +61,7 @@ class DocumentRetrievalAndGeneration:
102
  file_path = os.path.join(folder_path, file_name)
103
  with open(file_path, 'r', encoding='utf-8') as file:
104
  content = file.read()
105
- documents_dict[file_name[:-4]] = content # Use filename without .txt as key
106
  return documents_dict
107
 
108
  def initialize_llm(self, model_id):
@@ -132,7 +91,7 @@ class DocumentRetrievalAndGeneration:
132
  top_k=20,
133
  temperature=0.8,
134
  repetition_penalty=1.2,
135
- eos_token_id=[128001, 128008, 128009],
136
  streamer=streamer,
137
  )
138
 
@@ -157,8 +116,8 @@ class DocumentRetrievalAndGeneration:
157
  return response, related_queries
158
 
159
  if __name__ == "__main__":
160
- embedding_model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
161
- lm_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
162
  data_folder = 'sample_embedding_folder2'
163
 
164
  doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder)
 
1
  import os
 
 
2
  from langchain.document_loaders import TextLoader, DirectoryLoader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain.vectorstores import FAISS
 
8
  import numpy as np
9
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
10
  from datetime import datetime
 
11
  import gradio as gr
12
  import re
13
  from threading import Thread
14
 
 
 
 
 
 
 
15
  class MultiDocumentAgentSystem:
16
+ def __init__(self, documents_dict, model, tokenizer, embeddings):
17
+ self.model = model
18
+ self.tokenizer = tokenizer
19
+ self.embeddings = embeddings
20
+ self.document_vectors = self.create_document_vectors(documents_dict)
21
+
22
+ def create_document_vectors(self, documents_dict):
23
+ document_vectors = {}
24
+ for doc_name, content in documents_dict.items():
25
+ vectors = self.embeddings.encode(content, convert_to_tensor=True)
26
+ document_vectors[doc_name] = vectors
27
+ return document_vectors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def query(self, user_input):
30
+ query_vector = self.embeddings.encode(user_input, convert_to_tensor=True)
31
+
32
+ # Find the most similar document
33
+ most_similar_doc = max(self.document_vectors.items(),
34
+ key=lambda x: torch.cosine_similarity(query_vector, x[1], dim=0))
35
+
36
+ # Generate response using the most similar document as context
37
+ response = self.generate_response(user_input, most_similar_doc[0], most_similar_doc[1])
38
+ return response
39
+
40
+ def generate_response(self, query, doc_name, doc_vector):
41
+ prompt = f"Based on the document '{doc_name}', answer the following question: {query}"
42
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device)
43
+
44
+ with torch.no_grad():
45
+ output = self.model.generate(input_ids, max_length=150, num_return_sequences=1)
46
+
47
+ response = self.tokenizer.decode(output[0], skip_special_tokens=True)
48
+ return response
49
 
50
  class DocumentRetrievalAndGeneration:
51
  def __init__(self, embedding_model_name, lm_model_id, data_folder):
52
  self.documents_dict = self.load_documents(data_folder)
53
  self.embeddings = SentenceTransformer(embedding_model_name)
54
  self.tokenizer, self.model = self.initialize_llm(lm_model_id)
55
+ self.multi_doc_system = MultiDocumentAgentSystem(self.documents_dict, self.model, self.tokenizer, self.embeddings)
 
 
56
 
57
  def load_documents(self, folder_path):
58
  documents_dict = {}
 
61
  file_path = os.path.join(folder_path, file_name)
62
  with open(file_path, 'r', encoding='utf-8') as file:
63
  content = file.read()
64
+ documents_dict[file_name[:-4]] = content
65
  return documents_dict
66
 
67
  def initialize_llm(self, model_id):
 
91
  top_k=20,
92
  temperature=0.8,
93
  repetition_penalty=1.2,
94
+ eos_token_id=self.tokenizer.eos_token_id,
95
  streamer=streamer,
96
  )
97
 
 
116
  return response, related_queries
117
 
118
  if __name__ == "__main__":
119
+ embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
120
+ lm_model_id = "facebook/opt-350m" # You can change this to a different open-source model
121
  data_folder = 'sample_embedding_folder2'
122
 
123
  doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder)