import os
import re
import time

import gradio as gr
import pinecone
import torch
import wordninja
from transformers import AutoTokenizer, AutoModel

from langchain.agents import AgentExecutor
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import AgentTokenBufferMemory
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.chat_models import ChatOpenAI
from langchain.prompts import MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain.vectorstores import Pinecone

from db_func import insert_one

def clean_text(text):
    # Lowercase and strip all whitespace, then let wordninja re-segment the
    # resulting character run into probable English words. This repairs
    # extracted text whose word boundaries were lost.
    text = text.strip().lower()
    collapsed = ''.join(text.split())
    return ' '.join(wordninja.split(collapsed))
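# Illustrative example (actual segmentation depends on wordninja's word list):
# clean_text("RetinalDetachment isAnEmergency") -> "retinal detachment is an emergency"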
    
def get_bert_embeddings(sentence):
    # Use the [CLS] token's final hidden state as the sentence embedding,
    # the pooling scheme recommended for BGE models. Inputs are truncated to
    # the model's 512-token limit to avoid runtime errors on long passages.
    input_ids = tokenizer.encode(sentence, return_tensors="pt",
                                 truncation=True, max_length=512)
    with torch.no_grad():
        output = model(input_ids)
    return output.last_hidden_state[:, 0, :].numpy().tolist()
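# Note: bge-base-en-v1.5 emits 768-dimensional vectors, so the Pinecone index
# below must have been created with dimension 768.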

model_name = "BAAI/bge-base-en-v1.5"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prompt text kept on disk; currently unused because the system message below
# is hardcoded.
with open("prompts/version_2.txt", "r") as f:
    prompt_file = f.read()

pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)
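# Assumes PINECONE_API_KEY, PINECONE_ENV, and OPENAI_API_KEY are exported in
# the environment before launch, e.g.:
#   export PINECONE_API_KEY=...
#   export PINECONE_ENV=...
#   export OPENAI_API_KEY=...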

index_name = "ophtal-knowledge-base"
index = pinecone.Index(index_name)
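# Wrap the existing index with our embedding function; "text" is the metadata
# field that holds each chunk's raw content.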
vectorstore = Pinecone(index, get_bert_embeddings, "text")
retriever = vectorstore.as_retriever()
tool = create_retriever_tool(
    retriever,
    "search_ophtal-knowledge-base",
    "Searches and returns documents regarding the ophtal-knowledge-base.",
)
tools = [tool]
system_message = SystemMessage(content="You are an assistant to ophthalmologists and your name is 'Dr.V AI'. Help users answer medical questions. You are supposed to answer only medical questions and not general questions.")
memory_key = 'history'

llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4", temperature=0.2)
prompt = OpenAIFunctionsAgent.create_prompt(
    system_message=system_message,
    extra_prompt_messages=[MessagesPlaceholder(variable_name=memory_key)],
)
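# The prompt interleaves the system message with the rolling 'history' buffer
# that AgentTokenBufferMemory populates below.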

memory = AgentTokenBufferMemory(memory_key=memory_key, llm=llm, max_token_limit=4000)

agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)

# return_intermediate_steps exposes the retriever's raw output so run() can
# list sources; max_iterations bounds the tool-calling loop.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=memory,
    verbose=False,
    return_intermediate_steps=True,
    max_iterations=2,
)

user_name = None  # set per session by auth_function

def run(input_):
    # Run the agent, then append de-duplicated sources and cleaned document
    # excerpts taken from the retriever's intermediate steps.
    output = agent_executor({"input": input_})
    output_text = output["output"]
    source_text = ""
    doc_text = ""

    if len(output["intermediate_steps"]) > 0:
        documents = output["intermediate_steps"][0][1]
        sources = []
        docs = []

        for doc in documents:
            if doc.metadata["source"] not in sources:
                sources.append(doc.metadata["source"])
                docs.append(doc.page_content)

        for i in range(len(sources)):
            # Strip file extensions and path boilerplate from the source name
            # to leave a readable title.
            temp = (
                sources[i]
                .replace('.pdf', '')
                .replace('.txt', '')
                .replace("AAO", "")
                .replace("2022-2023", "")
                .replace("data/book", "")
                .replace("text", "")
                .replace("  ", " ")
            )
            source_text += f"{i+1}. {temp}\n"
            cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', clean_text(docs[i]))
            doc_text += f"{i+1}. {cleaned_text}\n"

        output_text = f"{output_text} \n\nSources: \n{source_text}\n\nDocuments: \n{doc_text}"

    # Log the exchange for later review.
    doc_to_insert = {
        "user": user_name,
        "input": input_,
        "output": output_text,
        "source": source_text,
        "documents": doc_text,
    }
    insert_one(doc_to_insert)

    return output_text
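# Illustrative call: run("What are the signs of retinal detachment?") returns
# the agent's answer, followed by "Sources:" and "Documents:" sections whenever
# the retriever tool was invoked.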

def make_conversation(message, history):
    # Stream the reply one character at a time for a typing effect in the
    # Gradio chat UI.
    text_ = run(message)
    for i in range(len(text_)):
        time.sleep(0.001)
        yield text_[: i + 1]

def auth_function(username, password):
    # Record the logged-in user for request logging. The global declaration is
    # required; without it the assignment only binds a local variable.
    global user_name
    user_name = username
    return username == password
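
# The module imports gradio and defines make_conversation/auth_function, but
# the file ends without launching a UI. A minimal sketch of the likely wiring
# (an assumption, not part of the original source):
#
# demo = gr.ChatInterface(make_conversation, title="Dr.V AI")
# demo.launch(auth=auth_function)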