VanguardAI committed on
Commit a685a6f · verified · 1 Parent(s): c7c3138

Update app.py

Files changed (1)
  1. app.py +62 -26
app.py CHANGED
@@ -135,50 +135,79 @@ class DuckDuckGoSearchRun(Tool):
         return answer
 
 # Function to handle different input types and choose the right tool
-def handle_input(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
-    # Initialize the LLM
-    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
+def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
+    # Initialize the search tool
+    search = DuckDuckGoSearchRun()
 
-    # Initialize tools
     tools = [
-        DuckDuckGoSearchRun(),
-        ImageGeneration(),
-        NumpyCodeCalculator(),
+        Tool(
+            name="Search",
+            func=search.run,
+            description="Useful for searching the internet for general information"
+        ),
+        Tool(
+            name="Image",
+            func=ImageGeneration()._run,
+            description="Useful for generating images based on text descriptions"
+        ),
     ]
 
+    # Add the numpy tool, but with a more specific description
+    tools.append(Tool(
+        name="Numpy",
+        func=NumpyCodeCalculator()._run,
+        description="Useful only for performing numerical computations, not for general searches"
+    ))
+
     # Add the web search tool only if websearch mode is enabled
     if websearch:
-        tools.append(WebSearch())
+        tools.append(Tool(
+            name="Web",
+            func=WebSearch()._run,
+            description="Useful for advanced web searching beyond general information"
+        ))
 
     # Add the document question answering tool only if a document is provided
     if document:
-        tools.append(DocumentQuestionAnswering(document))
-
-    # Handle voice input
-    if voice_only and audio:
-        # TODO: Implement Whisper integration for voice-to-text
-        user_prompt = "Whisper transcription of audio"  # Replace with actual transcription
+        tools.append(Tool(
+            name="Document",
+            func=DocumentQuestionAnswering(document)._run,
+            description="Useful for answering questions about a specific document"
+        ))
 
-    # Handle image and text input
-    if image and user_prompt:
-        image = Image.open(image).convert('RGB')
-        messages = [{"role": "user", "content": [image, user_prompt]}]
-        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
-        return response
+    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
 
     # Check if the input requires any tools
-    requires_tool = any(tool.name.lower() in user_prompt.lower() for tool in tools)
-
-    # Use agent if tools are required, otherwise use LLM directly
-    if requires_tool:
+    requires_tool = False
+    for tool in tools:
+        if tool.name.lower() in user_prompt.lower():
+            requires_tool = True
+            break
+
+    if image or audio or requires_tool:
+        # Initialize the agent
         agent = initialize_agent(
             tools,
             llm,
             agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
             verbose=True
         )
-        response = agent.run(user_prompt)
+
+        if image:
+            image = Image.open(image).convert('RGB')
+            messages = [{"role": "user", "content": [image, user_prompt]}]
+            response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+        elif audio:
+            transcription = client.audio.transcriptions.create(
+                file=(audio.name, audio.read()),
+                model="whisper-large-v3"
+            )
+            user_prompt = transcription.text
+            response = agent.run(user_prompt)
+        else:
+            response = agent.run(user_prompt)
     else:
+        # If no tools are required, use the LLM directly
         response = llm.call(query=user_prompt)
 
     return response
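
Aside (not part of the commit): the heart of this hunk is wrapping each callable in a langchain Tool with a name and a description, handing the list to a ZERO_SHOT_REACT_DESCRIPTION agent, and using a substring match on tool names to decide whether the agent runs at all. A minimal self-contained sketch of that pattern, assuming the pre-0.1 langchain API this file uses; FakeListLLM and add_numbers are stand-ins (for ChatGroq and NumpyCodeCalculator()._run) so the snippet runs offline:

    # Sketch only: FakeListLLM replaces ChatGroq so no API key is needed.
    from langchain.agents import AgentType, Tool, initialize_agent
    from langchain.llms.fake import FakeListLLM

    def add_numbers(query: str) -> str:
        # Toy stand-in for NumpyCodeCalculator()._run
        return str(sum(int(x) for x in query.split()))

    tools = [
        Tool(
            name="Numpy",
            func=add_numbers,
            description="Useful only for performing numerical computations",
        ),
    ]

    user_prompt = "Numpy: add 2 3"
    # Same routing rule the commit uses: substring match on tool names.
    requires_tool = any(tool.name.lower() in user_prompt.lower() for tool in tools)

    if requires_tool:
        # Canned ReAct-format responses drive the agent deterministically.
        llm = FakeListLLM(responses=[
            "Action: Numpy\nAction Input: 2 3",
            "Final Answer: 5",
        ])
        agent = initialize_agent(
            tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
        )
        print(agent.run(user_prompt))  # -> 5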
@@ -394,6 +423,7 @@ def create_ui():
 
     return demo
 
+# Main interface function
 @spaces.GPU(duration=180)
 def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
     print("Starting main_interface function")
@@ -404,7 +434,7 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
     print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
 
     try:
-        response = handle_input(user_prompt, image=image, audio=audio, voice_only=voice_only, websearch=websearch, document=document)
+        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch, document=document)
         print("handle_input function executed successfully")
     except Exception as e:
         print(f"Error in handle_input: {e}")
@@ -412,6 +442,12 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
 
     if voice_only:
         try:
+            transcription = client.audio.transcriptions.create(
+                file=("input.wav", open("input.wav", "rb").read()),
+                model="whisper-large-v3"
+            )
+            user_prompt = transcription.text
+            response = handle_input(user_prompt)
             audio_output = play_voice_output(response)
             print("play_voice_output function executed successfully")
             return "Response generated.", audio_output
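
Both new audio paths go through Groq's hosted Whisper model. For reference, a sketch of that transcription call in isolation, assuming the groq Python client and a GROQ_API_KEY in the environment; "input.wav" is a placeholder file name, echoing the hardcoded one in the voice_only branch above:

    # Sketch only: the Groq Whisper call the new code relies on, in isolation.
    import os
    from groq import Groq

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    with open("input.wav", "rb") as f:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", f.read()),  # (filename, bytes) tuple, as in the diff
            model="whisper-large-v3",      # Groq-hosted Whisper checkpoint
        )
    print(transcription.text)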
 
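One behavioral note on the new routing: requires_tool is a plain substring check, so the agent path is taken only when a tool name literally appears in the prompt (or when an image or audio input forces it). A quick illustration of the rule in isolation:

    # The same matching rule handle_input now uses, shown standalone.
    tool_names = ["search", "image", "numpy", "web", "document"]

    def requires_tool(user_prompt: str) -> bool:
        return any(name in user_prompt.lower() for name in tool_names)

    print(requires_tool("Search for today's news"))  # True: "search" appears
    print(requires_tool("What is 2 + 2?"))           # False: falls through to llm.call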