VanguardAI committed
Commit e39cb32 · verified · 1 parent: c533d1d

Update app.py

Files changed (1): app.py (+6, −44)
app.py CHANGED
@@ -8,13 +8,10 @@ from transformers import AutoModel, AutoTokenizer
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
-from llama_index import VectorStoreIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper
-from llama_index.embeddings import GroqEmbedding
-from llama_index.llms import GroqLLM
 from llama_index.agent import ReActAgent
 from llama_index.tools import FunctionTool
+from llama_index.llms import GroqLLM
 from PIL import Image
-from decord import VideoReader, cpu
 from tavily import TavilyClient
 import requests
 from huggingface_hub import hf_hub_download
@@ -85,34 +82,8 @@ def image_generation(query):
     image.save("output.jpg")
     return "output.jpg"
 
-# Document Question Answering Tool
-def doc_question_answering(query, file_path):
-    # Load documents
-    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
-
-    # Initialize Groq embedding model
-    embed_model = GroqEmbedding()
-
-    # Initialize Groq LLM
-    llm_predictor = LLMPredictor(llm=GroqLLM(model_name=MODEL))
-
-    # Initialize prompt helper
-    prompt_helper = PromptHelper()
-
-    # Create index
-    index = VectorStoreIndex.from_documents(
-        documents,
-        embed_model=embed_model,
-        llm_predictor=llm_predictor,
-        prompt_helper=prompt_helper
-    )
-
-    # Query the index
-    response = index.query(query)
-    return response.response
-
 # Function to handle different input types and choose the right tool
-def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, websearch=False):
+def handle_input(user_prompt, image=None, audio=None, websearch=False):
     if audio:
         if isinstance(audio, str):
             audio = open(audio, "rb")
@@ -128,14 +99,6 @@ def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, webs
         FunctionTool.from_defaults(fn=image_generation, name="Image Generation"),
     ]
 
-    if doc:
-        tools.append(
-            FunctionTool.from_defaults(
-                fn=lambda query: doc_question_answering(query, doc.name),
-                name="Document Question Answering"
-            )
-        )
-
     llm = GroqLLM(model_name=MODEL)
     agent = ReActAgent.from_tools(tools, llm=llm, verbose=True)
 
@@ -162,7 +125,6 @@ def create_ui():
         with gr.Column(scale=1):
            image_input = gr.Image(type="filepath", label="Upload an image", elem_id="image-icon")
            audio_input = gr.Audio(type="filepath", label="Upload audio", elem_id="mic-icon")
-           doc_input = gr.File(type="filepath", label="Upload a document", elem_id="document-icon")
            voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode", elem_id="voice-only-mode")
            websearch_mode = gr.Checkbox(label="Enable Web Search", elem_id="websearch-mode")
         with gr.Column(scale=1):
@@ -173,14 +135,14 @@ def create_ui():
 
     submit.click(
         fn=main_interface,
-        inputs=[user_prompt, image_input, audio_input, doc_input, voice_only_mode, websearch_mode],
+        inputs=[user_prompt, image_input, audio_input, voice_only_mode, websearch_mode],
         outputs=[output_label, audio_output]
     )
 
     voice_only_mode.change(
         lambda x: gr.update(visible=not x),
         inputs=voice_only_mode,
-        outputs=[user_prompt, image_input, doc_input, websearch_mode, submit]
+        outputs=[user_prompt, image_input, websearch_mode, submit]
     )
     voice_only_mode.change(
         lambda x: gr.update(visible=x),
@@ -192,13 +154,13 @@ def create_ui():
 
 # Main interface function
 @spaces.GPU()
-def main_interface(user_prompt, image=None, audio=None, doc=None, voice_only=False, websearch=False):
+def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False):
     vqa_model.to(device='cuda', dtype=torch.bfloat16)
     tts_model.to("cuda")
     unet.to("cuda")
     image_pipe.to("cuda")
 
-    response = handle_input(user_prompt, image=image, audio=audio, doc=doc, websearch=websearch)
+    response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch)
 
     if voice_only:
         audio_output = play_voice_output(response)
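Note: the deleted doc_question_answering helper targeted a long-retired llama_index API (LLMPredictor, PromptHelper, index.query(), plus a GroqEmbedding class that does not appear in any released llama_index version). A minimal sketch of an equivalent tool against the current llama-index package layout might look like the following; the HuggingFace embedding model and the Groq class from llama-index-llms-groq are stand-in assumptions, not what this file used:

# Hypothetical modern equivalent of the removed doc_question_answering tool.
# Assumes llama-index>=0.10 with llama-index-llms-groq and
# llama-index-embeddings-huggingface installed, and GROQ_API_KEY set.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq

MODEL = "llama3-70b-8192"  # placeholder; mirror the app's MODEL constant

def doc_question_answering(query: str, file_path: str) -> str:
    # Load the uploaded file into llama-index Document objects
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    # Embed and index the document chunks in memory
    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    )
    # Synthesize an answer over the retrieved chunks with a Groq-hosted LLM
    return str(index.as_query_engine(llm=Groq(model=MODEL)).query(query))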
 
 
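One pre-existing wrinkle this commit leaves in place: both voice_only_mode.change handlers return a single gr.update while listing several components in outputs, and Gradio expects one return value per output component. A self-contained sketch of a corrected toggle, with component names taken from the diff above, could read:

import gradio as gr

with gr.Blocks() as demo:
    user_prompt = gr.Textbox(label="Prompt")
    image_input = gr.Image(type="filepath", label="Upload an image")
    websearch_mode = gr.Checkbox(label="Enable Web Search")
    voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode")
    submit = gr.Button("Submit")

    # Return one visibility update per listed output component.
    voice_only_mode.change(
        lambda x: [gr.update(visible=not x)] * 4,
        inputs=voice_only_mode,
        outputs=[user_prompt, image_input, websearch_mode, submit],
    )

demo.launch()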