# import packages
import json
import os

import chromadb
import gradio as gr
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
from chromadb.utils.data_loaders import ImageLoader
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from PIL import Image
from transformers import pipeline

# Initialize the Llama model
llm = Llama(
    ## original model
    # model_path=hf_hub_download(
    #     repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    #     filename="Phi-3-mini-4k-instruct-q4.gguf",
    # ),
    ## compressed model
    model_path=hf_hub_download(
        repo_id="TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
        filename="capybarahermes-2.5-mistral-7b.Q2_K.gguf",
    ),
    n_ctx=2048,
    n_gpu_layers=50,  # adjust based on your VRAM
)

# Use the CLIP model to embed both images and text
client = chromadb.PersistentClient(path="DB")
embedding_function = OpenCLIPEmbeddingFunction()
image_loader = ImageLoader()  # required when the collection loads images from URIs

# Initialize separate collections for image and text data
collection_images = client.create_collection(
    name='collection_images',
    embedding_function=embedding_function,
    data_loader=image_loader,
)

collection_text = client.create_collection(
    name='collection_text',
    embedding_function=embedding_function,
)

# Get the URIs of the images
IMAGE_FOLDER = 'images'
image_uris = sorted(
    os.path.join(IMAGE_FOLDER, image_name)
    for image_name in os.listdir(IMAGE_FOLDER)
    if not image_name.endswith('.txt')
)
ids = [str(i) for i in range(len(image_uris))]

collection_images.add(ids=ids, uris=image_uris)

# Path to the backup file
load_path = 'text_collection_backup.json'

# Load the data from the JSON file
with open(load_path, 'r') as f:
    loaded_data = json.load(f)

# Extract the documents and IDs
loaded_documents = loaded_data['documents']
loaded_ids = loaded_data['ids']

# Add the backed-up documents to the text collection
collection_text.add(
    documents=loaded_documents,
    ids=loaded_ids,
)
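# --- Optional helper (a sketch, not part of the original pipeline): shows how a
# backup file like 'text_collection_backup.json' could be produced in the first
# place. It relies on ChromaDB's Collection.get(), which returns the stored
# 'documents' and 'ids'; the helper name save_text_collection is hypothetical.
def save_text_collection(collection, save_path='text_collection_backup.json'):
    data = collection.get()  # fetch all stored documents and their ids
    with open(save_path, 'w') as f:
        json.dump({'documents': data['documents'], 'ids': data['ids']}, f)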
# Initialize the transcriber
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    device="cuda",
)

# Preload TTS models
preload_models()

image_path = "dom_bremen.jpg"
absolute_path = os.path.abspath(image_path)


def transcribe(audio):
    sr, y = audio
    y = y.astype(np.float32)
    if y.ndim > 1:  # down-mix stereo microphone input to mono
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silent input
        y /= peak
    return transcriber({"sampling_rate": sr, "raw": y})["text"]


fixed_prompt = "en_speaker_5"


def generate_audio_output(text):
    audio_arr = generate_audio(text, history_prompt=fixed_prompt)
    audio_arr = (audio_arr * 32767).astype(np.int16)  # float [-1, 1] -> 16-bit PCM
    return (SAMPLE_RATE, audio_arr)


# Retrieve context and an image from the vector store, then generate a response
def generate_text(message, max_tokens=150, temperature=0.2, top_p=0.9):
    try:
        # Retrieve the closest image and text passage for the query
        retrieved_image = collection_images.query(
            query_texts=message, include=['uris'], n_results=1
        )
        context_text = collection_text.query(query_texts=message, n_results=1)

        # Query results are nested lists: one list of hits per query text
        documents = context_text.get('documents') or [[]]
        context = documents[0][0] if documents[0] else "No relevant context found."

        uris = retrieved_image.get('uris') or [[]]
        image_url = uris[0][0] if uris[0] else None

        # Log the image URL for debugging
        print(f"Retrieved image URL: {image_url}")

        # Create the prompt template for the LLM
        prompt_template = (
            f"Context: {context}\n\n"
            f"Question: {message}\n\n"
            "You are a guide to the city of Bremen, Germany. "
            "Generate a response based on the context."
        )

        # Generate text using the language model
        output = llm(
            prompt_template,
            temperature=temperature,
            top_p=top_p,
            top_k=50,
            repeat_penalty=1.1,
            max_tokens=max_tokens,
        )

        # Clean up the generated text
        input_string = output['choices'][0]['text'].strip()
        cleaned_text = input_string.strip("[]'").replace('\\n', '\n')

        return cleaned_text, image_url
    except Exception as e:
        return f"Error: {str(e)}", None


# Load and return an image from a file path
def load_image_from_path(file_path):
    try:
        return Image.open(file_path)
    except Exception as e:
        print(f"Error loading image: {str(e)}")
        return None


def process_audio(audio):
    # Transcribe the audio, then generate the answer and retrieve the image
    transcribed_text = transcribe(audio)
    text_output, image_path = generate_text(transcribed_text)

    # Handle cases where no image is retrieved
    image_output = load_image_from_path(image_path) if image_path else None

    # Generate the spoken version of the answer
    audio_output = generate_audio_output(text_output)

    return text_output, audio_output, image_output


# Define the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["microphone"], label="Input Audio"),
    outputs=[
        gr.Textbox(label="Generated Text"),
        gr.Audio(label="Generated Audio"),
        gr.Image(label="Retrieved Image"),
    ],
    title="moinBremen - Your Personal Tour Guide for our City of Bremen",
    description=(
        "Ask your question about Bremen by speaking into the microphone. "
        "The system will transcribe your question, generate a response, "
        "and read it out loud."
    ),
    css=""".gradio-container {
        background: url('file=/content/dom_bremen.jpg') no-repeat center center fixed;
        background-size: cover;
    }""",
)

demo.launch(allowed_paths=[absolute_path])
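# --- Optional smoke test (a sketch, not part of the original script): exercises the
# retrieval + generation path directly, without the microphone round-trip. The sample
# question is illustrative only. Run this instead of demo.launch() while debugging:
# text, image = generate_text("Tell me about the Bremen Town Musicians")
# print(text, image)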