"""Gradio front-end combining chat, audio transcription, image captioning and image generation.

The module builds several Gradio interfaces (an OpenAI-backed chat, a Whisper
speech-to-text tab, and two Hugging Face hosted models) and serves them as a
single tabbed application.
"""

import html
import os
import tempfile
import urllib
import urllib.parse

import gradio as gr
import openai
import pytube as pt
import torch
from huggingface_hub import model_info
from transformers import pipeline

openai.api_key = os.getenv('OPEN_AI_KEY')
# Read the token from the environment, mirroring the OpenAI key above. The
# original assigned the literal string 'HF_TOKEN_KEY'. The value is not used
# anywhere else in this file.
hf_t_key = os.getenv('HF_TOKEN_KEY')

MODEL_NAME = "openai/whisper-small"
# Fine-tuned chat model used by `predict`.
CHAT_MODEL = 'ft:gpt-4o-mini-2024-07-18:2292030-peach-tech:colleague-ai:9wqEywaW'
lang = "en"

# The transformers pipeline takes a CUDA device index or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Force the decoder prompt so the model always transcribes in `lang`.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang, task="transcribe"
)


def transcribe(microphone, file_upload):
    """Transcribe audio from the microphone or from an uploaded file.

    Args:
        microphone: File path of the microphone recording, or None.
        file_upload: File path of the uploaded audio, or None.

    Returns:
        The transcribed text. When both inputs are given, the microphone
        recording is transcribed and a warning is prepended. When neither is
        given, an error message string is returned instead.
    """
    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded a recorded audio file . "
            "The recorded file from the microphone uploaded, transcribed and immediately discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    file = microphone if microphone is not None else file_upload
    text = pipe(file)["text"]
    return warn_output + text


def _extract_video_id(yt_url):
    """Return the YouTube video id contained in *yt_url*.

    Handles ``watch?v=<id>`` URLs (with or without extra query parameters),
    ``youtu.be/<id>`` short links and ``/embed/<id>``, ``/shorts/<id>``,
    ``/live/<id>``, ``/v/<id>`` paths. Anything else falls back to the text
    after the last ``?v=``.
    """
    parsed = urllib.parse.urlparse(yt_url.strip())

    ids_in_query = urllib.parse.parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]

    segments = [segment for segment in parsed.path.split("/") if segment]
    if parsed.netloc.lower().endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    return yt_url.split("?v=")[-1]


def _return_yt_html_embed(yt_url):
    """Build an HTML snippet that embeds the YouTube player for *yt_url*.

    The video id is HTML-escaped because the URL comes straight from a user
    text box and is interpolated into an attribute value.
    """
    video_id = html.escape(_extract_video_id(yt_url), quote=True)
    # NOTE(review): the iframe markup was reconstructed because the literal in
    # the source was empty and unterminated; confirm the size and wrapper.
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed, text)`` tuple: the embed snippet for the video and
        the transcription of its audio.

    Raises:
        gr.Error: If the video exposes no audio-only stream.
    """
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise gr.Error("No audio stream is available for this video.")

    # Download into a per-request temporary directory so queued requests do
    # not overwrite each other's file, and so the audio is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(audio_path)["text"]

    return html_embed_str, text


def predict(message, history):
    """Stream a chat completion for *message* given the previous *history*.

    Args:
        message: The latest user message.
        history: Iterable of ``(user_text, assistant_text)`` pairs.

    Yields:
        The assistant reply accumulated so far, once per streamed chunk that
        carries content.
    """
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=history_openai_format,
        temperature=1.0,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        choices = chunk['choices']
        if not choices:
            continue
        # Some deltas carry only a role or are empty; skip them rather than
        # failing on a missing 'content' key.
        content = choices[0]['delta'].get('content')
        if content is None:
            continue
        partial_message = partial_message + content
        yield partial_message


A1 = gr.ChatInterface(
    predict,
    title="COLLEAGUE",
    description=(
        "An All-In-One AI Productivity Suite By Peach State Innovation and Technology. \n"
        "Select The Corresponding Tab For Accessibility"
    ),
    textbox=gr.Textbox(placeholder="Enter your question/prompt here..."),
    theme=gr.themes.Glass(primary_hue="neutral", neutral_hue="slate"),
    retry_btn=None,
    clear_btn="Clear Conversation",
)

A3 = gr.load(
    "models/Salesforce/blip-image-captioning-large",
    title=" ",
    description="Upload Any Type of Imagery (photos, medical imagery, etc.), I'll Give You Its Description",
    outputs=[gr.Textbox(label="I see...")],
    theme=gr.themes.Glass(primary_hue="neutral", neutral_hue="slate"),
)

A4 = gr.load(
    "models/stabilityai/stable-diffusion-xl-base-1.0",
    inputs=[gr.Textbox(label="Enter Your Image Description")],
    outputs=[gr.Image(label="Image")],
    title=" ",
    description="Bring Your Imagination Into Existence, Create Unique Images With COLLEAGUE",
    allow_flagging="never",
    examples=[
        "A gigantic celtic leprechaun wandering the streets of downtown Atlanta",
        "A child eating pizza in a Brazilian favela",
    ],
)

# The "Write" and "Summarize" tabs: their HTML value in this file is a single space.
A5 = gr.HTML(
    value=(""" """),
)

A6 = gr.HTML(
    value=(""" """),
)

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Microphone(type="filepath"),
        gr.Audio(type="filepath"),
    ],
    outputs="text",
    title=" ",
    description=(
        "Transcribe real-time speech and audio files of any length at the click of a button."
    ),
    allow_flagging="never",
)

# NOTE: this rebinds the name `yt_transcribe` from the function to the
# interface wrapping it. The interface is not part of the tab list below.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(
            lines=1,
            placeholder="Paste your YouTube video URL/web address here",
            label="YouTube Video URL",
        )
    ],
    outputs=["html", "text"],
    title=" ",
    description=(
        "Short on Time? Get The Core Details and Transcribe YouTube videos at the click of a button."
    ),
    allow_flagging="never",
)

clp = gr.TabbedInterface(
    [A1, A5, A6, mf_transcribe, A3, A4],
    ["Chat", "Write", "Summarize", "Audio Transcription", "Describe", "Create"],
    theme=gr.themes.Glass(primary_hue="neutral", neutral_hue="slate"),
)

clp.queue().launch()