Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,671 Bytes
4b4bdb0 b141d5b 4b4bdb0 fcd1ba4 4b4bdb0 fcd1ba4 4b4bdb0 0cf4b8d fcd1ba4 ee59b20 3d9332b ee59b20 0cf4b8d 4b4bdb0 fcd1ba4 ee59b20 fcd1ba4 87af32a b141d5b fcd1ba4 aacc262 fcd1ba4 0cf4b8d fcd1ba4 0cf4b8d fcd1ba4 0cf4b8d fcd1ba4 93cb0e6 fcd1ba4 4b4bdb0 0cf4b8d 93cb0e6 0cf4b8d 4b4bdb0 fcd1ba4 0cf4b8d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import json
import os
import time
from pathlib import Path
import anthropic
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
Stream,
get_tts_model,
get_twilio_turn_credentials,
)
from fastrtc.utils import audio_to_bytes
from gradio.utils import get_space
from groq import Groq
from pydantic import BaseModel
# Load variables from a local .env file (python-dotenv) before the API clients
# below are constructed, so they can presumably pick their keys up from the
# environment.
load_dotenv()
# Client used by response() for speech-to-text transcription.
groq_client = Groq()
# Client used by response() to generate the assistant's reply.
claude_client = anthropic.Anthropic()
# NOTE(review): tts_client is not referenced anywhere else in the visible code
# (speech is synthesised through tts_model). The os.environ[...] lookup raises
# KeyError at import time when ELEVENLABS_API_KEY is not set.
tts_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
# Directory containing this module; index.html is read from here.
curr_dir = Path(__file__).parent
# Text-to-speech model used by response() to stream the spoken reply.
tts_model = get_tts_model()
def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
    """Handle one spoken user turn: transcribe it, ask Claude, speak the reply.

    This generator is the handler wrapped by ``ReplyOnPause``.

    Args:
        audio: Audio captured for this turn; passed unchanged to
            ``audio_to_bytes`` before being sent for transcription.
        chatbot: Conversation so far as a list of dicts with at least
            ``"role"`` and ``"content"`` keys. ``None`` (or an empty list)
            starts a fresh conversation. The list is not mutated.

    Yields:
        ``AdditionalOutputs`` carrying the history right after the user's
        transcribed message is added, then every audio chunk produced by
        ``tts_model.stream_tts_sync``, and finally ``AdditionalOutputs``
        carrying the history including the assistant's reply. Nothing is
        yielded when the transcript is empty.
    """
    # Work on a private copy so the caller's list is never modified.
    history = list(chatbot or [])
    # Forward only the two keys the Messages API expects; any extra keys
    # present on the history entries are dropped.
    messages = [{"role": d["role"], "content": d["content"]} for d in history]

    prompt = groq_client.audio.transcriptions.create(
        file=("audio-file.mp3", audio_to_bytes(audio)),
        model="whisper-large-v3-turbo",
        response_format="verbose_json",
    ).text
    if not prompt or not prompt.strip():
        # Nothing intelligible was transcribed. Sending an empty user message
        # would be rejected by the Messages API, and the empty turn would stay
        # in the history and break every later request, so skip this turn.
        return

    history.append({"role": "user", "content": prompt})
    # Yield a snapshot: the history keeps growing below, and a consumer that
    # reads this output late must still see the user message as the last entry.
    yield AdditionalOutputs(list(history))

    messages.append({"role": "user", "content": prompt})
    completion = claude_client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=512,
        messages=messages,  # type: ignore
    )
    # Keep only the text blocks of the reply and join them with spaces.
    response_text = " ".join(
        block.text  # type: ignore
        for block in completion.content
        if getattr(block, "type", None) == "text"
    )
    history.append({"role": "assistant", "content": response_text})

    # Stream the synthesised speech chunk by chunk, printing timing
    # information relative to the start of synthesis.
    start = time.time()
    print("starting tts", start)
    for i, chunk in enumerate(tts_model.stream_tts_sync(response_text)):
        print("chunk", i, time.time() - start)
        yield chunk
    print("finished tts", time.time() - start)
    yield AdditionalOutputs(list(history))
# Chat-history component in "messages" format; it is wired below as both an
# additional input to and an additional output of the stream.
chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    # response() is the handler, wrapped in ReplyOnPause (presumably so it is
    # invoked once the speaker pauses — see the fastrtc documentation).
    handler=ReplyOnPause(response),
    # Takes two arguments and returns the second one unchanged; presumably
    # (current component value, value emitted via AdditionalOutputs) — confirm
    # against the fastrtc documentation.
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    # TURN credentials, the concurrency cap and the time limit are applied
    # only when get_space() is truthy; otherwise each is None.
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
)
class Message(BaseModel):
    """One chat message, as carried in the ``chatbot`` list of ``InputData``."""

    # Speaker of the message; response() writes "user" and "assistant".
    role: str
    # Text of the message.
    content: str
class InputData(BaseModel):
    """Request body accepted by ``POST /input_hook``."""

    # Identifier of the WebRTC connection whose handler input is being set.
    webrtc_id: str
    # Conversation history to pass to the handler for that connection.
    chatbot: list[Message]
# FastAPI application that serves the routes defined below. stream.mount(app)
# attaches the stream to it (presumably registering fastrtc's own endpoints —
# see the fastrtc documentation).
app = FastAPI()
stream.mount(app)
@app.get("/")
async def _():
    """Serve ``index.html`` with the WebRTC configuration inlined.

    The ``__RTC_CONFIGURATION__`` placeholder in the page is replaced with the
    JSON encoding of the TURN credentials when ``get_space()`` is truthy, and
    with ``null`` otherwise.

    Returns:
        An ``HTMLResponse`` with status 200 containing the rendered page.
    """
    rtc_config = get_twilio_turn_credentials() if get_space() else None
    # Decode explicitly as UTF-8: without an encoding argument read_text()
    # falls back to the locale's default, which can mis-decode or fail on
    # non-ASCII characters in the page on hosts with a non-UTF-8 locale.
    html_content = (curr_dir / "index.html").read_text(encoding="utf-8")
    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=html_content, status_code=200)
@app.post("/input_hook")
async def _(body: InputData):
    """Set the handler input for one WebRTC connection.

    The posted conversation history is converted to plain dicts and handed to
    ``stream.set_input`` under the connection's ``webrtc_id``.

    Returns:
        ``{"status": "ok"}`` once the input has been set.
    """
    payload = body.model_dump()
    history = payload["chatbot"]
    stream.set_input(body.webrtc_id, history)
    return {"status": "ok"}
@app.get("/outputs")
def _(webrtc_id: str):
    """Stream the handler's additional outputs for a connection as server-sent events.

    Each output's first argument is taken as the conversation history, and its
    last entry is sent as the JSON payload of an ``output`` event.

    Args:
        webrtc_id: Identifier of the WebRTC connection to follow.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``.
    """

    async def event_source():
        async for emitted in stream.output_stream(webrtc_id):
            history = emitted.args[0]
            latest = json.dumps(history[-1])
            yield f"event: output\ndata: {latest}\n\n"

    return StreamingResponse(event_source(), media_type="text/event-stream")
if __name__ == "__main__":
    import os

    # The MODE environment variable selects how the demo is served; every
    # variant binds to 0.0.0.0 on port 7860.
    mode = os.getenv("MODE")
    if mode == "UI":
        # Built-in UI of the stream.
        stream.ui.launch(server_port=7860, server_name="0.0.0.0")
    elif mode == "PHONE":
        # Phone entry point provided by the stream.
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        # Default: serve the FastAPI app defined in this module.
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
|