Spaces:
Runtime error
Runtime error
File size: 8,699 Bytes
c6b13bd a26dce9 c6b13bd aaa8c83 c6b13bd 8012949 c6b13bd 8012949 c6b13bd a26dce9 7642e05 32fae2e 7642e05 a26dce9 32fae2e 7642e05 a26dce9 7642e05 a26dce9 7642e05 a26dce9 c6b13bd 32fae2e c6b13bd a26dce9 c6b13bd 32fae2e c6b13bd 989fd36 32fae2e aaa8c83 32fae2e c6b13bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import gradio as gr
from edge_tts import list_voices
import edge_tts
import asyncio
import tempfile
import numpy as np
import soxr
from pydub import AudioSegment
import torch
import sentencepiece as spm
import onnxruntime as ort
from huggingface_hub import hf_hub_download, InferenceClient
import requests
from bs4 import BeautifulSoup
import urllib
import random
import re
import time
# List of user agents to choose from for requests
_useragent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
]
def get_useragent():
    """Pick one User-Agent string at random from ``_useragent_list``."""
    chosen_agent = random.choice(_useragent_list)
    return chosen_agent
def extract_text_from_webpage(html_content, max_chars=8000):
    """Extract the visible text of an HTML document using BeautifulSoup.

    Args:
        html_content: HTML markup to parse.
        max_chars: Maximum number of characters of text to return
            (defaults to 8000, the limit that was previously hard-coded).

    Returns:
        The text left after removing ``script``, ``style``, ``header``,
        ``footer`` and ``nav`` elements, truncated to ``max_chars``.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Drop elements whose content is not part of the page's main text.
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    # A separator is required: with strip=True and no separator, text from
    # adjacent elements is concatenated with nothing in between
    # ("<p>Hello</p><p>World</p>" -> "HelloWorld").
    visible_text = soup.get_text(separator=" ", strip=True)
    return visible_text[:max_chars]
def search(term, num_results=2, timeout=5, ssl_verify=None):
    """Run a Google search and fetch the visible text of each result page.

    Args:
        term: Search query; passed as the ``q`` parameter, so ``requests``
            performs the URL encoding.
        num_results: Value sent as Google's ``num`` parameter.
        timeout: Timeout in seconds applied to the search request and to
            every result-page request.
        ssl_verify: Forwarded as ``verify`` to every ``requests.get`` call.

    Returns:
        A list with one dict per result block found in the search page, each
        holding ``"link"`` and ``"text"``. ``"text"`` is ``None`` when the
        page could not be fetched; both are ``None`` when the block contained
        no link.

    Raises:
        requests.exceptions.RequestException: If the search request itself
            fails or returns an error status.
    """
    all_results = []
    resp = requests.get(
        url="https://www.google.com/search",
        headers={"User-Agent": get_useragent()},  # Set random user agent
        params={
            "q": term,
            "num": num_results,
            "udm": 14,
        },
        timeout=timeout,
        verify=ssl_verify,
    )
    resp.raise_for_status()  # Raise an exception if request fails
    soup = BeautifulSoup(resp.text, "html.parser")
    result_block = soup.find_all("div", attrs={"class": "g"})
    for result in result_block:
        anchor = result.find("a", href=True)
        if not anchor:
            all_results.append({"link": None, "text": None})
            continue
        link = anchor["href"]
        try:
            # Apply the same timeout/verify settings as the search request;
            # without a timeout a single unresponsive site would block the
            # whole search indefinitely.
            webpage = requests.get(
                link,
                headers={"User-Agent": get_useragent()},
                timeout=timeout,
                verify=ssl_verify,
            )
            webpage.raise_for_status()
            # Extract visible text from webpage
            visible_text = extract_text_from_webpage(webpage.text)
            all_results.append({"link": link, "text": visible_text})
        except requests.exceptions.RequestException as e:
            # Best effort: keep the link but record that no text is available.
            print(f"Error fetching or processing {link}: {e}")
            all_results.append({"link": link, "text": None})
    print(all_results)
    return all_results
# Speech-recognition model: Hub repository id and the sample rate audio is resampled to
model_name = "neongeckocom/stt_pt_citrinet_512_gamma_0_25"
sample_rate = 16000


def _onnx_asset(filename):
    """Download ``filename`` from the ``onnx`` subfolder of ``model_name`` via hf_hub_download."""
    return hf_hub_download(model_name, filename, subfolder="onnx")


# Fetch and load the three speech-recognition artifacts: preprocessor, encoder, tokenizer
preprocessor = torch.jit.load(_onnx_asset("preprocessor.ts"))
encoder = ort.InferenceSession(_onnx_asset("model.onnx"))
tokenizer = spm.SentencePieceProcessor(_onnx_asset("tokenizer.spm"))
# Text-generation client (Mistral) and the prompt prefix prepended to every request
client1 = InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
system_instructions1 = "<s>[SYSTEM] Responda como OpenGPT 4o, Feito por 'KingNish', Mantenha a conversa muito curta, clara, amigável e concisa. O texto fornecido é uma solicitação de um tipo específico de resposta sua, assistente virtual. A solicitação pede que você forneça respostas amigáveis. A expectativa é que eu evite apresentações e passe a responder a dúvida diretamente, elaborando todos os aspectos da solicitação. Como um assistente com tecnologia de IA, sua tarefa é gerar respostas que pareçam ter sido criadas pelo usuário e não por um assistente com tecnologia de IA. [USER]"
def resample(audio_fp32, sr):
    """Resample ``audio_fp32`` from rate ``sr`` to the module-level ``sample_rate`` with soxr."""
    target_rate = sample_rate
    return soxr.resample(audio_fp32, sr, target_rate)
def to_float32(audio_buffer):
    """Return ``audio_buffer`` divided by the maximum value of its integer dtype, as float32."""
    # Largest value representable by the buffer's integer dtype (e.g. 32767 for int16).
    full_scale = np.iinfo(audio_buffer.dtype).max
    scaled = np.divide(audio_buffer, full_scale, dtype=np.float32)
    return scaled
def transcribe(audio_path):
    """Transcribe an audio file to text with the speech-recognition model.

    The audio is decoded with pydub, downmixed to mono, scaled to float32,
    resampled to ``sample_rate``, run through ``preprocessor`` and
    ``encoder``, and the per-frame argmax ids are decoded with ``tokenizer``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription string.
    """
    audio_file = AudioSegment.from_file(audio_path)
    # get_array_of_samples() interleaves the channels of multi-channel audio
    # (L, R, L, R, ...); downmix first so the buffer is a plain mono signal
    # at frame_rate rather than interleaved data misread as mono.
    if audio_file.channels > 1:
        audio_file = audio_file.set_channels(1)
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())
    audio_fp32 = to_float32(audio_buffer)
    audio_16k = resample(audio_fp32, sr)
    # Add a leading batch dimension of size 1 to both signal and length.
    input_signal = torch.tensor(audio_16k).unsqueeze(0)
    length = torch.tensor(len(audio_16k)).unsqueeze(0)
    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)
    logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]
    # The id equal to the vocabulary size is treated as the blank symbol and dropped.
    blank_id = tokenizer.vocab_size()
    # NOTE(review): consecutive repeated ids are not collapsed before
    # decoding -- confirm this matches the model's intended decoding.
    decoded_prediction = [p for p in logits.argmax(axis=1).tolist() if p != blank_id]
    text = tokenizer.decode_ids(decoded_prediction)
    return text
def model(text, web_search):
    """Generate a reply to ``text`` with the language model.

    Args:
        text: The user's message.
        web_search: When truthy, ``search(text)`` is run first and its
            results are appended to the prompt after a ``[WEB]`` marker.

    Returns:
        The generated reply: all streamed token texts joined together,
        excluding ``</s>`` tokens.
    """
    if web_search:
        # Feed the search results to the language model alongside the question.
        web_results = search(text)
        web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
        formatted_prompt = system_instructions1 + text + "[WEB]" + str(web2) + "[OpenGPT 4o]"
    else:
        formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
    # Generation is the same for both prompt variants.
    stream = client1.text_generation(formatted_prompt, max_new_tokens=300, stream=True, details=True, return_full_text=False)
    return "".join([response.token.text for response in stream if response.token.text != "</s>"])
async def get_voices():
    """Await ``edge_tts.list_voices()`` and return the voices as a new list."""
    available = await edge_tts.list_voices()
    return [*available]
# Run the async helper once to obtain the available voices
voices = asyncio.run(get_voices())
# Keep only the voices whose locale is Brazilian Portuguese
pt_br_voices = list(filter(lambda voice: voice["Locale"] == "pt-BR", voices))
# Pick the first pt-BR voice, or None when there is none
if pt_br_voices:
    chosen_voice = pt_br_voices[0]["Name"]
else:
    chosen_voice = None
async def respond(audio, web_search):
    """Transcribe ``audio``, generate a reply, and synthesize it to speech.

    Args:
        audio: Path of the recorded audio, or ``None`` when nothing was
            recorded.
        web_search: Forwarded to ``model`` to enable web search.

    Returns:
        Path of a temporary audio file containing the spoken reply, or
        ``None`` when there is no input audio or the reply is empty. The
        file is created with ``delete=False`` and is not removed here.
    """
    if audio is None:
        return None
    user = transcribe(audio)
    reply = model(user, web_search)
    # Nothing to speak; presumably edge-tts cannot produce audio for empty
    # text, so skip synthesis instead of failing.
    if not reply or not reply.strip():
        return None
    # Fall back to edge-tts's default voice when no pt-BR voice was found.
    voice_kwargs = {"voice": chosen_voice} if chosen_voice else {}
    communicate = edge_tts.Communicate(reply, **voice_kwargs)
    # edge-tts writes MP3 data, so the file extension must say so.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    # Save after the handle is closed so the path can be reopened for writing.
    await communicate.save(tmp_path)
    return tmp_path
def transcribe_and_respond(audio, web_search):
    """Synchronous wrapper: drive the ``respond`` coroutine to completion with ``asyncio.run``."""
    pending = respond(audio, web_search)
    return asyncio.run(pending)
with gr.Blocks() as demo:
    with gr.Row():
        web_search = gr.Checkbox(label="Web Search", value=False)
        # Streaming microphone input delivered to callbacks as a file path.
        # (gr.Audio has no min_value/max_value "silence threshold" options,
        # so those arguments were removed.)
        input_audio = gr.Audio(
            sources=["microphone"],
            type="filepath",
            streaming=True,
        )
        output_audio = gr.Audio(label="AI Response", autoplay=True)

    # Per-session state. Callbacks must receive and return these values via
    # inputs/outputs; reading or writing ``.value`` on the component only
    # touches the shared initial value, not the session's state.
    is_recording = gr.State(False)
    last_interaction_time = gr.State(time.time())

    def toggle_recording(is_rec):
        """Return the inverse of the session's recording flag."""
        return not is_rec

    def process_audio(audio, use_web_search, is_rec, last_time):
        """Handle one streamed audio chunk.

        When recording is active and more than 2 seconds have passed since
        the last interaction, produce a spoken reply, clear the recording
        flag and record the current time. Otherwise leave everything as is.

        Returns:
            A tuple ``(output_audio value, is_recording, last_interaction_time)``.
        """
        current_time = time.time()
        if is_rec and (current_time - last_time > 2):
            return transcribe_and_respond(audio, use_web_search), False, current_time
        # gr.update() leaves the output component unchanged; returning None
        # here would clear a reply that is still playing.
        return gr.update(), is_rec, last_time

    input_audio.stream(
        process_audio,
        inputs=[input_audio, web_search, is_recording, last_interaction_time],
        outputs=[output_audio, is_recording, last_interaction_time],
    )
    # Flip the recording flag (initially False) when the page loads.
    demo.load(toggle_recording, inputs=[is_recording], outputs=[is_recording])

if __name__ == "__main__":
    demo.queue(max_size=200).launch()