File size: 3,035 Bytes
9b5b26a
 
 
 
c19d193
6aae614
0759335
 
8fe992b
9b5b26a
 
8a10c10
 
 
d2943b1
089099f
20d75d2
 
8a10c10
 
 
 
dbfe74a
8a10c10
a3cfb1d
 
 
 
 
 
 
 
 
 
dbfe74a
a3cfb1d
8a10c10
6da24fc
 
 
20d75d2
6da24fc
 
20d75d2
a3cfb1d
6da24fc
 
dbfe74a
a3cfb1d
 
 
 
20d75d2
a3cfb1d
13a73d4
a3cfb1d
13a73d4
 
8c01ffb
6aae614
0759335
 
 
 
ae7a494
 
 
 
e121372
bf6d34c
 
29ec968
fe328e0
13d500a
8c01ffb
 
9b5b26a
 
8c01ffb
861422e
 
9b5b26a
8c01ffb
8fe992b
0759335
8c01ffb
 
 
 
 
 
861422e
8fe992b
 
9b5b26a
a3cfb1d
20d75d2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
import datetime
import requests
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
from tools.web_search import DuckDuckGoSearchTool
from tools.visit_webpage import VisitWebpageTool

from Gradio_UI import GradioUI

from kokoro import KPipeline
import soundfile as sf
import os
import numpy as np

import gradio as gr


# Shared Kokoro text-to-speech pipeline, built once when the module loads and
# reused by text_to_speech_kokoro. Language code "a" selects American English.
pipeline = KPipeline(lang_code="a")

@tool
def text_to_speech_kokoro(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Convert text to speech using the Kokoro-82M model.

    Args:
        text: The text to be converted to speech.
        voice: The voice to use for speech synthesis. Defaults to 'af_heart'.
        speed: The speed of the speech. Defaults to 1.0.

    Returns:
        str: The path to the generated audio file, or a message beginning with
        "Error generating speech:" when synthesis or saving fails.
    """
    try:
        # The pipeline yields one 3-tuple per text chunk (the text is split on
        # runs of newlines); only the third element, the audio, is kept.
        chunks = [
            samples
            for _, _, samples in pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
        ]
        if not chunks:
            raise ValueError("No audio generated.")

        # Always write to the same file inside the "tools" folder, creating
        # the folder on first use. Each call overwrites the previous output.
        os.makedirs("tools", exist_ok=True)
        out_path = os.path.join("tools", "output.wav")

        # Kokoro outputs at 24 kHz.
        sf.write(out_path, np.concatenate(chunks), 24000)
        return out_path
    except Exception as exc:
        # Failures are reported as text instead of being raised, so the caller
        # always receives a string.
        return f"Error generating speech: {exc!s}"



# Tool instances that are passed to the CodeAgent's `tools` list further down.
final_answer = FinalAnswerTool()
# DuckDuckGoSearchTool is imported twice at the top of this file (from
# smolagents and from tools.web_search); the later import, tools.web_search,
# is the one bound to the name at this point.
web_search_tool = DuckDuckGoSearchTool()
visit_webpage_tool = VisitWebpageTool()



# Language model that backs the agent.
# If the agent does not answer, the model is probably overloaded. In that case
# use another model, or the following Hugging Face endpoint, which also serves
# Qwen2.5 Coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'

model = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # this model may be overloaded at times
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)


# Fetch the text-to-image tool from the Hub.
# NOTE(review): trust_remote_code=True allows code shipped with that Hub
# repository to run locally; keep it only for repositories you trust.
image_generation_tool = load_tool(
    "agents-course/text-to-image",
    trust_remote_code=True,
)

# Load the agent's prompt templates from prompts.yaml. The encoding is pinned
# to UTF-8 so the file is decoded the same way on every host, instead of
# depending on the platform's locale default.
with open("prompts.yaml", "r", encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)
    
# Assemble the agent from the model, the tools and the prompt templates.
# Every entry in `tools` must be a name that is defined or imported in this
# module; an undefined name here raises NameError before the UI can start.
# NOTE: get_current_time_in_timezone, get_random_cocktail and search_dad_jokes
# are not defined in this file, so they are not listed. Add them back to the
# list once they are implemented or imported.
agent = CodeAgent(
    model=model,
    tools=[  # add your tools here (don't remove final_answer)
        visit_webpage_tool,
        web_search_tool,
        final_answer,
        image_generation_tool,
        text_to_speech_kokoro,
    ],
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates,
)


# Wrap the agent in the project's Gradio front end and start it.
ui = GradioUI(agent)
ui.launch()