hu-po committed on
Commit f27306d · 1 Parent(s): b038b32

release 0.2

src/__pycache__/elevenlabs.cpython-310.pyc DELETED
Binary file (4.12 kB)
 
src/__pycache__/elevenlabs.cpython-39.pyc DELETED
Binary file (4.11 kB)
 
src/__pycache__/openailib.cpython-310.pyc DELETED
Binary file (1.23 kB)
 
src/__pycache__/openailib.cpython-39.pyc DELETED
Binary file (1.23 kB)
 
src/__pycache__/tube.cpython-310.pyc DELETED
Binary file (1.82 kB)
 
src/__pycache__/tube.cpython-39.pyc DELETED
Binary file (1.81 kB)
 
src/__pycache__/utils.cpython-310.pyc DELETED
Binary file (639 Bytes)
 
src/__pycache__/utils.cpython-39.pyc DELETED
Binary file (637 Bytes)
 
src/elevenlabs.py DELETED
@@ -1,115 +0,0 @@
-import asyncio
-import io
-import logging
-import os
-import time
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from typing import Dict, List, Union, Tuple
-
-import sounddevice as sd
-import soundfile as sf
-from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
-
-from .utils import timeit
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-
-USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
-
-
-@dataclass
-class Speaker:
-    name: str
-    voice: ElevenLabsVoice
-    color: str
-    description: str = None
-
-
-async def text_to_speechbytes_async(text, speaker, loop):
-    with ThreadPoolExecutor() as executor:
-        speech_bytes = await loop.run_in_executor(executor, text_to_speechbytes, text, speaker.voice)
-    return speech_bytes
-
-
-async def play_history(history: List[Tuple[Speaker, str]]):
-    loop = asyncio.get_event_loop()
-
-    # Create a list of tasks for all text_to_speechbytes function calls
-    tasks = [text_to_speechbytes_async(
-        text, speaker, loop) for speaker, text in history]
-
-    # Run tasks concurrently, waiting for the first one to complete
-    for speech_bytes in await asyncio.gather(*tasks):
-        audioFile = io.BytesIO(speech_bytes)
-        soundFile = sf.SoundFile(audioFile)
-        sd.play(soundFile.read(), samplerate=soundFile.samplerate, blocking=True)
-
-
-async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
-    loop = asyncio.get_event_loop()
-
-    # Create a list of tasks for all text_to_speechbytes function calls
-    tasks = [text_to_speechbytes_async(
-        text, speaker, loop) for speaker, text in history]
-
-    # Run tasks concurrently, waiting for the first one to complete
-    all_speech_bytes = await asyncio.gather(*tasks)
-
-    # Combine all audio bytes into a single audio file
-    concatenated_audio = io.BytesIO(b''.join(all_speech_bytes))
-
-    # Save the combined audio file to disk
-    with sf.SoundFile(concatenated_audio, mode='r') as soundFile:
-        with sf.SoundFile(
-            audio_savepath, mode='w',
-            samplerate=soundFile.samplerate,
-            channels=soundFile.channels,
-        ) as outputFile:
-            outputFile.write(soundFile.read())
-
-
-def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
-    log.info(f"Getting voice {voice}...")
-    _available_voices = USER.get_voices_by_name(voice)
-    if _available_voices:
-        log.info(f"Voice {voice} already exists, found {_available_voices}.")
-        return _available_voices[0]
-    return None
-
-
-@timeit
-def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> ElevenLabsVoice:
-    _voice = check_voice_exists(voice)
-    if _voice is not None:
-        return _voice
-    else:
-        if USER.get_voice_clone_available():
-            assert audio_path is not None, "audio_path must be provided"
-            assert isinstance(audio_path, list), "audio_path must be a list"
-            log.info(f"Cloning voice {voice}...")
-            _audio_source_dict = {
-                # Audio path is a PosixPath
-                _.name: open(_, "rb").read() for _ in audio_path
-            }
-            newVoice = USER.clone_voice_bytes(voice, _audio_source_dict)
-            return newVoice
-        raise ValueError(
-            f"Voice {voice} does not exist and cloning is not available.")
-
-
-@timeit
-def text_to_speech(text: str, voice: ElevenLabsVoice):
-    log.info(f"Generating audio using voice {voice}...")
-    time_start = time.time()
-    voice.generate_and_play_audio(text, playInBackground=False)
-    duration = time.time() - time_start
-    return duration
-
-
-@timeit
-def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
-    log.info(f"Generating audio for voice {voice} text {text}...")
-    audio_bytes = voice.generate_audio_bytes(text)
-    return audio_bytes
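For context, a minimal sketch of how the removed module's API could be driven. This assumes the package is importable as src.elevenlabs, that ELEVENLABS_API_KEY is set, and that a voice with the given name already exists on the account; the voice name, speaker name, dialogue, and output path are placeholders, not taken from the repository.

import asyncio

# Hypothetical driver for the module deleted above; the import path is an assumption.
from src.elevenlabs import Speaker, get_make_voice, play_history, save_history

# Look up an existing voice by name, then wrap it in the Speaker dataclass.
voice = get_make_voice("my-voice")  # placeholder voice name
host = Speaker(name="Host", voice=voice, color="yellow")

# history is a list of (Speaker, text) pairs, one per line of dialogue.
history = [(host, "Welcome back."), (host, "That's all for today.")]

asyncio.run(play_history(history))                 # play through the default audio device
asyncio.run(save_history(history, "episode.wav"))  # or write the concatenated audio to disk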
 
src/openailib.py DELETED
@@ -1,47 +0,0 @@
-import logging
-import os
-
-from .utils import timeit
-
-import openai
-openai.api_key = os.getenv("OPENAI_API_KEY")
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-
-
-@timeit
-def speech_to_text(audio_path):
-    log.info("Transcribing audio...")
-    transcript = openai.Audio.transcribe("whisper-1", open(audio_path, "rb"))
-    text = transcript["text"]
-    log.info(f"Transcript: \n\t{text}")
-    return text
-
-
-@timeit
-def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
-    _prompt = [
-        {
-            "role": "user",
-            "content": prompt,
-        },
-    ]
-    if system:
-        _prompt = [
-            {
-                "role": "system",
-                "content": system,
-            },
-        ] + _prompt
-    log.info(f"API call to {model} with prompt: \n\n\t{_prompt}\n\n")
-    _response = openai.ChatCompletion.create(
-        model=model,
-        messages=_prompt,
-        temperature=temperature,
-        n=1,
-        max_tokens=max_tokens,
-    )
-    log.info(f"API reponse: \n\t{_response}")
-    response: str = _response['choices'][0]['message']['content']
-    return response
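For context, a minimal sketch of how the removed helpers could be called. This assumes the package is importable as src.openailib, that OPENAI_API_KEY is set, and that the legacy openai SDK the module relies on is installed; the audio path and system prompt are placeholders.

# Hypothetical caller for the module deleted above; the import path is an assumption.
from src.openailib import speech_to_text, top_response

text = speech_to_text("question.wav")  # placeholder path to a recorded question
reply = top_response(
    prompt=text,
    system="You are a concise podcast co-host.",  # placeholder system prompt
    max_tokens=50,
)
print(reply)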
 
src/src/__pycache__/elevenlabs.cpython-310.pyc DELETED
Binary file (4.12 kB)
 
src/src/__pycache__/elevenlabs.cpython-39.pyc DELETED
Binary file (4.11 kB)
 
src/src/__pycache__/openailib.cpython-310.pyc DELETED
Binary file (1.23 kB)
 
src/src/__pycache__/openailib.cpython-39.pyc DELETED
Binary file (1.23 kB)
 
src/src/__pycache__/tube.cpython-310.pyc DELETED
Binary file (1.82 kB)
 
src/src/__pycache__/tube.cpython-39.pyc DELETED
Binary file (1.81 kB)
 
src/src/__pycache__/utils.cpython-310.pyc DELETED
Binary file (639 Bytes)
 
src/src/__pycache__/utils.cpython-39.pyc DELETED
Binary file (637 Bytes)
 
src/src/elevenlabs.py DELETED
@@ -1,127 +0,0 @@
-import asyncio
-import io
-import logging
-import os
-import time
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from typing import List, Union, Tuple
-
-import sounddevice as sd
-import soundfile as sf
-from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
-
-from .utils import timeit
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-
-try:
-    USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
-except KeyError as e:
-    log.warning("ELEVENLABS_API_KEY not found in environment variables.")
-    pass
-
-
-@dataclass
-class Speaker:
-    name: str
-    voice: ElevenLabsVoice
-    color: str
-    description: str = None
-
-
-async def text_to_speechbytes_async(text, speaker, loop):
-    with ThreadPoolExecutor() as executor:
-        speech_bytes = await loop.run_in_executor(executor, text_to_speechbytes, text, speaker.voice)
-    return speech_bytes
-
-
-async def play_history(history: List[Tuple[Speaker, str]]):
-    loop = asyncio.get_event_loop()
-
-    # Create a list of tasks for all text_to_speechbytes function calls
-    tasks = [text_to_speechbytes_async(
-        text, speaker, loop) for speaker, text in history]
-
-    # Run tasks concurrently, waiting for the first one to complete
-    for speech_bytes in await asyncio.gather(*tasks):
-        audioFile = io.BytesIO(speech_bytes)
-        soundFile = sf.SoundFile(audioFile)
-        sd.play(soundFile.read(), samplerate=soundFile.samplerate, blocking=True)
-
-
-async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
-    loop = asyncio.get_event_loop()

-    # Create a list of tasks for all text_to_speechbytes function calls
-    tasks = [text_to_speechbytes_async(
-        text, speaker, loop) for speaker, text in history]
-
-    # Run tasks concurrently, waiting for the first one to complete
-    all_speech_bytes = await asyncio.gather(*tasks)
-
-    # Combine all audio bytes into a single audio file
-    concatenated_audio = io.BytesIO(b''.join(all_speech_bytes))
-
-    # Save the combined audio file to disk
-    with sf.SoundFile(concatenated_audio, mode='r') as soundFile:
-        with sf.SoundFile(
-            audio_savepath, mode='w',
-            samplerate=soundFile.samplerate,
-            channels=soundFile.channels,
-        ) as outputFile:
-            outputFile.write(soundFile.read())
-
-
-def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
-    if USER is None:
-        log.warning(
-            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
-        return None
-    log.info(f"Getting voice {voice}...")
-    _available_voices = USER.get_voices_by_name(voice)
-    if _available_voices:
-        log.info(f"Voice {voice} already exists, found {_available_voices}.")
-        return _available_voices[0]
-    return None
-
-
-@timeit
-def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> ElevenLabsVoice:
-    if USER is None:
-        log.warning(
-            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
-        return None
-    _voice = check_voice_exists(voice)
-    if _voice is not None:
-        return _voice
-    else:
-        if USER.get_voice_clone_available():
-            assert audio_path is not None, "audio_path must be provided"
-            assert isinstance(audio_path, list), "audio_path must be a list"
-            log.info(f"Cloning voice {voice}...")
-            _audio_source_dict = {
-                # Audio path is a PosixPath
-                _.name: open(_, "rb").read() for _ in audio_path
-            }
-            newVoice = USER.clone_voice_bytes(voice, _audio_source_dict)
-            return newVoice
-        raise ValueError(
-            f"Voice {voice} does not exist and cloning is not available.")
-
-
-@timeit
-def text_to_speech(text: str, voice: ElevenLabsVoice):
-    log.info(f"Generating audio using voice {voice}...")
-    time_start = time.time()
-    voice.generate_and_play_audio(text, playInBackground=False)
-    duration = time.time() - time_start
-    return duration
-
-
-@timeit
-def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
-    log.info(f"Generating audio for voice {voice} text {text}...")
-    audio_bytes = voice.generate_audio_bytes(text)
-    return audio_bytes
 
src/src/openailib.py DELETED
@@ -1,52 +0,0 @@
-import logging
-import os
-
-from .utils import timeit
-
-import openai
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-
-try:
-    openai.api_key = os.getenv("OPENAI_API_KEY")
-except KeyError as e:
-    log.warning("OPENAI_API_KEY not found in environment variables.")
-    pass
-
-
-@timeit
-def speech_to_text(audio_path):
-    log.info("Transcribing audio...")
-    transcript = openai.Audio.transcribe("whisper-1", open(audio_path, "rb"))
-    text = transcript["text"]
-    log.info(f"Transcript: \n\t{text}")
-    return text
-
-
-@timeit
-def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
-    _prompt = [
-        {
-            "role": "user",
-            "content": prompt,
-        },
-    ]
-    if system:
-        _prompt = [
-            {
-                "role": "system",
-                "content": system,
-            },
-        ] + _prompt
-    log.info(f"API call to {model} with prompt: \n\n\t{_prompt}\n\n")
-    _response = openai.ChatCompletion.create(
-        model=model,
-        messages=_prompt,
-        temperature=temperature,
-        n=1,
-        max_tokens=max_tokens,
-    )
-    log.info(f"API reponse: \n\t{_response}")
-    response: str = _response['choices'][0]['message']['content']
-    return response
 
src/src/tube.py DELETED
@@ -1,64 +0,0 @@
-'''
-Extract audio from a YouTube video
-
-Usage:
-    tube.py <url> <person> [-s <start_time>] [-d <duration>]
-'''
-
-import subprocess
-from pathlib import Path
-import datetime
-import argparse
-import os
-from pytube import YouTube
-
-# Define argparse arguments
-parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
-parser.add_argument('url', type=str, help='the YouTube video URL')
-parser.add_argument('person', type=str, help='the name of the person speaking')
-parser.add_argument('-s', '--start-time', type=float, default=0, help='the start time in minutes for the extracted audio (default: 0)')
-parser.add_argument('-d', '--duration', type=int, help='the duration in seconds for the extracted audio (default: 60)')
-
-
-# 200 seconds seems to be max duration for single clips
-def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
-
-    # Download the YouTube video
-    youtube_object = YouTube(url)
-    stream = youtube_object.streams.first()
-    video_path = Path(stream.download(skip_existing=True))
-
-    # Convert start time to seconds
-    start_time_seconds = int(start_minute * 60)
-
-    # Format the start time in HH:MM:SS.mmm format
-    start_time_formatted = str(datetime.timedelta(seconds=start_time_seconds))
-    start_time_formatted = start_time_formatted[:11] + start_time_formatted[12:]
-
-    # Set the output path using the audio file name
-    output_path = video_path.parent / f"{label}.wav"
-
-    # Run ffmpeg to extract the audio
-    cmd = ['ffmpeg', '-y', '-i', str(video_path), '-ss', start_time_formatted]
-    if duration is not None:
-        # Format the duration in HH:MM:SS.mmm format
-        duration_formatted = str(datetime.timedelta(seconds=duration))
-        duration_formatted = duration_formatted[:11] + duration_formatted[12:]
-        cmd += ['-t', duration_formatted]
-    cmd += ['-q:a', '0', '-map', 'a', str(output_path)]
-    subprocess.run(cmd)
-
-    # remove the extra .3gpp file that is created:
-    for file in os.listdir(video_path.parent):
-        if file.endswith(".3gpp"):
-            os.remove(os.path.join(video_path.parent, file))
-
-    return output_path
-
-if __name__ == '__main__':
-
-    # Parse the arguments
-    args = parser.parse_args()
-
-    # Extract the audio
-    extract_audio(args.url, args.person, args.start_time, args.duration)
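Besides the command-line entry point shown in its docstring, the removed extract_audio function could also be called directly. A minimal sketch under these assumptions: the module is importable at the path shown (an assumption), ffmpeg is on PATH, pytube is installed, and the URL and speaker label are placeholders.

# Hypothetical programmatic use of the script deleted above; the import path is an assumption.
from src.src.tube import extract_audio

# Pull a clip starting 1.5 minutes in, lasting 60 seconds, saved next to the download as guest.wav.
wav_path = extract_audio(
    url="https://www.youtube.com/watch?v=PLACEHOLDER",  # placeholder video URL
    label="guest",
    start_minute=1.5,
    duration=60,
)
print(wav_path)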
 
src/src/utils.py DELETED
@@ -1,16 +0,0 @@
-import time
-import logging
-
-log = logging.getLogger(__name__)
-
-# Decorator to time a function
-def timeit(func):
-    def timed(*args, **kwargs):
-        time_start = time.time()
-        result = func(*args, **kwargs)
-        _yellow = "\x1b[33;20m"
-        _reset = "\x1b[0m"
-        _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
-        log.info(_msg)
-        return result
-    return timed
 
src/tube.py DELETED
@@ -1,64 +0,0 @@
-'''
-Extract audio from a YouTube video
-
-Usage:
-    tube.py <url> <person> [-s <start_time>] [-d <duration>]
-'''
-
-import subprocess
-from pathlib import Path
-import datetime
-import argparse
-import os
-from pytube import YouTube
-
-# Define argparse arguments
-parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
-parser.add_argument('url', type=str, help='the YouTube video URL')
-parser.add_argument('person', type=str, help='the name of the person speaking')
-parser.add_argument('-s', '--start-time', type=float, default=0, help='the start time in minutes for the extracted audio (default: 0)')
-parser.add_argument('-d', '--duration', type=int, help='the duration in seconds for the extracted audio (default: 60)')
-
-
-# 200 seconds seems to be max duration for single clips
-def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
-
-    # Download the YouTube video
-    youtube_object = YouTube(url)
-    stream = youtube_object.streams.first()
-    video_path = Path(stream.download(skip_existing=True))
-
-    # Convert start time to seconds
-    start_time_seconds = int(start_minute * 60)
-
-    # Format the start time in HH:MM:SS.mmm format
-    start_time_formatted = str(datetime.timedelta(seconds=start_time_seconds))
-    start_time_formatted = start_time_formatted[:11] + start_time_formatted[12:]
-
-    # Set the output path using the audio file name
-    output_path = video_path.parent / f"{label}.wav"
-
-    # Run ffmpeg to extract the audio
-    cmd = ['ffmpeg', '-y', '-i', str(video_path), '-ss', start_time_formatted]
-    if duration is not None:
-        # Format the duration in HH:MM:SS.mmm format
-        duration_formatted = str(datetime.timedelta(seconds=duration))
-        duration_formatted = duration_formatted[:11] + duration_formatted[12:]
-        cmd += ['-t', duration_formatted]
-    cmd += ['-q:a', '0', '-map', 'a', str(output_path)]
-    subprocess.run(cmd)
-
-    # remove the extra .3gpp file that is created:
-    for file in os.listdir(video_path.parent):
-        if file.endswith(".3gpp"):
-            os.remove(os.path.join(video_path.parent, file))
-
-    return output_path
-
-if __name__ == '__main__':
-
-    # Parse the arguments
-    args = parser.parse_args()
-
-    # Extract the audio
-    extract_audio(args.url, args.person, args.start_time, args.duration)
 
src/utils.py DELETED
@@ -1,16 +0,0 @@
-import time
-import logging
-
-log = logging.getLogger(__name__)
-
-# Decorator to time a function
-def timeit(func):
-    def timed(*args, **kwargs):
-        time_start = time.time()
-        result = func(*args, **kwargs)
-        _yellow = "\x1b[33;20m"
-        _reset = "\x1b[0m"
-        _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
-        log.info(_msg)
-        return result
-    return timed