Spaces · Runtime error
nahue-passano committed · Commit 7405904 · 1 Parent(s): 6125290

update: new release

Browse files:
- app.py +98 -74
- pyproject.toml +3 -0
- requirements.txt +4 -1
- utils/audio.py +96 -0
- utils/files.py +71 -0
- utils/text.py +142 -0
app.py
CHANGED
@@ -1,97 +1,121 @@
-from io import StringIO
-import os
-import tempfile
 import streamlit as st
-import json
 import whisper_timestamped as whisper
 import pandas as pd
 
+from utils.files import (
+    create_temp_directory,
+    save_temp_file,
+    compress_utterances_folder,
+)
+from utils.text import get_sentence_data, get_word_data, generate_transcriptions_splits
+from utils.audio import generate_audio_splits
+
 STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
 LANGUAGES = {"English": "en", "Spanish": "es"}
 MODEL_SIZES = {"Medium": "medium", "Large": "large"}
 
 
-def save_temp_file(file):
-    temp_dir = tempfile.gettempdir()
-    temp_file_path = os.path.join(temp_dir, file.name)
-    with open(temp_file_path, "wb") as temp_file:
-        temp_file.write(file.getvalue())
-    return temp_file_path
-
-
 @st.cache_resource(show_spinner=False)
 def load_model(model_size: str):
+    """Loads the Whisper model with size model_size
+
+    Parameters
+    ----------
+    model_size : str
+        Available size of the whisper model
+
+    Returns
+    -------
+    _type_
+        Whisper model
+    """
     return whisper.load_model(
         MODEL_SIZES[model_size], device="cpu", download_root="models"
     )
 
 
-def get_sentence_data(filename: str, timestamp_dict: dict):
-    sentence_df = pd.DataFrame(
-        columns=["Audio file", "Sentence", "Start", "End", "Duration"]
-    )
-    for sentence_i in timestamp_dict["segments"]:
-        sentence_i = pd.DataFrame(
-            {
-                "Audio file": [filename],
-                "Sentence": [str(sentence_i["text"])],
-                "Start": [sentence_i["start"]],
-                "End": [sentence_i["end"]],
-                "Duration": [sentence_i["end"] - sentence_i["start"]],
-            }
-        )
-        sentence_df = pd.concat([sentence_df, sentence_i], ignore_index=True)
-    return sentence_df
-
-
-def get_word_data(filename: str, timestamp_dict: dict):
-    pass
-
-
-def get_word_data():
-    pass
 
-…
-audio_file = st.file_uploader(
-    "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
-)
-
-stamp_type, lang, size = st.columns(3)
-
-with stamp_type:
-    timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
-
-with lang:
-    language = st.selectbox("Language", options=list(LANGUAGES.keys()))
-
-with size:
-    model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
-
-…
-    tmp_audio = save_temp_file(audio_i)
-    tmp_audio_file = whisper.load_audio(tmp_audio)
-    timestamp_result = whisper.transcribe(
-        model, tmp_audio_file, language=LANGUAGES[language]
-    )
-    audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
-    sentences_df = pd.concat([sentences_df, audio_i_df], ignore_index=True)
-
-st.dataframe(sentences_df)
-
-st.download_button(
-    …
-    sentences_df.to_csv(index=False),
-    file_name="timestamps.csv",
-    mime="text/csv",
-    use_container_width=True,
-)
+def main_app():
+    st.title("🗣️💬 LibriSpeech Corpus Generator")
+    st.divider()
+
+    # Audio load
+    audio_file = st.file_uploader(
+        "Load audio files to process", type=["wav", "mp3"], accept_multiple_files=True
+    )
+    st.divider()
+    stamp_type, lang, size = st.columns(3)
+
+    with stamp_type:
+        timestamp_type = st.selectbox(
+            "Division level", options=list(STAMP_TYPES.keys())
+        )
+
+    with lang:
+        language = st.selectbox("Language", options=list(LANGUAGES.keys()))
+
+    with size:
+        model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
+    st.divider()
+
+    if st.button("Process audios", use_container_width=True):
+        with st.spinner("Loading model..."):
+            model = load_model(model_size)
+
+        timestamps_df = pd.DataFrame()
+        temp_dir = create_temp_directory()
+        utterances_folder = temp_dir / "utterances_segments"
+        utterances_folder.mkdir(exist_ok=True)
+        for audio_i in audio_file:
+            with st.spinner(f"Processing audio: {audio_i.name}"):
+                tmp_audio = save_temp_file(audio_i)
+
+                # Whisper inference
+                tmp_audio_file = whisper.load_audio(tmp_audio)
+                timestamp_result = whisper.transcribe(
+                    model, tmp_audio_file, language=LANGUAGES[language]
+                )
+
+                # Stamp level
+                if timestamp_type == "Sentence-level":
+                    audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
+
+                if timestamp_type == "Word-level":
+                    audio_i_df = get_word_data(audio_i.name, timestamp_result)
+
+                # Timestamps in dataframe
+                timestamps_df = pd.concat(
+                    [timestamps_df, audio_i_df], ignore_index=True
+                )
+
+                generate_audio_splits(tmp_audio, audio_i_df, utterances_folder)
+                generate_transcriptions_splits(tmp_audio, audio_i_df, utterances_folder)
+        st.divider()
+        st.markdown(
+            "<h3 style='text-align: center;'>Timestamps</h3>",
+            unsafe_allow_html=True,
+        )
+        st.dataframe(timestamps_df)
+        st.divider()
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.download_button(
+                "Download timestamps in .csv",
+                timestamps_df.to_csv(index=False),
+                file_name="timestamps.csv",
+                mime="text/csv",
+                use_container_width=True,
+            )
+
+        with col2:
+            st.download_button(
+                "Download LibriSpeech-like dataset",
+                data=compress_utterances_folder(utterances_folder),
+                file_name="librispeech-like-dataset.zip",
+                mime="application/zip",
+                use_container_width=True,
+            )
+
+
+if __name__ == "__main__":
+    main_app()
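The new `main_app` flow (upload, transcribe, split, download) can also be exercised outside the Streamlit UI. A minimal sketch, assuming the repo root is the working directory so `utils` is importable; the file name `sample.wav` and the output folder are illustrative only, not part of the commit:

```python
# Sketch of the release's processing pipeline without the Streamlit front end.
# Assumes a local sample.wav; folder name below is hypothetical.
from pathlib import Path

import whisper_timestamped as whisper

from utils.audio import generate_audio_splits
from utils.text import get_sentence_data, generate_transcriptions_splits

audio_path = Path("sample.wav")  # hypothetical input file
model = whisper.load_model("medium", device="cpu")

audio = whisper.load_audio(str(audio_path))
result = whisper.transcribe(model, audio, language="en")

# Sentence-level timestamps keyed by the audio file name
timestamps_df = get_sentence_data(audio_path.name, result)

destination = Path("utterances_segments")
destination.mkdir(exist_ok=True)
generate_audio_splits(audio_path, timestamps_df, destination)
generate_transcriptions_splits(audio_path, timestamps_df, destination)
```

Inside the app, the same calls run once per uploaded file, with `save_temp_file` staging each upload under `.temp` first.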
pyproject.toml
CHANGED
@@ -14,6 +14,9 @@ openai-whisper = "*"
 torch = "1.13"
 matplotlib = "^3.7.1"
 streamlit = "^1.24.0"
+sounddevice = "^0.4.6"
+soundfile = "^0.12.1"
+pydub = "^0.25.1"
 
 
 [build-system]
requirements.txt
CHANGED
@@ -2,4 +2,7 @@ Cython
 dtw-python
 openai-whisper
 torch==1.13
-streamlit==1.24
+streamlit==1.24
+pandas
+numpy
+soundfile
utils/audio.py
ADDED
@@ -0,0 +1,96 @@
+from typing import Tuple, List
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+import pandas as pd
+
+from utils.text import filter_dataframe_by_audiofile
+
+
+def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
+    """Loads an audio given its path
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path of the audio file
+
+    Returns
+    -------
+    Tuple[np.ndarray, float]
+        Audio array and sample rate
+    """
+    audio_array, sample_rate = sf.read(str(audio_path))
+    return audio_array, sample_rate
+
+
+def split_audio(
+    audio_array: np.ndarray, sample_rate: float, timestamp_list: list
+) -> List[np.ndarray]:
+    """Slices audio_array with the timestamps in timestamp_list
+
+    Parameters
+    ----------
+    audio_array : np.ndarray
+        Array of the audio to be split
+    sample_rate : float
+        Audio sample rate
+    timestamp_list : list
+        List of tuples containing the start and end of each stamp.
+
+    Returns
+    -------
+    List[np.ndarray]
+        List of numpy arrays with audio splits
+    """
+    audio_segments = []
+    for timestamp_i in timestamp_list:
+        start_sample = round(timestamp_i[0] * sample_rate)
+        end_sample = round(timestamp_i[1] * sample_rate)
+        audio_segments.append(audio_array[start_sample:end_sample])
+
+    return audio_segments
+
+
+def save_audio_segments(
+    destination: Path,
+    audio_path: Path,
+    audio_segments: List[np.ndarray],
+    sample_rate: float,
+) -> None:
+    """Saves audio segments from audio_segments in the destination path.
+
+    Parameters
+    ----------
+    destination : Path
+        Path where segments will be saved
+    audio_path : Path
+        Path of the original audio file
+    audio_segments : List[np.ndarray]
+        List containing numpy arrays with the audio segments
+    sample_rate : float
+        Sample rate of the original audio file
+    """
+    for i, segment in enumerate(audio_segments):
+        segment_path = destination / f"{audio_path.stem}-{i}.wav"
+        sf.write(str(segment_path), segment, sample_rate)
+
+
+def generate_audio_splits(
+    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
+) -> None:
+    """Splits an audio given its path and timestamps
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path of the audio
+    timestamps_df : pd.DataFrame
+        DataFrame containing start and end of the utterances
+    destination : Path
+        Path where segments will be saved.
+    """
+    audio_array, sample_rate = load_audio(audio_path)
+    timestamp_list = filter_dataframe_by_audiofile(timestamps_df, audio_path.name)
+    audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
+    save_audio_segments(destination, audio_path, audio_segments, sample_rate)
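Since `split_audio` converts second-based timestamps to sample indices by rounding, its slicing can be sanity-checked on a synthetic signal. A quick sketch; the tone and the two utterance windows are made up for illustration:

```python
# Hypothetical check of split_audio on a synthetic 3 s sine tone.
import numpy as np

from utils.audio import split_audio

sample_rate = 16000
t = np.arange(3 * sample_rate) / sample_rate
tone = np.sin(2 * np.pi * 440 * t)

# Two fake utterances: 0.5-1.0 s and 1.5-2.75 s
segments = split_audio(tone, sample_rate, [(0.5, 1.0), (1.5, 2.75)])
print([len(s) / sample_rate for s in segments])  # -> [0.5, 1.25]
```

Rounding keeps each slice boundary within one sample of the requested time, which is adequate for utterance-level cuts.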
utils/files.py
ADDED
@@ -0,0 +1,71 @@
+from pathlib import Path
+import zipfile
+import shutil
+import io
+import streamlit as st
+
+
+def save_temp_file(file: st.runtime.uploaded_file_manager.UploadedFile) -> Path:
+    """Saves a Streamlit uploaded file in a temporary directory
+
+    Parameters
+    ----------
+    file : st.runtime.uploaded_file_manager.UploadedFile
+        File returned by st.file_uploader
+
+    Returns
+    -------
+    Path
+        Path where the file is temporarily saved
+    """
+    temp_dir = Path(".temp")
+    temp_file_path = temp_dir.joinpath(file.name)
+    with open(str(temp_file_path), "wb") as temp_file:
+        temp_file.write(file.getvalue())
+    return temp_file_path
+
+
+def create_temp_directory(dir_name: str = ".temp") -> Path:
+    """Create a temporary directory.
+
+    Parameters
+    ----------
+    dir_name : str, optional
+        Name of the temporary directory, by default ".temp"
+
+    Returns
+    -------
+    Path
+        Path object representing the created temporary directory.
+    """
+    temp_dir = Path(dir_name)
+    temp_dir.mkdir(exist_ok=True)
+    return temp_dir
+
+
+def clean_temp_directory() -> None:
+    """Cleans the .temp directory"""
+    shutil.rmtree(Path(".temp"))
+
+
+def compress_utterances_folder(utterances_folder: Path) -> io.BytesIO:
+    """Compresses the contents of utterances_folder into a zip file.
+
+    Parameters
+    ----------
+    utterances_folder : Path
+        Path to the folder containing utterances.
+
+    Returns
+    -------
+    io.BytesIO
+        A BytesIO object representing the compressed zip file.
+    """
+    memory_file = io.BytesIO()
+    with zipfile.ZipFile(memory_file, "w") as zip_file:
+        for file_i in utterances_folder.iterdir():
+            zip_file.write(str(file_i), arcname=file_i.name)
+
+    memory_file.seek(0)
+    clean_temp_directory()
+    return memory_file
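A hedged round-trip sketch of the staging helpers; the folder and file names below are illustrative. Note that `compress_utterances_folder` deletes the whole `.temp` staging directory as a side effect, so it must be the last step:

```python
# Hypothetical round trip: stage files under .temp, zip them in memory.
from utils.files import create_temp_directory, compress_utterances_folder

temp_dir = create_temp_directory()  # creates ./.temp
utterances = temp_dir / "utterances_segments"
utterances.mkdir(exist_ok=True)
(utterances / "sample-0.txt").write_text("hello world")

zip_buffer = compress_utterances_folder(utterances)  # also removes .temp
with open("librispeech-like-dataset.zip", "wb") as f:
    f.write(zip_buffer.read())
```

Because the zip is built in memory, nothing has to be written back to disk before `st.download_button` streams it to the user.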
utils/text.py
ADDED
@@ -0,0 +1,142 @@
+from typing import List
+from pathlib import Path
+import pandas as pd
+
+
+def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
+    """Extracts the sentences from the output dictionary of whisper inference
+
+    Parameters
+    ----------
+    filename : str
+        Name of the audio analyzed
+    timestamp_dict : dict
+        Output dictionary from whisper inference
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing audio filename, start, end and duration of sentences
+        with their transcriptions.
+    """
+    sentence_df = pd.DataFrame(
+        columns=["Audio file", "Sentence", "Start", "End", "Duration"]
+    )
+    for sentence_i in timestamp_dict["segments"]:
+        sentence_i = pd.DataFrame(
+            {
+                "Audio file": [filename],
+                "Sentence": [str(sentence_i["text"])],
+                "Start": [sentence_i["start"]],
+                "End": [sentence_i["end"]],
+                "Duration": [sentence_i["end"] - sentence_i["start"]],
+            }
+        )
+        sentence_df = pd.concat([sentence_df, sentence_i], ignore_index=True)
+    return sentence_df
+
+
+def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
+    """Extracts the words from the output dictionary of whisper inference
+
+    Parameters
+    ----------
+    filename : str
+        Name of the audio analyzed
+    timestamp_dict : dict
+        Output dictionary from whisper inference
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing audio filename, start, end and duration of words
+        with their transcriptions.
+    """
+    word_df = pd.DataFrame(columns=["Audio file", "Word", "Start", "End", "Duration"])
+    for sentence_i in timestamp_dict["segments"]:
+        for word_i in sentence_i["words"]:
+            word_i_df = pd.DataFrame(
+                {
+                    "Audio file": [filename],
+                    "Word": [str(word_i["text"])],
+                    "Start": [word_i["start"]],
+                    "End": [word_i["end"]],
+                    "Duration": [word_i["end"] - word_i["start"]],
+                }
+            )
+            word_df = pd.concat([word_df, word_i_df], ignore_index=True)
+    return word_df
+
+
+def filter_dataframe_by_audiofile(timestamps_df: pd.DataFrame, audio_file: str) -> List:
+    """Generates a list from timestamps_df with the timestamps belonging to audio_file
+
+    Parameters
+    ----------
+    timestamps_df : pd.DataFrame
+        Dataframe containing timestamps
+    audio_file : str
+        Name of the audio file.
+
+    Returns
+    -------
+    List
+        List of tuples containing the start and end of each stamp.
+        E.g.: [(start_1, end_1), ..., (start_n, end_n)]
+    """
+    audio_df = timestamps_df[timestamps_df["Audio file"] == audio_file]
+    return list(zip(audio_df["Start"], audio_df["End"]))
+
+
+def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
+    """Returns the transcription column
+
+    Parameters
+    ----------
+    timestamps_df : pd.DataFrame
+        DataFrame with transcriptions
+
+    Returns
+    -------
+    List[str]
+        List of the transcriptions
+    """
+    return timestamps_df.iloc[:, 1].tolist()
+
+
+def save_transcriptions_segments(
+    audio_path: Path, transcriptions_list: List[str], destination: Path
+) -> None:
+    """Save transcription segments to text files.
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path to the audio file.
+    transcriptions_list : List[str]
+        List of transcriptions.
+    destination : Path
+        Destination path for the text files.
+    """
+    for i, transcription_i in enumerate(transcriptions_list):
+        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
+        with open(str(transcription_i_path), "w") as file:
+            file.write(transcription_i)
+
+
+def generate_transcriptions_splits(
+    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
+) -> None:
+    """Generate and save transcription splits based on timestamps.
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path to the audio file.
+    timestamps_df : pd.DataFrame
+        DataFrame containing timestamps.
+    destination : Path
+        Destination path for the text files.
+    """
+    transcriptions_list = get_utterances_transcriptions(timestamps_df)
+    save_transcriptions_segments(audio_path, transcriptions_list, destination)
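The extractors only rely on the `segments` structure of the whisper output, so they can be sanity-checked with a hand-written dict. A sketch with made-up values:

```python
# Hand-written stand-in for whisper_timestamped output, for illustration only.
from utils.text import get_sentence_data, get_word_data, filter_dataframe_by_audiofile

fake_result = {
    "segments": [
        {
            "text": "hello world",
            "start": 0.0,
            "end": 1.2,
            "words": [
                {"text": "hello", "start": 0.0, "end": 0.5},
                {"text": "world", "start": 0.6, "end": 1.2},
            ],
        }
    ]
}

sentences = get_sentence_data("sample.wav", fake_result)
words = get_word_data("sample.wav", fake_result)
print(filter_dataframe_by_audiofile(sentences, "sample.wav"))  # -> [(0.0, 1.2)]
```

Both extractors and `filter_dataframe_by_audiofile` key rows on the `Audio file` column, which is why `main_app` passes `audio_i.name` through consistently.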