Spaces · Runtime error
nahue-passano committed · Commit 7405904 · 1 Parent(s): 6125290

update: new release

Browse files:
- app.py +98 -74
- pyproject.toml +3 -0
- requirements.txt +4 -1
- utils/audio.py +96 -0
- utils/files.py +71 -0
- utils/text.py +142 -0
app.py
CHANGED
@@ -1,97 +1,121 @@
-from io import StringIO
-import os
-import tempfile
 import streamlit as st
-import json
 import whisper_timestamped as whisper
 import pandas as pd
 
+from utils.files import (
+    create_temp_directory,
+    save_temp_file,
+    compress_utterances_folder,
+)
+from utils.text import get_sentence_data, get_word_data, generate_transcriptions_splits
+from utils.audio import generate_audio_splits
+
 STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
 LANGUAGES = {"English": "en", "Spanish": "es"}
 MODEL_SIZES = {"Medium": "medium", "Large": "large"}
 
 
-def save_temp_file(file):
-    temp_dir = tempfile.gettempdir()
-    temp_file_path = os.path.join(temp_dir, file.name)
-    with open(temp_file_path, "wb") as temp_file:
-        temp_file.write(file.getvalue())
-    return temp_file_path
-
-
 @st.cache_resource(show_spinner=False)
 def load_model(model_size: str):
+    """Loads the Whisper model with size model_size
+
+    Parameters
+    ----------
+    model_size : str
+        Available size of the whisper model
+
+    Returns
+    -------
+    _type_
+        Whisper model
+    """
     return whisper.load_model(
         MODEL_SIZES[model_size], device="cpu", download_root="models"
     )
 
 
-def get_sentence_data(filename: str, timestamp_dict: dict):
-    sentence_df = pd.DataFrame(
-        columns=["Audio file", "Sentence", "Start", "End", "Duration"]
-    )
-    for sentence_i in timestamp_dict["segments"]:
-        sentence_i = pd.DataFrame(
-            {
-                "Audio file": [filename],
-                "Sentence": [str(sentence_i["text"])],
-                "Start": [sentence_i["start"]],
-                "End": [sentence_i["end"]],
-                "Duration": [sentence_i["end"] - sentence_i["start"]],
-            }
-        )
-        sentence_df = pd.concat([sentence_df, sentence_i], ignore_index=True)
-    return sentence_df
-
-
-def get_word_data(filename: str, timestamp_dict: dict):
-    pass
-
-
-def get_word_data():
-    pass
 
-…
-audio_file = st.file_uploader(
-    "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
-)
-
-stamp_type, lang, size = st.columns(3)
-
-with stamp_type:
-    timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
-
-with lang:
-    language = st.selectbox("Language", options=list(LANGUAGES.keys()))
-
-with size:
-    model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
-
-…
-    tmp_audio = save_temp_file(audio_i)
-    tmp_audio_file = whisper.load_audio(tmp_audio)
-    timestamp_result = whisper.transcribe(
-        model, tmp_audio_file, language=LANGUAGES[language]
-    )
-    audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
-    sentences_df = pd.concat([sentences_df, audio_i_df], ignore_index=True)
-
-st.dataframe(sentences_df)
-
-st.download_button(
-    …
-    sentences_df.to_csv(index=False),
-    file_name="timestamps.csv",
-    mime="text/csv",
-    use_container_width=True,
-)
+def main_app():
+    st.title("🗣️💬 LibriSpeech Corpus Generator")
+    st.divider()
+
+    # Audio load
+    audio_file = st.file_uploader(
+        "Load audio files to process", type=["wav", "mp3"], accept_multiple_files=True
+    )
+    st.divider()
+    stamp_type, lang, size = st.columns(3)
+
+    with stamp_type:
+        timestamp_type = st.selectbox(
+            "Division level", options=list(STAMP_TYPES.keys())
+        )
+
+    with lang:
+        language = st.selectbox("Language", options=list(LANGUAGES.keys()))
+
+    with size:
+        model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
+    st.divider()
+
+    if st.button("Process audios", use_container_width=True):
+        with st.spinner("Loading model..."):
+            model = load_model(model_size)
+
+        timestamps_df = pd.DataFrame()
+        temp_dir = create_temp_directory()
+        utterances_folder = temp_dir / "utterances_segments"
+        utterances_folder.mkdir(exist_ok=True)
+        for audio_i in audio_file:
+            with st.spinner(f"Processing audio: {audio_i.name}"):
+                tmp_audio = save_temp_file(audio_i)
+
+                # Whisper inference
+                tmp_audio_file = whisper.load_audio(tmp_audio)
+                timestamp_result = whisper.transcribe(
+                    model, tmp_audio_file, language=LANGUAGES[language]
+                )
+
+                # Stamp level
+                if timestamp_type == "Sentence-level":
+                    audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
+
+                if timestamp_type == "Word-level":
+                    audio_i_df = get_word_data(audio_i.name, timestamp_result)
+
+                # Timestamps in dataframe
+                timestamps_df = pd.concat(
+                    [timestamps_df, audio_i_df], ignore_index=True
+                )
+
+                generate_audio_splits(tmp_audio, audio_i_df, utterances_folder)
+                generate_transcriptions_splits(tmp_audio, audio_i_df, utterances_folder)
+        st.divider()
+        st.markdown(
+            "<h3 style='text-align: center;'>Timestamps</h3>",
+            unsafe_allow_html=True,
+        )
+        st.dataframe(timestamps_df)
+        st.divider()
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.download_button(
+                "Download timestamps in .csv",
+                timestamps_df.to_csv(index=False),
+                file_name="timestamps.csv",
+                mime="text/csv",
+                use_container_width=True,
+            )
+
+        with col2:
+            st.download_button(
+                "Download LibriSpeech-like dataset",
+                data=compress_utterances_folder(utterances_folder),
+                file_name="librispeech-like-dataset.zip",
+                mime="application/zip",
+                use_container_width=True,
+            )
+
+
+if __name__ == "__main__":
+    main_app()
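The new `main_app` flow (upload, transcribe, split, download) can also be exercised outside the Streamlit UI. A minimal sketch, assuming the repo root is the working directory so `utils` is importable; the file name `sample.wav` and the output folder are illustrative only, not part of the commit:

```python
# Sketch of the release's processing pipeline without the Streamlit front end.
# Assumes a local sample.wav; folder name below is hypothetical.
from pathlib import Path

import whisper_timestamped as whisper

from utils.audio import generate_audio_splits
from utils.text import get_sentence_data, generate_transcriptions_splits

audio_path = Path("sample.wav")  # hypothetical input file
model = whisper.load_model("medium", device="cpu")

audio = whisper.load_audio(str(audio_path))
result = whisper.transcribe(model, audio, language="en")

# Sentence-level timestamps keyed by the audio file name
timestamps_df = get_sentence_data(audio_path.name, result)

destination = Path("utterances_segments")
destination.mkdir(exist_ok=True)
generate_audio_splits(audio_path, timestamps_df, destination)
generate_transcriptions_splits(audio_path, timestamps_df, destination)
```

Inside the app, the same calls run once per uploaded file, with `save_temp_file` staging each upload under `.temp` first.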
pyproject.toml
CHANGED
@@ -14,6 +14,9 @@ openai-whisper = "*"
 torch = "1.13"
 matplotlib = "^3.7.1"
 streamlit = "^1.24.0"
+sounddevice = "^0.4.6"
+soundfile = "^0.12.1"
+pydub = "^0.25.1"
 
 
 [build-system]
requirements.txt
CHANGED
@@ -2,4 +2,7 @@ Cython
 dtw-python
 openai-whisper
 torch==1.13
-streamlit==1.24
+streamlit==1.24
+pandas
+numpy
+soundfile
utils/audio.py
ADDED
@@ -0,0 +1,96 @@
+from typing import Tuple, List
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+import pandas as pd
+
+from utils.text import filter_dataframe_by_audiofile
+
+
+def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
+    """Loads an audio given its path
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path of the audio file
+
+    Returns
+    -------
+    Tuple[np.ndarray, float]
+        Audio array and sample rate
+    """
+    audio_array, sample_rate = sf.read(str(audio_path))
+    return audio_array, sample_rate
+
+
+def split_audio(
+    audio_array: np.ndarray, sample_rate: float, timestamp_list: list
+) -> List[np.ndarray]:
+    """Slices audio_array with the timestamps in timestamp_list
+
+    Parameters
+    ----------
+    audio_array : np.ndarray
+        Array of the audio to be split
+    sample_rate : float
+        Audio sample rate
+    timestamp_list : list
+        List of tuples containing the start and end of each stamp.
+
+    Returns
+    -------
+    List[np.ndarray]
+        List of numpy arrays with audio splits
+    """
+    audio_segments = []
+    for timestamp_i in timestamp_list:
+        start_sample = round(timestamp_i[0] * sample_rate)
+        end_sample = round(timestamp_i[1] * sample_rate)
+        audio_segments.append(audio_array[start_sample:end_sample])
+
+    return audio_segments
+
+
+def save_audio_segments(
+    destination: Path,
+    audio_path: Path,
+    audio_segments: List[np.ndarray],
+    sample_rate: float,
+) -> None:
+    """Saves audio segments from audio_segments in the destination path.
+
+    Parameters
+    ----------
+    destination : Path
+        Path where segments will be saved
+    audio_path : Path
+        Path of the original audio file
+    audio_segments : List[np.ndarray]
+        List containing numpy arrays with the audio segments
+    sample_rate : float
+        Sample rate of the original audio file
+    """
+    for i, segment in enumerate(audio_segments):
+        segment_path = destination / f"{audio_path.stem}-{i}.wav"
+        sf.write(str(segment_path), segment, sample_rate)
+
+
+def generate_audio_splits(
+    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
+) -> None:
+    """Splits an audio given its path and timestamps
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path of the audio
+    timestamps_df : pd.DataFrame
+        DataFrame containing start and end of the utterances
+    destination : Path
+        Path where segments will be saved.
+    """
+    audio_array, sample_rate = load_audio(audio_path)
+    timestamp_list = filter_dataframe_by_audiofile(timestamps_df, audio_path.name)
+    audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
+    save_audio_segments(destination, audio_path, audio_segments, sample_rate)
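Since `split_audio` converts second-based timestamps to sample indices by rounding, its slicing can be sanity-checked on a synthetic signal. A quick sketch; the tone and the two utterance windows are made up for illustration:

```python
# Hypothetical check of split_audio on a synthetic 3 s sine tone.
import numpy as np

from utils.audio import split_audio

sample_rate = 16000
t = np.arange(3 * sample_rate) / sample_rate
tone = np.sin(2 * np.pi * 440 * t)

# Two fake utterances: 0.5-1.0 s and 1.5-2.75 s
segments = split_audio(tone, sample_rate, [(0.5, 1.0), (1.5, 2.75)])
print([len(s) / sample_rate for s in segments])  # -> [0.5, 1.25]
```

Rounding keeps each slice boundary within one sample of the requested time, which is adequate for utterance-level cuts.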
utils/files.py
ADDED
@@ -0,0 +1,71 @@
+from pathlib import Path
+import zipfile
+import shutil
+import io
+import streamlit as st
+
+
+def save_temp_file(file: st.runtime.uploaded_file_manager.UploadedFile) -> Path:
+    """Saves a Streamlit uploaded file in a temporary directory
+
+    Parameters
+    ----------
+    file : st.runtime.uploaded_file_manager.UploadedFile
+        File returned by st.file_uploader
+
+    Returns
+    -------
+    Path
+        Path where the file is temporarily saved
+    """
+    temp_dir = Path(".temp")
+    temp_file_path = temp_dir.joinpath(file.name)
+    with open(str(temp_file_path), "wb") as temp_file:
+        temp_file.write(file.getvalue())
+    return temp_file_path
+
+
+def create_temp_directory(dir_name: str = ".temp") -> Path:
+    """Create a temporary directory.
+
+    Parameters
+    ----------
+    dir_name : str, optional
+        Name of the temporary directory, by default ".temp"
+
+    Returns
+    -------
+    Path
+        Path object representing the created temporary directory.
+    """
+    temp_dir = Path(dir_name)
+    temp_dir.mkdir(exist_ok=True)
+    return temp_dir
+
+
+def clean_temp_directory() -> None:
+    """Cleans the .temp directory"""
+    shutil.rmtree(Path(".temp"))
+
+
+def compress_utterances_folder(utterances_folder: Path) -> io.BytesIO:
+    """Compresses the contents of utterances_folder into a zip file.
+
+    Parameters
+    ----------
+    utterances_folder : Path
+        Path to the folder containing utterances.
+
+    Returns
+    -------
+    io.BytesIO
+        A BytesIO object representing the compressed zip file.
+    """
+    memory_file = io.BytesIO()
+    with zipfile.ZipFile(memory_file, "w") as zip_file:
+        for file_i in utterances_folder.iterdir():
+            zip_file.write(str(file_i), arcname=file_i.name)
+
+    memory_file.seek(0)
+    clean_temp_directory()
+    return memory_file
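A hedged round-trip sketch of the staging helpers; the folder and file names below are illustrative. Note that `compress_utterances_folder` deletes the whole `.temp` staging directory as a side effect, so it must be the last step:

```python
# Hypothetical round trip: stage files under .temp, zip them in memory.
from utils.files import create_temp_directory, compress_utterances_folder

temp_dir = create_temp_directory()  # creates ./.temp
utterances = temp_dir / "utterances_segments"
utterances.mkdir(exist_ok=True)
(utterances / "sample-0.txt").write_text("hello world")

zip_buffer = compress_utterances_folder(utterances)  # also removes .temp
with open("librispeech-like-dataset.zip", "wb") as f:
    f.write(zip_buffer.read())
```

Because the zip is built in memory, nothing has to be written back to disk before `st.download_button` streams it to the user.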
utils/text.py
ADDED
@@ -0,0 +1,142 @@
+from typing import List
+from pathlib import Path
+import pandas as pd
+
+
+def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
+    """Extracts the sentences from the output dictionary of whisper inference
+
+    Parameters
+    ----------
+    filename : str
+        Name of the audio analyzed
+    timestamp_dict : dict
+        Output dictionary from whisper inference
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing audio filename, start, end and duration of sentences
+        with their transcriptions.
+    """
+    sentence_df = pd.DataFrame(
+        columns=["Audio file", "Sentence", "Start", "End", "Duration"]
+    )
+    for sentence_i in timestamp_dict["segments"]:
+        sentence_i = pd.DataFrame(
+            {
+                "Audio file": [filename],
+                "Sentence": [str(sentence_i["text"])],
+                "Start": [sentence_i["start"]],
+                "End": [sentence_i["end"]],
+                "Duration": [sentence_i["end"] - sentence_i["start"]],
+            }
+        )
+        sentence_df = pd.concat([sentence_df, sentence_i], ignore_index=True)
+    return sentence_df
+
+
+def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
+    """Extracts the words from the output dictionary of whisper inference
+
+    Parameters
+    ----------
+    filename : str
+        Name of the audio analyzed
+    timestamp_dict : dict
+        Output dictionary from whisper inference
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing audio filename, start, end and duration of words
+        with their transcriptions.
+    """
+    word_df = pd.DataFrame(columns=["Audio file", "Word", "Start", "End", "Duration"])
+    for sentence_i in timestamp_dict["segments"]:
+        for word_i in sentence_i["words"]:
+            word_i_df = pd.DataFrame(
+                {
+                    "Audio file": [filename],
+                    "Word": [str(word_i["text"])],
+                    "Start": [word_i["start"]],
+                    "End": [word_i["end"]],
+                    "Duration": [word_i["end"] - word_i["start"]],
+                }
+            )
+            word_df = pd.concat([word_df, word_i_df], ignore_index=True)
+    return word_df
+
+
+def filter_dataframe_by_audiofile(timestamps_df: pd.DataFrame, audio_file: str) -> List:
+    """Generates a list from timestamps_df with the timestamps belonging to audio_file
+
+    Parameters
+    ----------
+    timestamps_df : pd.DataFrame
+        Dataframe containing timestamps
+    audio_file : str
+        Name of the audio file.
+
+    Returns
+    -------
+    List
+        List of tuples containing the start and end of each stamp.
+        E.g.: [(start_1, end_1), ..., (start_n, end_n)]
+    """
+    audio_df = timestamps_df[timestamps_df["Audio file"] == audio_file]
+    return list(zip(audio_df["Start"], audio_df["End"]))
+
+
+def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
+    """Returns the transcription column
+
+    Parameters
+    ----------
+    timestamps_df : pd.DataFrame
+        DataFrame with transcriptions
+
+    Returns
+    -------
+    List[str]
+        List of the transcriptions
+    """
+    return timestamps_df.iloc[:, 1].tolist()
+
+
+def save_transcriptions_segments(
+    audio_path: Path, transcriptions_list: List[str], destination: Path
+) -> None:
+    """Save transcription segments to text files.
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path to the audio file.
+    transcriptions_list : List[str]
+        List of transcriptions.
+    destination : Path
+        Destination path for the text files.
+    """
+    for i, transcription_i in enumerate(transcriptions_list):
+        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
+        with open(str(transcription_i_path), "w") as file:
+            file.write(transcription_i)
+
+
+def generate_transcriptions_splits(
+    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
+) -> None:
+    """Generate and save transcription splits based on timestamps.
+
+    Parameters
+    ----------
+    audio_path : Path
+        Path to the audio file.
+    timestamps_df : pd.DataFrame
+        DataFrame containing timestamps.
+    destination : Path
+        Destination path for the text files.
+    """
+    transcriptions_list = get_utterances_transcriptions(timestamps_df)
+    save_transcriptions_segments(audio_path, transcriptions_list, destination)
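The extractors only rely on the `segments` structure of the whisper output, so they can be sanity-checked with a hand-written dict. A sketch with made-up values:

```python
# Hand-written stand-in for whisper_timestamped output, for illustration only.
from utils.text import get_sentence_data, get_word_data, filter_dataframe_by_audiofile

fake_result = {
    "segments": [
        {
            "text": "hello world",
            "start": 0.0,
            "end": 1.2,
            "words": [
                {"text": "hello", "start": 0.0, "end": 0.5},
                {"text": "world", "start": 0.6, "end": 1.2},
            ],
        }
    ]
}

sentences = get_sentence_data("sample.wav", fake_result)
words = get_word_data("sample.wav", fake_result)
print(filter_dataframe_by_audiofile(sentences, "sample.wav"))  # -> [(0.0, 1.2)]
```

Both extractors and `filter_dataframe_by_audiofile` key rows on the `Audio file` column, which is why `main_app` passes `audio_i.name` through consistently.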