Spaces:

Labbeti
/

conette

Running

File size: 7,988 Bytes

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
from typing import Any, Optional, Union

import streamlit as st
import torchaudio

from st_audiorec import st_audiorec
from streamlit.runtime.uploaded_file_manager import UploadedFile
from torch import Tensor

from conette import CoNeTTEModel, conette
from conette.utils.collections import dict_list_to_list_dict


# Choices offered by the "Allow repetition of words" selectbox.
ALLOW_REP_MODES = ("stopwords", "all", "none")
# Upper bound of the "Beam size" slider.
MAX_BEAM_SIZE = 20
# Upper bound of the "Minimal and maximal number of words" slider.
MAX_PRED_SIZE = 30
# Maximum number of audio files passed to the model in a single call.
MAX_BATCH_SIZE = 32
# Name under which the microphone recording is stored among the audio inputs.
RECORD_AUDIO_FNAME = "microphone_conette_record.wav"
# Initial value of the "Tags threshold" slider.
DEFAULT_THRESHOLD = 0.3
# The threshold slider offers i / THRESHOLD_PRECISION for i in 0..THRESHOLD_PRECISION.
THRESHOLD_PRECISION = 100
# Audio durations (in seconds) outside [MIN, MAX] are rejected with an error
# message instead of being sent to the model.
MIN_AUDIO_DURATION_SEC = 0.3
MAX_AUDIO_DURATION_SEC = 60


@st.cache_resource
def load_conette(*args, **kwargs) -> CoNeTTEModel:
    """Build the CoNeTTE model by forwarding all arguments to ``conette``.

    The function is wrapped with ``st.cache_resource``, so Streamlit can reuse
    the returned model instead of rebuilding it for identical arguments.
    """
    model = conette(*args, **kwargs)
    return model


def format_candidate(candidate: str) -> str:
    """Turn a raw caption into a display sentence.

    The first character is title-cased and a trailing period is appended.
    An empty caption is returned as an empty string (no period added).
    """
    if not candidate:
        return ""

    first_char, remainder = candidate[0], candidate[1:]
    return f"{first_char.title()}{remainder}."


def format_tags(tags: Optional[list[str]]) -> str:
    """Render a list of tags as a comma-separated string.

    Returns the literal text ``"None."`` when ``tags`` is ``None`` or empty.
    """
    return ", ".join(tags) if tags else "None."


def get_result_hash(audio_fname: str, generate_kwds: dict[str, Any]) -> str:
    """Build the key identifying a result for a file name and its options.

    The key is the file name, a dash, then the string form of the
    generation options dictionary.
    """
    return "{}-{}".format(audio_fname, generate_kwds)


def _duration_error_msg(audio_fname: str, duration: float, problem: str) -> str:
    """Build the error text stored for an audio whose duration is out of range.

    ``problem`` is the word inserted after "too" ("short" or "long").
    """
    return f"""
            ##### Result for "{audio_fname}"
            Audio file is too {problem}. (found {duration:.2f}s but the model expect audio in range [{MIN_AUDIO_DURATION_SEC}, {MAX_AUDIO_DURATION_SEC}])
            """


def _remove_files(paths: list[str]) -> None:
    """Delete every file in ``paths``, skipping those that cannot be removed."""
    for path in paths:
        try:
            os.unlink(path)
        except OSError:
            # Best-effort cleanup: a leftover temporary file must not hide the
            # results (or the original exception) from the caller.
            pass


def get_results(
    model: CoNeTTEModel,
    audio_files: dict[str, bytes],
    generate_kwds: dict[str, Any],
) -> dict[str, Union[dict[str, Any], str]]:
    """Return the result associated with each audio file, computing missing ones.

    Results are stored in ``st.session_state`` under the key given by
    ``get_result_hash``; an audio is only processed when its key is absent,
    except the microphone recording (``RECORD_AUDIO_FNAME``), which is always
    processed again.

    Audio bytes are written to temporary files so they can be passed to the
    model as paths. These files are removed before returning, including when
    reading the metadata or running the model raises.

    Args:
        model: Callable invoked with a list of audio paths and ``generate_kwds``.
        audio_files: Mapping from audio file name to its raw bytes.
        generate_kwds: Keyword arguments forwarded to the model call.

    Returns:
        Mapping from audio file name to either the model output for that file,
        or an error message (``str``) when the audio duration is outside
        ``[MIN_AUDIO_DURATION_SEC, MAX_AUDIO_DURATION_SEC]``.
    """
    # Get audio to be processed
    audio_to_predict: dict[str, tuple[str, bytes]] = {}
    for audio_fname, audio in audio_files.items():
        result_hash = get_result_hash(audio_fname, generate_kwds)
        if result_hash not in st.session_state or audio_fname == RECORD_AUDIO_FNAME:
            audio_to_predict[result_hash] = (audio_fname, audio)

    # Result hash -> temporary path, only for audio with a valid duration.
    tmp_paths: dict[str, str] = {}
    # Every temporary file created below, so that all of them get removed.
    created_paths: list[str] = []

    try:
        # Save audio to be processed
        for result_hash, (audio_fname, audio) in audio_to_predict.items():
            with NamedTemporaryFile(delete=False) as tmp_file:
                created_paths.append(tmp_file.name)
                tmp_file.write(audio)

            metadata = torchaudio.info(tmp_file.name)  # type: ignore
            duration = metadata.num_frames / metadata.sample_rate

            if duration < MIN_AUDIO_DURATION_SEC:
                st.session_state[result_hash] = _duration_error_msg(
                    audio_fname, duration, "short"
                )
            elif duration > MAX_AUDIO_DURATION_SEC:
                st.session_state[result_hash] = _duration_error_msg(
                    audio_fname, duration, "long"
                )
            else:
                tmp_paths[result_hash] = tmp_file.name

        # Generate predictions and store them in session state
        result_hashes = list(tmp_paths)
        for start in range(0, len(result_hashes), MAX_BATCH_SIZE):
            result_hashes_j = result_hashes[start : start + MAX_BATCH_SIZE]
            tmp_paths_j = [tmp_paths[result_hash] for result_hash in result_hashes_j]
            outputs_j = model(
                tmp_paths_j,
                **generate_kwds,
            )
            outputs_lst = dict_list_to_list_dict(outputs_j)  # type: ignore
            for result_hash, output_i in zip(result_hashes_j, outputs_lst):
                st.session_state[result_hash] = output_i
    finally:
        # The files were created with delete=False, so nothing else removes them.
        _remove_files(created_paths)

    # Get outputs
    return {
        audio_fname: st.session_state[get_result_hash(audio_fname, generate_kwds)]
        for audio_fname in audio_files
    }


def show_results(outputs: dict[str, Union[dict[str, Any], str]]) -> None:
    """Display one result panel per audio file, in reverse insertion order.

    Args:
        outputs: Mapping from audio file name to either an error message
            (shown with ``st.error``) or a dict read through the keys
            ``"cands"``, ``"lprobs"``, ``"mult_cands"``, ``"mult_lprobs"`` and
            the optional ``"tags"`` (shown with ``st.success``).
    """
    entries = list(outputs.items())[::-1]

    st.divider()

    for audio_fname, output in entries:
        # Error messages are stored as plain strings.
        if isinstance(output, str):
            st.error(output)
            st.divider()
            continue

        raw_cand: str = output["cands"]
        lprobs: Tensor = output["lprobs"]
        raw_tags = output.get("tags")
        raw_mult_cands: list[str] = output["mult_cands"]
        mult_lprobs: Tensor = output["mult_lprobs"]

        # Log-probabilities are exponentiated before being shown as percentages.
        cand = format_candidate(raw_cand)
        prob = lprobs.exp().tolist()
        tags = format_tags(raw_tags)
        formatted_cands = [format_candidate(cand_i) for cand_i in raw_mult_cands]
        all_probs = mult_lprobs.exp()

        # Sort by decreasing probability and drop the first (most probable)
        # entry -- presumably it duplicates the main description; confirm.
        ranking = all_probs.argsort(descending=True)[1:]
        other_probs = all_probs[ranking].tolist()
        other_cands = [formatted_cands[idx] for idx in ranking]

        header = (
            "##### Result for microphone input:"
            if audio_fname == RECORD_AUDIO_FNAME
            else f'##### Result for "{audio_fname}"'
        )

        lines = [
            header,
            f'- **Description:** "{cand}" ({prob*100:.1f}%)',
            f"- **Tags:** {tags}",
        ]
        if other_cands:
            lines.append("- **Other descriptions:**")
        lines.extend(
            f'  - "{cand_i}" ({prob_i*100:.1f}%)'
            for cand_i, prob_i in zip(other_cands, other_probs)
        )

        st.success("\n".join(lines))
        st.divider()


def main() -> None:
    """Build the Streamlit page: inputs, model options, then results.

    The page loads the model on CPU, offers a microphone recorder and a file
    uploader, exposes the generation options in an expander, and, when at
    least one audio is available, computes and displays the descriptions.

    Raises:
        ValueError: If the selected repetition mode is not a known option.
    """
    model = load_conette(model_kwds=dict(device="cpu"))

    st.header("Describe audio content with CoNeTTE")
    st.markdown(
        "This interface allows you to generate a short description of the sound events of any recording. You can try it from your microphone or upload a file below."
    )

    record_data = st_audiorec()
    audio_files: Optional[list[UploadedFile]] = st.file_uploader(
        "**Or upload audio files here:**",
        type=["wav", "flac", "mp3", "ogg", "avi"],
        accept_multiple_files=True,
        help="Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum.",
    )

    with st.expander("Model hyperparameters"):
        task = st.selectbox("Task embedding input", model.tasks, 0)
        allow_rep_mode = st.selectbox("Allow repetition of words", ALLOW_REP_MODES, 0)

        beam_choices = list(range(1, MAX_BEAM_SIZE + 1))
        beam_size: int = st.select_slider(  # type: ignore
            "Beam size",
            beam_choices,
            model.config.beam_size,
        )

        default_pred_range = (model.config.min_pred_size, model.config.max_pred_size)
        min_pred_size, max_pred_size = st.slider(
            "Minimal and maximal number of words",
            1,
            MAX_PRED_SIZE,
            default_pred_range,
        )

        threshold_choices = [
            (i / THRESHOLD_PRECISION) for i in range(THRESHOLD_PRECISION + 1)
        ]
        threshold = st.select_slider(
            "Tags threshold",
            threshold_choices,
            DEFAULT_THRESHOLD,
        )

        # The UI asks which words MAY repeat; the model option is expressed as
        # which words may NOT repeat, so each choice maps to its complement.
        allowed_to_forbidden = {
            "all": "none",
            "none": "all",
            "stopwords": "content_words",
        }
        if allow_rep_mode not in allowed_to_forbidden:
            raise ValueError(
                f"Unknown option {allow_rep_mode=}. (expected one of {ALLOW_REP_MODES})"
            )
        forbid_rep_mode = allowed_to_forbidden[allow_rep_mode]
        del allow_rep_mode

        generate_kwds: dict[str, Any] = {
            "task": task,
            "beam_size": beam_size,
            "min_pred_size": min_pred_size,
            "max_pred_size": max_pred_size,
            "forbid_rep_mode": forbid_rep_mode,
            "threshold": threshold,
        }

    # Uploaded files first, then the microphone recording under its fixed name.
    audios: dict[str, bytes] = {}
    for uploaded in audio_files or ():
        audios[uploaded.name] = uploaded.getvalue()
    if record_data is not None:
        audios[RECORD_AUDIO_FNAME] = record_data

    if not audios:
        return

    with st.spinner("Generating descriptions..."):
        outputs = get_results(model, audios, generate_kwds)
    st.header("Results:")
    show_results(outputs)


# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()