# Hugging Face Space: dj-dawgs-ipd/IPD_Audio_Pipeline (status: Running)
# File size: 3,476 Bytes

import json
import logging
import wave

import gradio as gr
import pandas as pd
from gradio_client import Client, handle_file
from vosk import Model, KaldiRecognizer

# Identifiers of the Hugging Face Spaces that host the three remote classifiers.
_ENG_TEXT_SPACE = "dj-dawgs-ipd/IPD-Text-English-Finetune"
_HING_TEXT_SPACE = "dj-dawgs-ipd/IPD-Text-Hinglish"
_AUDIO_SPACE = "dj-dawgs-ipd/IPD_Audio_HuBERT"

# One gradio_client handle per remote classifier, created once at import time.
clientEngText = Client(_ENG_TEXT_SPACE)
clientHingText = Client(_HING_TEXT_SPACE)
clientAud = Client(_AUDIO_SPACE)

# Profanity word list. `profanity_hn` is a pandas Series, so `x in profanity_hn`
# tests the index labels rather than the words themselves; convert it to a set
# (or use `.values`) before doing membership tests against it.
profanity_df = pd.read_csv("Hinglish_Profanity_List.csv", encoding="utf-8")
profanity_hn = profanity_df["profanity_hn"]

# Vosk acoustic/language model for "en-us", reused by every recognizer created
# in stt_vosk.
vosk_model = Model(lang="en-us")


# import whisper
# def stt_whisper(file_path):
#     model = whisper.load_model("base")
#     try:
#         result = model.transcribe(file_path)
#         return result["text"]
#     except Exception as e:
#         print(e)
#         return ""


def stt_vosk(file_path):
    """Transcribe a WAV file to text with the module-level Vosk model.

    Args:
        file_path: Path of a WAV file that the ``wave`` module can open.

    Returns:
        The ``"text"`` field of Vosk's final result, or ``""`` when the file
        cannot be read or recognition fails. Transcription is best-effort:
        failures are logged with their traceback instead of being raised.
    """
    try:
        # The context manager closes the file even if recognition raises.
        with wave.open(file_path, "rb") as wf:
            rec = KaldiRecognizer(vosk_model, wf.getframerate())
            rec.SetWords(True)
            rec.SetPartialWords(True)
            # Feed the audio to the recognizer in 4000-frame chunks until the
            # file is exhausted.
            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())
        return result["text"]
    except Exception:
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate while keeping the original "empty string on failure"
        # contract for callers.
        logging.getLogger(__name__).exception(
            "Vosk transcription failed for %r", file_path
        )
        return ""


def extract_text(audio_path):
    """Return the lower-cased Vosk transcript of the audio at ``audio_path``."""
    transcript = stt_vosk(audio_path)
    return transcript.lower()


def predict_hate_speech(audio_path):
    """Classify an audio clip as hate speech or not.

    The clip is sent to the remote audio classifier, transcribed locally with
    Vosk, and the first 200 characters of the transcript are sent to the
    English and Hinglish text classifiers. The transcript is also checked
    word-by-word against the profanity list.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A dict with the keys ``prediction``, ``language``, ``label``,
        ``confidence`` and ``hate_text``:

        * If any transcript word is in the profanity list: ``prediction`` is
          ``'hate'``, ``label`` is ``'Profanity found'`` and ``hate_text`` is
          the comma-joined list of matched words.
        * Else if any classifier reports a hate label with confidence above
          the threshold: ``prediction`` is ``'hate'``, ``confidence`` is the
          mean hate confidence of the three classifiers and ``hate_text`` is
          the transcript.
        * Otherwise ``prediction`` is ``'not_hate'`` and every other field is
          ``None``.
    """
    raw_audio_result = clientAud.predict(
        audio_path=handle_file(audio_path),
        api_name="/predict"
    )
    # The audio classifier's reply is a single-quoted dict string; swap the
    # quotes so it can be parsed as JSON.
    audResult = json.loads(raw_audio_result.replace("'", '"'))

    stt_text = extract_text(audio_path)
    text_snippet = stt_text[:200]

    engResult = clientEngText.predict(
        text=text_snippet,
        api_name="/predict"
    )

    hingResult = clientHingText.predict(
        text=text_snippet,
        api_name="/predict"
    )

    # `profanity_hn` is a pandas Series, and `word in series` tests the index
    # labels, not the values. Build a set of the actual words so the
    # membership test checks what it is meant to check.
    profanity_words = set(profanity_hn)
    profanityFound = [word for word in stt_text.split() if word in profanity_words]

    threshold = 0.6

    # Each classifier has its own "not hate" label: "NEITHER" for the English
    # model and "NAG" for the Hinglish model. The audio label is stripped so
    # the comparison works with or without the trailing newline.
    eng_flags_hate = engResult[0] != "NEITHER"
    hing_flags_hate = hingResult[0] != "NAG"
    aud_flags_hate = str(audResult['Classification']).strip() == 'Hate Speech'

    isHate = (
        (eng_flags_hate and engResult[1] > threshold)
        or (hing_flags_hate and hingResult[1] > threshold)
        or (aud_flags_hate and audResult['Confidence'] > threshold)
    )

    # Convert each score to "confidence that the clip is hate": a score that
    # comes with a non-hate label is inverted.
    engConf = engResult[1] if eng_flags_hate else (1 - engResult[1])
    hingConf = hingResult[1] if hing_flags_hate else (1 - hingResult[1])
    audConf = audResult['Confidence'] if aud_flags_hate else (1 - audResult['Confidence'])

    confidence = (engConf + hingConf + audConf) / 3

    # A profanity hit takes precedence over the classifier verdicts.
    if profanityFound:
        return {
            'prediction' : 'hate',
            'language' : 'Hindi',
            'label' : 'Profanity found',
            'confidence' : None,
            'hate_text' : ",".join(profanityFound)
        }

    if isHate:
        return {
            'prediction' : 'hate',
            'language' : 'English' if engConf > hingConf else 'Hinglish',
            'label' : None,
            'confidence' : confidence,
            'hate_text' : stt_text
        }

    return {
        'prediction' : 'not_hate',
        'language' : None,
        'label' : None,
        'confidence' : None,
        'hate_text' : None
    }


# Gradio front end: one audio upload (handed to the handler as a file path) in,
# the prediction dict rendered as JSON out.
_audio_input = gr.Audio(type="filepath", label="Upload Audio")
_json_output = gr.JSON()
_example_clips = [["hate_1.wav"], ["hate_2.wav"]]

iface = gr.Interface(
    fn=predict_hate_speech,
    inputs=_audio_input,
    outputs=_json_output,
    title="Hate Speech Audio Pipeline",
    description="Upload an audio file to detect potential hate speech content.",
    examples=_example_clips,
    allow_flagging="manual",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()