import gradio as gr
import torch
import soundfile as sf
import io
import librosa
import numpy as np
from pytube import YouTube
import os
import random
from huggingface_hub import HfApi
import pandas as pd
from moviepy.editor import *
import matplotlib.pyplot as plt


# --- Audio configuration ----------------------------------------------------
# Sample rate (Hz) that audio is loaded at / resampled to in this file.
FS=16000
# Maximum number of samples kept per input: 180 seconds at FS.
MAX_SIZE = FS * 180
# Chunk length in seconds, and the same length expressed in samples (N).
CHUNK_SIZE = 4
N = CHUNK_SIZE * FS

# --- Deployment configuration, read from environment variables --------------
# os.getenv returns None for any variable that is not set.
# HF_TOKEN_DEMO authenticates the Hub client; MODEL_REPO / MODELNAME identify
# the model file that is downloaded below.
HF_TOKEN_DEMO=os.getenv("HF_TOKEN_DEMO")
MODEL_REPO=os.getenv("MODEL_REPO")
MODELNAME=os.getenv("MODELNAME")
# Username/password pairs passed to demo.launch(auth=...) at the end of the file.
username=os.getenv("username")
password=os.getenv("password")
username0=os.getenv("username0")
password0=os.getenv("password0")


username9=os.getenv("username9")
password9=os.getenv("password9")


username12=os.getenv("username12")
password12=os.getenv("password12")

username17=os.getenv("username17")
password17=os.getenv("password17")


# Hub client used only to fetch the model file.
hf_api = HfApi(
    endpoint="https://huggingface.co", # Can be a Private Hub endpoint.
    token=HF_TOKEN_DEMO, # Token is not persisted on the machine.
)

# Download the model file and load it as a TorchScript module. Both calls run
# at import time, so importing this module requires the variables above.
modelfile = hf_api.hf_hub_download(MODEL_REPO,MODELNAME)
MODEL = torch.jit.load(modelfile)


def reformat_freq(sr, y):
    """Convert raw audio to mono float32 samples at ``FS`` Hz.

    Args:
        sr: Sample rate of ``y`` in Hz.
        y: NumPy array of samples, either 1-D or 2-D laid out as
            ``(samples, channels)``.

    Returns:
        Tuple ``(rate, samples)`` where ``samples`` is a 1-D float32 array
        and ``rate`` is the sample rate of that array (always ``FS``).
    """
    if y.ndim > 1:
        # Average the channels. This also flattens a single-channel
        # ``(samples, 1)`` array to 1-D, so the resampling below (which
        # operates along the last axis) runs over time, not over channels.
        y = y.mean(axis=1)

    # NOTE(review): integer PCM is only cast here, not rescaled to [-1, 1] --
    # confirm that this matches what the model expects.
    y = y.astype(np.float32)
    if sr != FS:
        y = librosa.resample(y, orig_sr=sr, target_sr=FS)

    # The samples are at FS from here on, so report FS rather than the
    # input rate.
    return FS, y


def preprocess_audio(audio):
    """Build the model input tensor from a ``(sample_rate, samples)`` pair.

    The pair is passed through ``reformat_freq``, the samples are cut to the
    first ``MAX_SIZE`` entries, and the result is returned as a float32
    tensor with an extra leading dimension of size 1.
    """
    _, samples = reformat_freq(*audio)
    clipped = samples[:MAX_SIZE]
    return torch.unsqueeze(torch.as_tensor(clipped, dtype=torch.float32), 0)

def postprocess_output(score):
    """Format a scalar score as a percentage string with two decimals.

    ``score`` must expose ``.item()`` returning a number (for example a
    one-element tensor); a value of 0.5 yields ``"50.00%"``.
    """
    percent = round(100 * score.item(), 2)
    return f"{percent:.2f}%"

def process_youtube_address(youtube_address):
    """Download the audio of a YouTube video and convert it to a mono WAV.

    The audio-only mp4 stream is downloaded into ``tmp/``, converted with
    ffmpeg to a single-channel WAV at ``FS`` Hz in the current working
    directory, and the downloaded file is removed afterwards.

    Args:
        youtube_address: URL of the video.

    Returns:
        Path of the WAV file, or ``None`` if the download or the conversion
        failed (the error is printed).
    """
    # Imported locally so this function carries its own dependencies.
    import subprocess
    import uuid

    print("Downloading youtube audio from video...")

    # A UUID keeps concurrent requests from overwriting each other's files.
    token = uuid.uuid4().hex
    audioname = f"audio-{token}.mp4a"
    audiowav = f"audio-{token}.wav"

    try:
        selected_video = YouTube(youtube_address)
        audio = selected_video.streams.filter(only_audio=True, file_extension='mp4').first()
        if audio is None:
            raise RuntimeError("no audio-only mp4 stream available")
        audiomp4a = audio.download('tmp', audioname)
        try:
            # Argument list (no shell) so paths are never interpreted by a
            # shell; check=True turns a failed conversion into an exception
            # instead of returning a path to a file that was never written.
            subprocess.run(
                ["ffmpeg", "-i", audiomp4a, "-ac", "1", "-ar", str(FS), audiowav],
                check=True,
                stdin=subprocess.DEVNULL,
            )
        finally:
            # The downloaded stream is only an intermediate file.
            if os.path.exists(audiomp4a):
                os.remove(audiomp4a)
    except Exception as inst:
        # Best-effort boundary: callers treat None as "could not get audio".
        print(f"Exception: {inst}")
        print(f"ERROR while downloading audio from {youtube_address}")
        return None
    return audiowav


def create_chunk_plot(x, ini, end, scores, lvec, scr):
    """Plot the waveform together with the per-chunk score curve.

    Args:
        x: Torch tensor holding the waveform; it is flattened to 1-D.
        ini: Number of leading samples that get no score (drawn as a gap).
        end: Sample index after which no score is drawn.
        scores: Per-chunk scores; must expose ``.tolist()``. Each score is
            multiplied by 100 for display. Assumed to be 1-D.
        lvec: Per-chunk lengths in samples; must expose ``.tolist()``. Each
            score is repeated for that many samples.
        scr: Text shown in the plot title as the overall result.

    Returns:
        The matplotlib figure.

    Raises:
        ValueError: If ``ini``, ``lvec`` and ``end`` do not add up to the
            number of samples in ``x``.
    """
    x = x.reshape(-1)
    T = x.size(0)
    ini, end = int(ini), int(end)
    t = np.arange(T) / FS

    # Build the score curve with array operations rather than per-sample
    # Python lists: the input can hold up to MAX_SIZE samples.
    chunk_scores = 100 * np.asarray(scores.tolist(), dtype=float)
    chunk_lens = np.asarray(lvec.tolist()).astype(int)  # truncates like int()
    n_chunks = min(len(chunk_scores), len(chunk_lens))
    result = np.concatenate([
        np.full(max(ini, 0), np.nan),
        np.repeat(chunk_scores[:n_chunks], chunk_lens[:n_chunks]),
        np.full(max(T - end, 0), np.nan),
    ])

    # Explicit check instead of assert, which is stripped under ``python -O``.
    if len(result) != T:
        raise ValueError(f"Length result: {len(result)} - Length audio {T}")

    # Rescale the waveform to 0..100 so it shares the axis with the scores.
    # A constant signal has zero range; leave it at 0 instead of dividing
    # by zero and producing NaNs.
    x = x - torch.min(x)
    peak = torch.max(x)
    if peak > 0:
        x = x / peak * 100

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(t, x, alpha=0.3)
    ax.plot(t, result, color='tab:red')
    ax.set_ylabel('Probability of Real')
    ax.set_xlabel('Time (s)')
    ax.set_title(f"Prob. of real audio = {scr}")
    ax.set_yticks(np.arange(11) * 10)

    # pyplot keeps every figure it creates until it is closed. Deregister
    # this one so repeated requests do not accumulate open figures; the
    # Figure object itself can still be rendered by the caller.
    plt.close(fig)

    return fig


def process_micro(micro):
    """Score a microphone recording and return the per-chunk plot.

    Args:
        micro: ``(sample_rate, samples)`` pair, or ``None`` when there is no
            recording.

    Returns:
        The figure produced by ``create_chunk_plot``, or ``None`` when no
        audio was provided.
    """
    print("Micro processing")
    if micro is None:
        # Nothing to analyse; avoid failing while unpacking the pair.
        return None

    x = preprocess_audio(micro)
    print("Running model")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output, output_arr, lvec, ls, ts = MODEL(x)
    print(output)
    result = postprocess_output(output)

    return create_chunk_plot(x, ls, ts, output_arr, lvec, result)

def process_file(file):
    """Score an audio file and return the per-chunk plot.

    Args:
        file: Path of an audio file readable by ``librosa.load``.

    Returns:
        The figure produced by ``create_chunk_plot``.
    """
    print("File processing")
    # Only the first MAX_SIZE samples are analysed, so do not decode the rest
    # of a long file. One extra second is loaded so that resampling edge
    # effects stay outside the part that is kept.
    x, fs = librosa.load(file, sr=FS, duration=MAX_SIZE / FS + 1)
    x = preprocess_audio((fs, x))
    print("Running model")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output, output_arr, lvec, ls, ts = MODEL(x)
    print(output)
    result = postprocess_output(output)

    return create_chunk_plot(x, ls, ts, output_arr, lvec, result)

def process_files(files):
    """Score a batch of audio files and return one overall result per file.

    Args:
        files: Iterable of uploaded files, each either an object exposing a
            ``.name`` path or a plain path string. ``None`` is treated as an
            empty batch.

    Returns:
        ``pandas.DataFrame`` with columns ``"File"`` (base name) and
        ``"Probability of Real"`` (formatted percentage).
    """
    print("Batch processing")
    fnames = []
    resout = []
    for f in files or []:
        # Accept both objects carrying the path in ``.name`` and plain paths.
        file = getattr(f, "name", f)
        # Only the first MAX_SIZE samples are analysed; the extra second
        # keeps resampling edge effects outside the part that is kept.
        x, fs = librosa.load(file, sr=FS, duration=MAX_SIZE / FS + 1)
        x = preprocess_audio((fs, x))
        print("Running model")
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            output, _, _, _, _ = MODEL(x)
        print(output)
        resout.append(postprocess_output(output))
        fnames.append(os.path.basename(file))

    return pd.DataFrame({"File": fnames, "Probability of Real": resout})

def process_video(file):
    """Extract the audio track of a video file and score it.

    The audio is written to a temporary WAV file under ``tmp/``, passed to
    ``process_file``, and the temporary file is removed afterwards.

    Args:
        file: Path of the video file.

    Returns:
        Whatever ``process_file`` returns for the extracted audio.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Imported locally so this function carries its own dependencies.
    import tempfile

    os.makedirs('tmp', exist_ok=True)
    # mkstemp reserves a unique name, so concurrent requests cannot collide.
    fd, audiowav = tempfile.mkstemp(prefix="audio-", suffix=".wav", dir="tmp")
    os.close(fd)

    try:
        video = VideoFileClip(file)
        try:
            audio = video.audio
            if audio is None:
                raise ValueError("Video has no audio track: {}".format(file))
            # write_audiofile is the documented moviepy method name.
            audio.write_audiofile(audiowav)
        finally:
            # Release the reader resources held by the clip.
            video.close()

        return process_file(audiowav)
    finally:
        # Remove the temporary WAV even when extraction or scoring failed.
        if os.path.exists(audiowav):
            os.remove(audiowav)

def process_youtube(youtube_address):
    """Score the audio of a YouTube video.

    Returns the result of ``process_file`` for the downloaded audio, or an
    error message string when the audio could not be obtained.
    """
    audiofile = process_youtube_address(youtube_address)

    # Guard clause: a missing file means the download/conversion step failed.
    if audiofile is None:
        return "Could not get audio from {}".format(youtube_address)

    return process_file(audiofile)


# Gradio UI: one tab for single inputs (plot output) and one tab for batches
# of files (table output).
with gr.Blocks(title="Audio Fake Detector") as demo:
    with gr.Tab("Individual Processing"):
        gr.Markdown("""# [Hiya](https://www.hiya.com/products/ai-voice) - AI Voice detection demo
        This is a demo of our Authenticity Verification solution, aimed at detecting if a voice is real or not.
        * Input - audio file in any format
        * Output - probability of that voice being real or AI-generated (1.0 - Real / 0.0 AI-generated)
                            
        There are two testing modes:
        * Individual processing - for single files. You will see a time-based view and scores for each 4-second chunk. Best for single long files.
        * Batch processing - for a batch of files. You will see a single overall score per file. Best to assess multiple short files.

        Only the first 3 minutes of audio are analyzed.""")
    
    
        with gr.Row():
            # Left column: the two input widgets (microphone and file upload).
            with gr.Column():
                m = gr.Audio(sources=["microphone"], type="numpy",label="Micro")
                f = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
                #y = gr.Textbox(label="Enter YouTube address here")
                #v = gr.Video(label="Enter a video", include_audio=True, scale=0.5)

            # Right column: the plot returned by the processing functions.
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                
                    img = gr.Plot(show_label=False)
        
        #file= gr.Audio(source="upload", type="filepath", optional=True)
        #button_clear = gr.ClearButton([m,f,y,v,text])
        button_clear = gr.ClearButton([m,f,img])
        # process_micro runs when a recording stops and process_file when a
        # file is uploaded; both draw their returned figure in `img`.
        m.stop_recording(process_micro, inputs=[m], outputs=img)
        f.upload(process_file,inputs=[f], outputs=img)
        #y.submit(process_youtube, inputs=[y], outputs=text)
        #v.upload(process_video, inputs=[v], outputs=[text])

    with gr.Tab("Batch Processing"):
        gr.Markdown("# [Hiya](https://www.hiya.com/products/ai-voice) - AI Voice detection demo")

        with gr.Row():
            with gr.Column():
                # NOTE: `f` (and `button_clear` below) are rebound here; the
                # first tab's handlers were already attached to the earlier
                # components, so they are unaffected.
                f = gr.File(file_types=["audio"], label="Audio file", file_count="multiple")


            with gr.Column():
                with gr.Row(equal_height=True):

                    # Table filled with the DataFrame returned by process_files.
                    textbatch = gr.Dataframe(
                        headers=["File", "Probability of Real"],
                        datatype=["str", "str"],
                    )

            
        button_clear = gr.ClearButton([f,textbatch])
        
        # process_files runs on upload and fills the table.
        f.upload(process_files,inputs=[f], outputs=[textbatch])

# Start the app with login enabled. The credential pairs come from the
# environment variables read near the top of the file; a pair whose variables
# are unset is (None, None).
demo.launch(auth=[(username,password),(username0,password0) ,(username9,password9), (username12,password12),(username17,password17)])