"""Gradio demo for an AI-voice detector.

The app downloads a TorchScript model from the Hugging Face Hub, exposes a
microphone / single-file tab that plots chunk-level scores over time, and a
batch tab that returns one overall score per uploaded file.
"""
# NOTE: the original import order is preserved on purpose. The star import
# from moviepy.editor can rebind module-level names, so moving it relative to
# the other imports could change which object a name refers to.
import gradio as gr
import torch
import soundfile as sf
import io
import librosa
import numpy as np
from pytube import YouTube
import os
import random
import subprocess
from huggingface_hub import HfApi
import pandas as pd
from moviepy.editor import *
import matplotlib.pyplot as plt

FS = 16000            # sample rate (Hz) every input is converted to
MAX_SIZE = FS * 180   # maximum number of samples analysed (180 s at FS)
CHUNK_SIZE = 4        # seconds per chunk, as described in the UI text
N = CHUNK_SIZE * FS   # samples per chunk

HF_TOKEN_DEMO = os.getenv("HF_TOKEN_DEMO")
MODEL_REPO = os.getenv("MODEL_REPO")
MODELNAME = os.getenv("MODELNAME")

# Login credentials for the demo, all read from the environment.
username = os.getenv("username")
password = os.getenv("password")
username0 = os.getenv("username0")
password0 = os.getenv("password0")
username9 = os.getenv("username9")
password9 = os.getenv("password9")
username12 = os.getenv("username12")
password12 = os.getenv("password12")
username17 = os.getenv("username17")
password17 = os.getenv("password17")

# NOTE(review): the link target of the "[Hiya](...)" heading was missing from
# the source, which left an unterminated markdown link in both tabs. The
# company home page is used here; confirm the intended URL.
HIYA_URL = "https://www.hiya.com"

HEADING_MARKDOWN = f"# [Hiya]({HIYA_URL}) - AI Voice detection demo"

INTRO_MARKDOWN = f"""{HEADING_MARKDOWN}

This is a demo of our Authenticity Verification solution, aimed at detecting if a voice is real or not.

* Input - audio file in any format
* Output - probability of that voice being real or AI-generated (1.0 - Real / 0.0 AI-generated)

There are two testing modes:

* Individual processing - for single files. You will see a time-based view and scores for each 4-second chunk. Best for single long files.
* Batch processing - for a batch of files. You will see a single overall score per file. Best to assess multiple short files.

Only the first 3 minutes of audio are analyzed."""

hf_api = HfApi(
    # The endpoint was an empty string in the source, which is not a usable
    # Hub URL; the public Hub endpoint is restored here.
    endpoint="https://huggingface.co",  # Can be a Private Hub endpoint.
    token=HF_TOKEN_DEMO,  # Token is not persisted on the machine.
)
modelfile = hf_api.hf_hub_download(MODEL_REPO, MODELNAME)
MODEL = torch.jit.load(modelfile)


def reformat_freq(sr, y):
    """Convert raw audio to mono float32 samples at ``FS`` Hz.

    Args:
        sr: Sample rate of ``y`` in Hz.
        y: NumPy array of samples, either 1-D or 2-D with channels on axis 1.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D float32
        array and ``sample_rate`` is the rate of the returned samples.
    """
    if y.ndim > 1:
        # Average the channels. This also flattens a single-channel (n, 1)
        # array, which previously stayed 2-D and was resampled along the
        # wrong axis.
        y = y.mean(axis=1)
    # NOTE(review): integer input is cast but not rescaled, whereas
    # librosa.load() returns floats. Confirm the model tolerates both ranges.
    y = y.astype(np.float32)
    if sr != FS:
        y = librosa.resample(y, orig_sr=sr, target_sr=FS)
        sr = FS  # report the rate of the data actually returned
    return sr, y


def preprocess_audio(audio):
    """Turn a ``(sample_rate, samples)`` pair into a model input tensor.

    The audio is converted with ``reformat_freq``, truncated to ``MAX_SIZE``
    samples and returned as a float32 tensor with a leading dimension of 1.
    """
    _, y = reformat_freq(*audio)
    y = y[:MAX_SIZE]
    y = torch.as_tensor(y, dtype=torch.float32)
    return torch.unsqueeze(y, 0)


def postprocess_output(score):
    """Format a single-element score tensor as a percentage string ("12.34%")."""
    out = round(100 * score.item(), 2)
    return "{:.2f}%".format(out)


def process_youtube_address(youtube_address):
    """Download the audio track of a YouTube video and convert it to wav.

    Args:
        youtube_address: URL of the video.

    Returns:
        Path of the mono, ``FS`` Hz wav file, or ``None`` when the download or
        the conversion failed.
    """
    print("Downloading youtube audio from video...")
    try:
        selected_video = YouTube(youtube_address)
        audio = selected_video.streams.filter(
            only_audio=True, file_extension='mp4'
        ).first()
        nrand = round(random.random() * 1000)
        audioname = "audio-" + str(nrand) + ".mp4a"
        audiowav = "audio-" + str(nrand) + ".wav"
        # NOTE(review): this statement was garbled in the source; it is
        # reconstructed as a download into tmp/ that yields the saved path.
        audiomp4a = audio.download('tmp', audioname)
        try:
            # Argument list instead of a shell string: no quoting issues, and
            # check=True turns an ffmpeg failure into an exception instead of
            # returning the path of a wav file that was never written.
            # -y avoids blocking on an overwrite prompt if the name collides.
            subprocess.run(
                ["ffmpeg", "-y", "-i", audiomp4a,
                 "-ac", "1", "-ar", str(FS), audiowav],
                check=True,
            )
        finally:
            os.remove(audiomp4a)
    except Exception as inst:
        # Best-effort boundary: any failure is reported and mapped to None.
        print("Exception: {}".format(inst))
        print("ERROR while downloading audio from " + youtube_address)
        audiowav = None
    return audiowav


def create_chunk_plot(x, ini, end, scores, lvec, scr):
    """Plot the waveform together with the per-chunk scores.

    Args:
        x: Audio tensor; squeezed to 1-D before plotting.
        ini: Number of leading samples that have no score.
        end: Sample index at which the scored region stops.
        scores: Tensor of per-chunk scores; each is plotted as ``100 * score``.
        lvec: Tensor with the number of samples covered by each chunk.
        scr: Overall score text shown in the title.

    Returns:
        The matplotlib figure.

    Raises:
        AssertionError: If the expanded score curve does not have one value
            per audio sample.
    """
    x = x.squeeze()
    T = x.size(0)
    t = np.arange(T) / FS
    ini, end = int(ini), int(end)

    # Expand chunk scores to one value per sample; unscored regions are NaN so
    # that matplotlib leaves them blank.
    result = [np.nan] * ini
    for s, l in zip(scores.tolist(), lvec.tolist()):
        result.extend([100 * s] * int(l))
    result.extend([np.nan] * (T - end))

    assert len(result) == T, f"Length result: {len(result)} - Length audio {T}"
    assert len(t) == T, f"Length time: {len(t)} - Length audio {T}"

    # Rescale the waveform to 0..100 so it shares the axis with the scores.
    x = x - torch.min(x)
    peak = torch.max(x)
    if peak > 0:  # a constant signal would otherwise become 0/0 = NaN
        x = x / peak * 100

    # NOTE(review): figures made through pyplot stay registered until closed;
    # confirm whether the serving side releases them.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(t, x, alpha=0.3)
    ax.plot(t, result, color='tab:red')
    ax.set_ylabel('Probability of Real')
    ax.set_xlabel('Time (s)')
    ax.set_title(f"Prob. of real audio = {scr}")
    ax.set_yticks(np.arange(11) * 10)
    return fig


def _run_model(x):
    """Run MODEL on ``x`` without gradient tracking and return its outputs."""
    print("Running model")
    with torch.no_grad():
        outputs = MODEL(x)
    print(outputs[0])
    return outputs


def _score_and_plot(x):
    """Score a preprocessed tensor and build the chunk plot for it."""
    output, output_arr, lvec, ls, ts = _run_model(x)
    result = postprocess_output(output)
    return create_chunk_plot(x, ls, ts, output_arr, lvec, result)


def process_micro(micro):
    """Score a microphone recording given as ``(sample_rate, samples)``."""
    print("Micro processing")
    return _score_and_plot(preprocess_audio(micro))


def process_file(file):
    """Score one audio file (path) and return the chunk plot."""
    print("File processing")
    x, fs = librosa.load(file, sr=FS)
    return _score_and_plot(preprocess_audio((fs, x)))


def process_files(files):
    """Score several audio files and return one overall score per file.

    Args:
        files: Iterable of file paths, or of objects exposing the path as
            ``.name``. ``None`` is treated as an empty batch.

    Returns:
        A DataFrame with the columns "File" and "Probability of Real".
    """
    print("Batch processing")
    fnames = []
    resout = []
    for item in files or []:
        # The original body referenced an undefined name `file` here, so every
        # batch run failed with NameError.
        path = getattr(item, "name", item)
        x, fs = librosa.load(path, sr=FS)
        x = preprocess_audio((fs, x))
        output, *_ = _run_model(x)
        resout.append(postprocess_output(output))
        fnames.append(os.path.basename(path))
    return pd.DataFrame({"File": fnames, "Probability of Real": resout})


def process_video(file):
    """Extract the audio track of a video file and score it.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(file)
    try:
        # NOTE(review): the right-hand side of this assignment was missing in
        # the source; the clip's audio track is the reconstruction.
        audio = video.audio
        if audio is None:
            raise ValueError("Video has no audio track: {}".format(file))
        os.makedirs('tmp', exist_ok=True)
        nrand = round(random.random() * 1000)
        audiowav = "tmp/audio-" + str(nrand) + ".wav"
        audio.write_audiofile(audiowav)
        try:
            result = process_file(audiowav)
        finally:
            os.remove(audiowav)  # remove the temp file even if scoring fails
    finally:
        video.close()  # release the reader held by the clip
    return result


def process_youtube(youtube_address):
    """Score the audio of a YouTube video.

    Returns the chunk plot, or an error message string when the audio could
    not be obtained.
    """
    audiofile = process_youtube_address(youtube_address)
    if audiofile is None:
        return "Could not get audio from {}".format(youtube_address)
    try:
        return process_file(audiofile)
    finally:
        os.remove(audiofile)  # the converted wav is only needed for scoring


# NOTE(review): the source had lost all indentation, so the nesting of the
# layout containers below is reconstructed; confirm against the intended UI.
with gr.Blocks(title="Audio Fake Detector") as demo:
    with gr.Tab("Individual Processing"):
        gr.Markdown(INTRO_MARKDOWN)
        with gr.Row():
            with gr.Column():
                m = gr.Audio(sources=["microphone"], type="numpy", label="Micro")
                f = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
                #y = gr.Textbox(label="Enter YouTube address here")
                #v = gr.Video(label="Enter a video", include_audio=True, scale=0.5)
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    img = gr.Plot(show_label=False)
        #file= gr.Audio(source="upload", type="filepath", optional=True)
        #button_clear = gr.ClearButton([m,f,y,v,text])
        button_clear = gr.ClearButton([m, f, img])
        m.stop_recording(process_micro, inputs=[m], outputs=img)
        f.upload(process_file, inputs=[f], outputs=img)
        #y.submit(process_youtube, inputs=[y], outputs=text)
        #v.upload(process_video, inputs=[v], outputs=[text])

    with gr.Tab("Batch Processing"):
        gr.Markdown(HEADING_MARKDOWN)
        with gr.Row():
            with gr.Column():
                f = gr.File(file_types=["audio"], label="Audio file", file_count="multiple")
            with gr.Column():
                with gr.Row(equal_height=True):
                    textbatch = gr.Dataframe(
                        headers=["File", "Probability of Real"],
                        datatype=["str", "str"],
                    )
        button_clear = gr.ClearButton([f, textbatch])
        f.upload(process_files, inputs=[f], outputs=[textbatch])

# The credential pairs are passed through unfiltered on purpose: dropping
# unset pairs could leave an empty list and launch the app without a login.
demo.launch(auth=[(username, password), (username0, password0),
                  (username9, password9), (username12, password12),
                  (username17, password17)])