# DavidLoccus's picture
# Batch of files processing.
# 2e81dfe
# raw
# history blame
# 5.63 kB
import gradio as gr
import torch
import soundfile as sf
import io
import librosa
import numpy as np
from pytube import YouTube
import os
import random
from huggingface_hub import HfApi
import pandas as pd
from moviepy.editor import *
# Target sample rate in Hz, and the sample count of 30 seconds at that rate.
FS = 16000
MAX_SIZE = 30 * FS

# Deployment settings come from the environment (None when a variable is unset).
HF_TOKEN_DEMO = os.environ.get("HF_TOKEN_DEMO")
MODEL_REPO = os.environ.get("MODEL_REPO")
MODELNAME = os.environ.get("MODELNAME")
MODELNAME2 = os.environ.get("MODELNAME2")
username = os.environ.get("username")
password = os.environ.get("password")

# Hub client. The endpoint can be a Private Hub endpoint; the token is not
# persisted on the machine.
hf_api = HfApi(endpoint="https://huggingface.co", token=HF_TOKEN_DEMO)

# Download the file named by MODELNAME2 from MODEL_REPO and load it as a
# TorchScript module once, at import time.
modelfile = hf_api.hf_hub_download(MODEL_REPO, MODELNAME2)
MODEL = torch.jit.load(modelfile)
def reformat_freq(sr, y):
    """Convert raw audio to mono float32 samples at ``FS`` Hz.

    Args:
        sr: Sample rate of ``y`` in Hz.
        y: NumPy array of samples, either 1-D ``(samples,)`` or 2-D
            ``(samples, channels)``.

    Returns:
        A ``(sample_rate, samples)`` tuple in which ``samples`` is a 1-D
        float32 array and ``sample_rate`` is the rate those samples are
        actually at (``FS`` whenever resampling took place).
    """
    if y.ndim > 1:
        # Average the channels. Doing this for a single-channel
        # (samples, 1) array as well collapses it to 1-D, so the resampler
        # below works along the time axis and not the channel axis.
        y = y.mean(axis=1)
    # NOTE(review): integer PCM is cast without rescaling, so its amplitude
    # range is left as-is here -- confirm this is what the model expects.
    y = y.astype(np.float32)
    if sr != FS:
        y = librosa.resample(y, orig_sr=sr, target_sr=FS)
        # Report the rate of the data being returned, not of the input.
        sr = FS
    return sr, y
def preprocess_audio(audio):
    """Build the model input tensor from a ``(sample_rate, samples)`` pair.

    The pair is passed through ``reformat_freq``, the resulting samples are
    cut to at most ``MAX_SIZE`` entries, and the result is returned as a
    float32 tensor with a new leading dimension of size one.
    """
    _, samples = reformat_freq(*audio)
    clipped = samples[:MAX_SIZE]
    return torch.as_tensor(clipped, dtype=torch.float32).unsqueeze(0)
def postprocess_output(score):
    """Format a single-element score as a percentage string.

    ``score.item()`` is multiplied by 100, rounded to two decimals and
    rendered with exactly two decimal places followed by ``%``.
    """
    percentage = round(score.item() * 100, 2)
    return f"{percentage:.2f}%"
def process_youtube_address(youtube_address):
    """Download a YouTube video's audio track and convert it to a WAV file.

    The first audio-only MP4 stream is downloaded into ``tmp/`` and converted
    with ffmpeg to a mono WAV at ``FS`` Hz. The downloaded file is deleted
    afterwards whether or not the conversion succeeded.

    Args:
        youtube_address: URL of the YouTube video.

    Returns:
        Path of the converted WAV file, or ``None`` if the download or the
        conversion failed. This function does not delete the WAV file.
    """
    # Function-scope imports keep this block self-contained.
    import subprocess
    import uuid

    print("Downloading youtube audio from video...")
    audiomp4a = None
    wav_path = None
    try:
        selected_video = YouTube(youtube_address)
        audio = selected_video.streams.filter(
            only_audio=True, file_extension='mp4'
        ).first()
        if audio is None:
            raise RuntimeError("no audio-only mp4 stream available")
        # A UUID avoids the collisions a three-digit random number allows
        # when several requests are handled close together.
        token = uuid.uuid4().hex
        audioname = "audio-" + token + ".mp4a"
        audiomp4a = audio.download('tmp', audioname)
        # tmp/ exists at this point because the download was written there.
        wav_path = os.path.join("tmp", "audio-" + token + ".wav")
        # An argument list (no shell) keeps paths from being interpreted by
        # a shell, "-y" prevents ffmpeg from blocking on an overwrite
        # prompt, and check=True turns a failed conversion into an
        # exception instead of returning a path to a missing file.
        subprocess.run(
            ["ffmpeg", "-y", "-i", audiomp4a,
             "-ac", "1", "-ar", str(FS), wav_path],
            check=True,
        )
        audiowav = wav_path
    except Exception as inst:
        print("Exception: {}".format(inst))
        print("ERROR while downloading audio from " + youtube_address)
        # Do not leave a partially written WAV behind.
        if wav_path is not None and os.path.exists(wav_path):
            os.remove(wav_path)
        audiowav = None
    finally:
        # The intermediate download is never needed after this point.
        if audiomp4a is not None and os.path.exists(audiomp4a):
            os.remove(audiomp4a)
    return audiowav
def process_micro(micro):
    """Score a microphone recording and return the formatted result.

    ``micro`` is handed to ``preprocess_audio``; the first of the two values
    returned by ``MODEL`` is printed and formatted by ``postprocess_output``.
    """
    prediction, _ = MODEL(preprocess_audio(micro))
    print(prediction)
    return postprocess_output(prediction)
def process_file(file):
    """Score one audio file and return the formatted result.

    The file is loaded with librosa at ``FS`` Hz, preprocessed, and run
    through ``MODEL``; the first of the two model outputs is printed and
    formatted by ``postprocess_output``.
    """
    samples, rate = librosa.load(file, sr=FS)
    model_input = preprocess_audio((rate, samples))
    print("Running model")
    prediction, _ = MODEL(model_input)
    print(prediction)
    return postprocess_output(prediction)
def process_files(files):
    """Score a batch of uploaded audio files.

    Args:
        files: Iterable of uploaded files. Each item is either an object
            whose ``name`` attribute holds the file path or a plain path
            string. ``None`` is treated as an empty batch.

    Returns:
        A ``pandas.DataFrame`` with one row per file and the columns
        ``"File"`` (base name of the path) and ``"Probability of Real"``
        (the formatted score string).
    """
    fnames = []
    resout = []
    for f in files or []:
        path = getattr(f, "name", f)
        # process_file yields a single formatted string. The previous code
        # unpacked that string into two names, which raised ValueError for
        # every file.
        resout.append(process_file(path))
        fnames.append(os.path.basename(path))
    return pd.DataFrame({"File": fnames, "Probability of Real": resout})
def process_video(file):
    """Extract the audio track of a video file and score it.

    The audio is written to a temporary WAV file under ``tmp/``, scored with
    ``process_file``, and the temporary file is removed again even when
    scoring fails.

    Args:
        file: Path of the video file.

    Returns:
        The formatted score string from ``process_file``, or an explanatory
        message when the video has no audio track.
    """
    # Function-scope import keeps this block self-contained.
    import uuid

    video = VideoFileClip(file)
    try:
        audio = video.audio
        if audio is None:
            return "Could not get audio from the uploaded video"
        os.makedirs('tmp', exist_ok=True)
        # A UUID avoids collisions between requests handled close together.
        audiowav = os.path.join('tmp', "audio-" + uuid.uuid4().hex + ".wav")
        try:
            audio.write_audiofile(audiowav)
            return process_file(audiowav)
        finally:
            if os.path.exists(audiowav):
                os.remove(audiowav)
    finally:
        # Release the resources held by the clip.
        video.close()
def process_youtube(youtube_address):
    """Score the audio of a YouTube video.

    Args:
        youtube_address: URL of the YouTube video.

    Returns:
        The formatted score string from ``process_file``, or an error message
        when no audio could be obtained from the address.
    """
    audiofile = process_youtube_address(youtube_address)
    if audiofile is None:
        return "Could not get audio from {}".format(youtube_address)
    try:
        return process_file(audiofile)
    finally:
        # The WAV exists only for this request; delete it so repeated
        # requests do not accumulate files on disk.
        if os.path.exists(audiofile):
            os.remove(audiofile)
# Gradio user interface: one tab for scoring a single input and one tab for
# scoring a batch of files.
# NOTE(review): the nesting below was reconstructed; in particular the
# placement of the clear buttons inside the layout should be confirmed.
with gr.Blocks(title="Audio Fake Detector") as demo:
    with gr.Tab("Individual Processing"):
        gr.Markdown("# Welcome to Loccus.ai synthetic voice detection demo!")
        with gr.Row():
            # Input components.
            with gr.Column():
                m = gr.Audio(source="microphone", type="numpy",label="Micro")
                f = gr.Audio(source="upload", type="filepath", label="Audio file")
                y = gr.Textbox(label="Enter YouTube address here")
                # NOTE(review): scale is given as 0.5 here -- confirm a
                # non-integer value is intended.
                v = gr.Video(label="Enter a video", include_audio=True, scale=0.5)
            # Result display and clear button.
            with gr.Column():
                with gr.Row(equal_height=True):
                    text = gr.Textbox(label="Probability of Real Voice")
                #file= gr.Audio(source="upload", type="filepath", optional=True)
                button_clear = gr.ClearButton([m,f,y,v,text])
        # Each input writes its result into the same text box.
        m.stop_recording(process_micro, inputs=[m], outputs=text)
        f.upload(process_file,inputs=[f], outputs=text)
        y.submit(process_youtube, inputs=[y], outputs=text)
        v.upload(process_video, inputs=[v], outputs=[text])
    with gr.Tab("Batch Processing"):
        gr.Markdown("# Welcome to Loccus.ai synthetic voice detection demo!")
        with gr.Row():
            with gr.Column():
                # Rebinds f; the listeners of the first tab are already
                # registered on the previous component.
                f = gr.File(file_types=["audio"], label="Audio file", file_count="multiple")
            with gr.Column():
                with gr.Row(equal_height=True):
                    # Table filled from the DataFrame returned by process_files.
                    textbatch = gr.Dataframe(
                        headers=["File", "Probability of Real"],
                        datatype=["str", "str"],
                    )
                #text = gr.Textbox(label="Probability of Real Voice")
                #text2 = gr.Textbox(label="Amp Mean Score")
                button_clear = gr.ClearButton([f,textbatch])
        f.upload(process_files,inputs=[f], outputs=[textbatch])
        #btn = gr.Button("Run")
        #btn.click(fn=update, inputs=inp, outputs=out)
# Start the app with the login credentials read from the environment.
demo.launch(auth=(username,password))