import gradio as gr
import torch
import soundfile as sf
import io
import librosa
import numpy as np
from pytube import YouTube
import os
import random
from huggingface_hub import HfApi
import pandas as pd
from moviepy.editor import VideoFileClip
import matplotlib.pyplot as plt
FS = 16000            # model sample rate (Hz)
MAX_SIZE = FS * 180   # analyze at most the first 3 minutes of audio
CHUNK_SIZE = 4        # scoring window, in seconds
N = CHUNK_SIZE * FS   # samples per chunk
HF_TOKEN_DEMO = os.getenv("HF_TOKEN_DEMO")
MODEL_REPO = os.getenv("MODEL_REPO")
MODELNAME = os.getenv("MODELNAME")
username = os.getenv("username")
password = os.getenv("password")
username0 = os.getenv("username0")
password0 = os.getenv("password0")
username9 = os.getenv("username9")
password9 = os.getenv("password9")
username12 = os.getenv("username12")
password12 = os.getenv("password12")
username17 = os.getenv("username17")
password17 = os.getenv("password17")
hf_api = HfApi(
    endpoint="https://huggingface.co",  # can be a Private Hub endpoint
    token=HF_TOKEN_DEMO,  # token is not persisted on the machine
)
modelfile = hf_api.hf_hub_download(MODEL_REPO, MODELNAME)
MODEL = torch.jit.load(modelfile)
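# The TorchScript model's interface is not documented here. From the call sites
# below, it appears to take a (1, T) float32 waveform at 16 kHz and return five
# values: the overall probability of real, per-chunk scores, per-chunk lengths
# in samples, and the start/end sample indices of the scored region. A minimal
# call sketch under that assumption:
#
#   x = torch.zeros(1, FS * 8)                   # 8 s of silence at 16 kHz
#   output, output_arr, lvec, ls, ts = MODEL(x)
#   print(postprocess_output(output))            # score formatted as a percentage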
def reformat_freq(sr, y):
    # Mix down to mono if needed
    if len(y.shape) == 1 or y.shape[1] == 1:
        pass  # already mono
    else:
        # Average the two channels
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Resample to the model rate if needed
    if sr != FS:
        y = librosa.resample(y, orig_sr=sr, target_sr=FS)
        sr = FS
    return sr, y
def preprocess_audio(audio):
    _, y = reformat_freq(*audio)
    y = y[:MAX_SIZE]  # keep only the first 3 minutes
    y = torch.as_tensor(y, dtype=torch.float32)
    y = torch.unsqueeze(y, 0)  # add batch dimension -> (1, T)
    return y
def postprocess_output(score):
    out = score.item()
    out = round(100 * out, 2)
    return "{:.2f}%".format(out)
def process_youtube_address(youtube_address):
    print("Downloading youtube audio from video...")
    try:
        selected_video = YouTube(youtube_address)
        audio = selected_video.streams.filter(only_audio=True, file_extension='mp4').first()
        nrand = round(random.random() * 1000)
        audioname = "audio-" + str(nrand) + ".mp4a"
        audiowav = "audio-" + str(nrand) + ".wav"
        audiomp4a = audio.download('tmp', audioname)
        # Convert to mono 16 kHz WAV, then delete the download
        os.system("ffmpeg -i " + audiomp4a + " -ac 1 -ar {} ".format(FS) + audiowav + "; rm tmp/" + audioname)
    except Exception as inst:
        print("Exception: {}".format(inst))
        print("ERROR while downloading audio from " + youtube_address)
        audiowav = None
    return audiowav
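# The shell command above assumes ffmpeg is on PATH and that the file paths
# contain no spaces. A more robust variant (a sketch, not what the demo runs):
#
#   import subprocess
#   subprocess.run(["ffmpeg", "-i", audiomp4a, "-ac", "1", "-ar", str(FS), audiowav], check=True)
#   os.remove(os.path.join("tmp", audioname))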
def create_chunk_plot(x, ini, end, scores, lvec, scr):
    x = x.squeeze()
    T = x.size(0)
    t = np.arange(T) / FS
    # Per-sample score trace: NaN before the first scored chunk, each chunk's
    # score repeated over its length in samples, NaN after the last chunk
    result = [np.nan for _ in range(ini)]
    for s, l in zip(scores.tolist(), lvec.tolist()):
        resi = [100 * s for _ in range(int(l))]
        result.extend(resi)
    reslast = [np.nan for _ in range(T - end)]
    result.extend(reslast)
    assert len(result) == T, f"Length result: {len(result)} - Length audio {T}"
    assert len(t) == T, f"Length time: {len(t)} - Length audio {T}"
    # Normalize the waveform to [0, 100] so it shares the score axis
    x = x - torch.min(x)
    x = x / torch.max(x) * 100
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(t, x, alpha=0.3)
    ax.plot(t, result, color='tab:red')
    ax.set_ylabel('Probability of Real')
    ax.set_xlabel('Time (s)')
    ax.set_title(f"Prob. of real audio = {scr}")
    yticks = np.arange(11) * 10
    ax.set_yticks(yticks)
    return fig
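# Illustrative example (shapes assumed, not taken from the model): a 10 s clip
# scored in two full 4 s chunks with ls=0, ts=128000, scores=[0.9, 0.2] and
# lvec=[64000, 64000] yields a step trace at 90 then 20 over the first 8 s,
# with NaN (no line drawn) over the trailing, unscored 2 s.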
def process_micro(micro):
    print("Micro processing")
    x = preprocess_audio(micro)
    print("Running model")
    output, output_arr, lvec, ls, ts = MODEL(x)
    print(output)
    result = postprocess_output(output)
    fig = create_chunk_plot(x, ls, ts, output_arr, lvec, result)
    return fig
def process_file(file):
    print("File processing")
    x, fs = librosa.load(file, sr=FS)
    x = preprocess_audio((fs, x))
    print("Running model")
    output, output_arr, lvec, ls, ts = MODEL(x)
    print(output)
    result = postprocess_output(output)
    fig = create_chunk_plot(x, ls, ts, output_arr, lvec, result)
    return fig
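# Minimal offline usage sketch (hypothetical file name):
#
#   fig = process_file("sample.wav")
#   fig.savefig("scores.png")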
def process_files(files):
    print("Batch processing")
    resout = []
    fnames = []
    for f in files:
        file = f.name
        x, fs = librosa.load(file, sr=FS)
        x = preprocess_audio((fs, x))
        print("Running model")
        output, _, _, _, _ = MODEL(x)
        print(output)
        result = postprocess_output(output)
        resout.append(result)
        fnames.append(os.path.basename(file))
    resout = pd.DataFrame({"File": fnames, "Probability of Real": resout})
    return resout
def process_video(file):
    video = VideoFileClip(file)
    audio = video.audio
    if not os.path.isdir('tmp'):
        os.makedirs('tmp')
    nrand = round(random.random() * 1000)
    audiowav = "tmp/audio-" + str(nrand) + ".wav"
    audio.write_audiofile(audiowav)  # extract the audio track to WAV
    result = process_file(audiowav)
    os.remove(audiowav)
    return result
def process_youtube(youtube_address):
    audiofile = process_youtube_address(youtube_address)
    if audiofile is not None:
        result = process_file(audiofile)
        return result
    else:
        return "Could not get audio from {}".format(youtube_address)
with gr.Blocks(title="Audio Fake Detector") as demo:
    with gr.Tab("Individual Processing"):
        gr.Markdown("""# [Hiya](https://www.hiya.com/products/ai-voice) - AI Voice detection demo

This is a demo of our Authenticity Verification solution, aimed at detecting whether a voice is real or AI-generated.

* Input - an audio file in any format
* Output - the probability that the voice is real (1.0 = real / 0.0 = AI-generated)

There are two testing modes:

* Individual processing - for single files. You will see a time-based view with a score for each 4-second chunk. Best for single long files.
* Batch processing - for a batch of files. You will see a single overall score per file. Best for assessing multiple short files.

Only the first 3 minutes of audio are analyzed.""")
        with gr.Row():
            with gr.Column():
                m = gr.Audio(sources=["microphone"], type="numpy", label="Micro")
                f = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
                # y = gr.Textbox(label="Enter YouTube address here")
                # v = gr.Video(label="Enter a video", include_audio=True, scale=0.5)
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    img = gr.Plot(show_label=False)
        # file = gr.Audio(source="upload", type="filepath", optional=True)
        # button_clear = gr.ClearButton([m, f, y, v, text])
        button_clear = gr.ClearButton([m, f, img])
        m.stop_recording(process_micro, inputs=[m], outputs=img)
        f.upload(process_file, inputs=[f], outputs=img)
        # y.submit(process_youtube, inputs=[y], outputs=text)
        # v.upload(process_video, inputs=[v], outputs=[text])
    with gr.Tab("Batch Processing"):
        gr.Markdown("# [Hiya](https://www.hiya.com/products/ai-voice) - AI Voice detection demo")
        with gr.Row():
            with gr.Column():
                f = gr.File(file_types=["audio"], label="Audio file", file_count="multiple")
            with gr.Column():
                with gr.Row(equal_height=True):
                    textbatch = gr.Dataframe(
                        headers=["File", "Probability of Real"],
                        datatype=["str", "str"],
                    )
        button_clear = gr.ClearButton([f, textbatch])
        f.upload(process_files, inputs=[f], outputs=[textbatch])

demo.launch(auth=[(username, password), (username0, password0), (username9, password9),
                  (username12, password12), (username17, password17)])