Spaces:

Hiya-ai
/

loccus-audio-fake-detection

Running on CPU Upgrade

App Files Files Community

DavidLoccus committed on Jan 18, 2024

Commit

7ca009e

1 Parent(s): b706e43

Change output layout from score to figure.

Browse files

Files changed (1) hide show

app.py +70 -32

app.py CHANGED Viewed

@@ -10,9 +10,13 @@ import random
 from huggingface_hub import HfApi
 import pandas as pd
 from moviepy.editor import *
 FS=16000
-MAX_SIZE = FS * 30
 HF_TOKEN_DEMO=os.getenv("HF_TOKEN_DEMO")
 MODEL_REPO=os.getenv("MODEL_REPO")
@@ -107,48 +111,87 @@ def process_youtube_address(youtube_address):
     return audiowav
 def process_micro(micro):
     x=preprocess_audio(micro)
-    output = MODEL(x)
     print(output)
     result = postprocess_output(output)
-    return result
 def process_file(file):
     x,fs = librosa.load(file, sr=FS)
     x=preprocess_audio((fs,x))
     print("Running model")
-    output = MODEL(x)
     print(output)
     result = postprocess_output(output)
-    return result
-def process_files(files):
     resout=[]
-    res2out=[]
     fnames=[]
     for f in files:
         file=f.name
         x,fs = librosa.load(file, sr=FS)
         x=preprocess_audio((fs,x))
         print("Running model")
-        output = MODEL(x)
         print(output)
         result = postprocess_output(output)
         resout.append(result)
-        #res2out.append(res2)
         fnames.append(os.path.basename(file))
     resout = pd.DataFrame({"File":fnames, "Probability of Real": resout})
-    #return resout, res2out
     return resout
 def process_video(file):
@@ -182,14 +225,16 @@ def process_youtube(youtube_address):
 with gr.Blocks(title="Audio Fake Detector") as demo:
     with gr.Tab("Individual Processing"):
-        gr.Markdown("""# Welcome to Loccus' Authenticity Verification demo!
-        This is a showcase of our solution. It provides a probability of a voice being real or AI-generated. It is designed for the following context:
-        * To detect voice clones
-        * Focus on English and Spanish languages
-        * For short audio samples (3 to 10 seconds)
-        * For audio from digital channels (at 16 kHz or more)
-        Please test it accordingly. Variations of the above (e.g. a 1 minute audio file of an off-the-shelf TTS voice in Japanese) can compromise the accuracy and performance of the solution. We keep improving the solution adding new features every week.""")
@@ -201,21 +246,21 @@ with gr.Blocks(title="Audio Fake Detector") as demo:
                 #y = gr.Textbox(label="Enter YouTube address here")
                 #v = gr.Video(label="Enter a video", include_audio=True, scale=0.5)
-            with gr.Column():
                 with gr.Row(equal_height=True):
-                    text = gr.Textbox(label="Probability of Real Voice")
         #file= gr.Audio(source="upload", type="filepath", optional=True)
         #button_clear = gr.ClearButton([m,f,y,v,text])
-        button_clear = gr.ClearButton([m,f,text])
-        m.stop_recording(process_micro, inputs=[m], outputs=text)
-        f.upload(process_file,inputs=[f], outputs=text)
         #y.submit(process_youtube, inputs=[y], outputs=text)
         #v.upload(process_video, inputs=[v], outputs=[text])
     with gr.Tab("Batch Processing"):
-        gr.Markdown("# Welcome to Loccus' Authenticity Verification demo!")
         with gr.Row():
             with gr.Column():
@@ -229,19 +274,12 @@ with gr.Blocks(title="Audio Fake Detector") as demo:
                         headers=["File", "Probability of Real"],
                         datatype=["str", "str"],
                     )
-                    #text = gr.Textbox(label="Probability of Real Voice")
-                    #text2 = gr.Textbox(label="Amp Mean Score")
         button_clear = gr.ClearButton([f,textbatch])
         f.upload(process_files,inputs=[f], outputs=[textbatch])
-    #btn = gr.Button("Run")
-    #btn.click(fn=update, inputs=inp, outputs=out)
 demo.launch(auth=[(username,password),(username0,password0),(username1,password1),(username2,password2),(username3,password3),(username4,password4),(username5,password5),(username6,password6),(username7,password7),(username8,password8),(username9,password9),(username10,password10), \
                   (username11,password11),(username12,password12),(username13,password13),(username14,password14)])

 from huggingface_hub import HfApi
 import pandas as pd
 from moviepy.editor import *
+import matplotlib.pyplot as plt
 FS=16000
+MAX_SIZE = FS * 60
+CHUNK_SIZE = 4
+N = CHUNK_SIZE * FS
 HF_TOKEN_DEMO=os.getenv("HF_TOKEN_DEMO")
 MODEL_REPO=os.getenv("MODEL_REPO")
     return audiowav
+def create_chunk_plot(x,ini, end, scores, lvec, scr):
+    x=x.squeeze()
+    T=x.size(0)
+    t = np.array(list(range(T))) / FS
+    result=[np.nan for _ in range(ini)]
+    for s,l in zip(scores.tolist(),lvec.tolist()):
+        resi=[100*s for _ in range(int(l))]
+        result.extend(resi)
+    reslast=[np.nan for _ in range(T-end)]
+    result.extend(reslast)
+    assert len(result)==T, f"Length result: {len(result)} - Length audio {T}"
+    assert len(t)==T, f"Length time: {len(result)} - Length audio {T}"
+    x=x-torch.min(x)
+    x=x/torch.max(x)*100
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.plot(t, x, alpha=0.3)
+    ax.plot(t,result,color = 'tab:red')
+    ax.set_ylabel('Probability of Real')
+    ax.set_xlabel('Time (s)')
+    ax.set_title(f"Prob. of real audio = {scr}")
+    yticks=np.arange(11)*10
+    ax.set_yticks(yticks)
+    return fig
 def process_micro(micro):
+    # UI callback registered on m.stop_recording: preprocess the recording,
+    # run MODEL, and return the figure built by create_chunk_plot.
+    print("Micro processing")
     x=preprocess_audio(micro)
+    print("Running model")
+    # MODEL returns five values; below they are passed on as the overall
+    # output, the per-chunk scores, the chunk lengths and the ini/end bounds
+    # expected by create_chunk_plot.
+    output, output_arr, lvec, ls, ts = MODEL(x)
     print(output)
     result = postprocess_output(output)
+    fig = create_chunk_plot(x, ls, ts, output_arr, lvec, result)
+    return fig
 def process_file(file):
+    # UI callback registered on f.upload (individual tab): load the file with
+    # librosa at FS Hz, run MODEL, and return the figure built by
+    # create_chunk_plot.
+    print("File processing")
     x,fs = librosa.load(file, sr=FS)
     # preprocess_audio takes a (sample_rate, samples) tuple here.
     x=preprocess_audio((fs,x))
     print("Running model")
+    # The five MODEL outputs map onto create_chunk_plot's arguments:
+    # output_arr -> scores, lvec -> lvec, ls -> ini, ts -> end.
+    output, output_arr, lvec, ls, ts = MODEL(x)
     print(output)
     result = postprocess_output(output)
+    fig = create_chunk_plot(x, ls, ts, output_arr, lvec, result)
+    return fig
+def process_files(files):
+    # Batch callback: score every uploaded file and return one table row per
+    # file. Each element of `files` must expose its path as `.name`.
+    print("Batch processing")
     resout=[]
     fnames=[]
     for f in files:
         file=f.name
         x,fs = librosa.load(file, sr=FS)
         x=preprocess_audio((fs,x))
         print("Running model")
+        # Only the first MODEL output is used in batch mode; the per-chunk
+        # outputs are discarded because no figure is drawn here.
+        output, _, _, _, _ = MODEL(x)
         print(output)
         result = postprocess_output(output)
         resout.append(result)
         # Report the bare file name rather than the full upload path.
         fnames.append(os.path.basename(file))
     # Column names match the headers declared for the batch results table.
     resout = pd.DataFrame({"File":fnames, "Probability of Real": resout})
     return resout
 def process_video(file):
 with gr.Blocks(title="Audio Fake Detector") as demo:
     with gr.Tab("Individual Processing"):
+        gr.Markdown("""# [Loccus.ai](http://www.loccus.ai) - AI Voice detection demo
+        This is a demo of our Authenticity Verification solution, aimed at detecting if a voice is real or not.
+        * Input - audio file in any format
+        * Output - probability of that voice being real or AI-generated (1.0 - Real / 0.0 AI-generated)
+        There are two testing modes:
+        * Individual processing - for single files. You will see a time-based view and scores for each 4-second chunk. Best for single long files.
+        * Batch processing - for a batch of files. You will see a single overall score per file. Best to assess multiple short files.
+        Only the first minute of audio is analyzed.""")
                 #y = gr.Textbox(label="Enter YouTube address here")
                 #v = gr.Video(label="Enter a video", include_audio=True, scale=0.5)
+            with gr.Column(scale=2):
                 with gr.Row(equal_height=True):
+                    img = gr.Plot(show_label=False)
         #file= gr.Audio(source="upload", type="filepath", optional=True)
         #button_clear = gr.ClearButton([m,f,y,v,text])
+        button_clear = gr.ClearButton([m,f,img])
+        m.stop_recording(process_micro, inputs=[m], outputs=img)
+        f.upload(process_file,inputs=[f], outputs=img)
         #y.submit(process_youtube, inputs=[y], outputs=text)
         #v.upload(process_video, inputs=[v], outputs=[text])
     with gr.Tab("Batch Processing"):
+        gr.Markdown("# [Loccus.ai](http://www.loccus.ai) - AI Voice detection demo")
         with gr.Row():
             with gr.Column():
                         headers=["File", "Probability of Real"],
                         datatype=["str", "str"],
                     )
         button_clear = gr.ClearButton([f,textbatch])
         f.upload(process_files,inputs=[f], outputs=[textbatch])
 demo.launch(auth=[(username,password),(username0,password0),(username1,password1),(username2,password2),(username3,password3),(username4,password4),(username5,password5),(username6,password6),(username7,password7),(username8,password8),(username9,password9),(username10,password10), \
                   (username11,password11),(username12,password12),(username13,password13),(username14,password14)])