diff --git a/audio_detection/__init__.py b/audio_detection/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_detection/audio_infer/__init__.py b/audio_detection/audio_infer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc b/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0be763e4f05595b0b8fc1819a5ce5d665e6a7e6d
Binary files /dev/null and b/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv
new file mode 100644
index 0000000000000000000000000000000000000000..48d8522774b0127d4b585c18fb7da54a9fcbc248
--- /dev/null
+++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv
@@ -0,0 +1,1350 @@
+-JMT0mK0Dbg_30.000_40.000.wav 30.000 40.000 Train horn
+3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train horn
+3S2-TODd__k_90.000_100.000.wav 90.000 100.000 Train horn
+3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train horn
+3jXAh3V2FO8_30.000_40.000.wav 30.000 40.000 Train horn
+53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train horn
+8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train horn
+8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train horn
+9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train horn
+AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train horn
+Ag_zT74ZGNc_9.000_19.000.wav 9.000 19.000 Train horn
+BQpa8whzwAE_30.000_40.000.wav 30.000 40.000 Train horn
+CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train horn
+CLIdVCUO_Vw_30.000_40.000.wav 30.000 40.000 Train horn
+D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train horn
+GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train horn
+I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train horn
+IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train horn
+L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train horn
+LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train horn
+MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train horn
+MPSf7dJpV5w_30.000_40.000.wav 30.000 40.000 Train horn
+NdCr5IDnkxc_30.000_40.000.wav 30.000 40.000 Train horn
+P54KKbTA_TE_0.000_7.000.wav 0.000 7.000 Train horn
+PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train horn
+QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train horn
+R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train horn
+Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train horn
+TBjrN1aMRrM_30.000_40.000.wav 30.000 40.000 Train horn
+XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train horn
+XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train horn
+Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train horn
+Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train horn
+YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Train horn
+ZcTI8fQgEZE_240.000_250.000.wav 240.000 250.000 Train horn
+_8MvhMlbwiE_40.000_50.000.wav 40.000 50.000 Train horn
+_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train horn
+aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train horn
+arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train horn
+d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train horn
+dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train horn
+ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Train horn
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train horn
+g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train horn
+gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train horn
+hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train horn
+iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train horn
+k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train horn
+lKQ-I_P7TEM_20.000_30.000.wav 20.000 30.000 Train horn
+nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train horn
+pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train horn
+pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train horn
+q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train horn
+qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train horn
+stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train horn
+tdRMxc4UWRk_30.000_40.000.wav 30.000 40.000 Train horn
+tu-cxDG2mW8_0.000_10.000.wav 0.000 10.000 Train horn
+txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train horn
+xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train horn
+yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train horn
+-WoudI3gGvk_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+0_gci63CtFY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+3NX4HaOVBoo_240.000_250.000.wav 240.000 250.000 Air horn, truck horn
+9NPKQDaNCRk_0.000_6.000.wav 0.000 6.000 Air horn, truck horn
+9ct4w4aYWdc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+9l9QXgsJSfo_120.000_130.000.wav 120.000 130.000 Air horn, truck horn
+CN0Bi4MDpA4_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+Cg-DWc9nPfQ_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
+D62L3husEa0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+Hk7HqLBHWng_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+IpyingiCwV8_0.000_3.000.wav 0.000 3.000 Air horn, truck horn
+Isuh9pOuH6I_300.000_310.000.wav 300.000 310.000 Air horn, truck horn
+IuTfMfzkr5Y_120.000_130.000.wav 120.000 130.000 Air horn, truck horn
+MFxsgcZZtFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+N3osL4QmOL8_49.000_59.000.wav 49.000 59.000 Air horn, truck horn
+NOZsDTFLm7M_0.000_9.000.wav 0.000 9.000 Air horn, truck horn
+OjVY3oM1jEU_40.000_50.000.wav 40.000 50.000 Air horn, truck horn
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
+TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+UdHR1P_NIbo_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
+YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
+Yt4ZWNjvJOY_50.000_60.000.wav 50.000 60.000 Air horn, truck horn
+Z5M3fGT3Xjk_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
+ZauRsP1uH74_12.000_22.000.wav 12.000 22.000 Air horn, truck horn
+a_6CZ2JaEuc_0.000_2.000.wav 0.000 2.000 Air horn, truck horn
+b7m5Kt5U7Vc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+bIObkrK06rk_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
+cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
+ckSYn557ZyE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+cs-RPPsg_ks_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+ctsq33oUBT8_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+eCFUwyU9ZWA_9.000_19.000.wav 9.000 19.000 Air horn, truck horn
+ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Air horn, truck horn
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+gjlo4evwjlE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Air horn, truck horn
+ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+jko48cNdvFA_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+kUrb38hMwPs_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+km_hVyma2vo_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+m1e9aOwRiDQ_0.000_9.000.wav 0.000 9.000 Air horn, truck horn
+mQJcObz1k_E_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+pk75WDyNZKc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+suuYwAifIAQ_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+wDdEZ46B-tM_460.000_470.000.wav 460.000 470.000 Air horn, truck horn
+wHISHmuP58s_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
+xwqIKDz1bT4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+y4Ko6VNiqB0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+yhcmPrU3QSk_61.000_71.000.wav 61.000 71.000 Air horn, truck horn
+3FWHjjZGT9U_80.000_90.000.wav 80.000 90.000 Car alarm
+3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car alarm
+3YRkin3bMlQ_170.000_180.000.wav 170.000 180.000 Car alarm
+4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Car alarm
+4JDah6Ckr9k_5.000_15.000.wav 5.000 15.000 Car alarm
+5hL1uGb4sas_30.000_40.000.wav 30.000 40.000 Car alarm
+969Zfj4IoPk_20.000_30.000.wav 20.000 30.000 Car alarm
+AyfuBDN3Vdw_40.000_50.000.wav 40.000 50.000 Car alarm
+B-ZqhRg3km4_60.000_70.000.wav 60.000 70.000 Car alarm
+BDnwA3AaclE_10.000_20.000.wav 10.000 20.000 Car alarm
+ES-rjFfuxq4_120.000_130.000.wav 120.000 130.000 Car alarm
+EWbZq5ruCpg_0.000_10.000.wav 0.000 10.000 Car alarm
+F50h9HiyC3k_40.000_50.000.wav 40.000 50.000 Car alarm
+F5AP8kQvogM_30.000_40.000.wav 30.000 40.000 Car alarm
+FKJuDOAumSk_20.000_30.000.wav 20.000 30.000 Car alarm
+GmbNjZi4xBw_30.000_40.000.wav 30.000 40.000 Car alarm
+H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Car alarm
+Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car alarm
+IziTYkSwq9Q_30.000_40.000.wav 30.000 40.000 Car alarm
+JcO2TTtiplA_30.000_40.000.wav 30.000 40.000 Car alarm
+KKx7dWRg8s8_8.000_18.000.wav 8.000 18.000 Car alarm
+Kf9Kr69mwOA_14.000_24.000.wav 14.000 24.000 Car alarm
+L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car alarm
+LOjT44tFx1A_0.000_10.000.wav 0.000 10.000 Car alarm
+Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car alarm
+Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Car alarm
+QNKo1W1WRbc_22.000_32.000.wav 22.000 32.000 Car alarm
+R0VxYDfjyAU_60.000_70.000.wav 60.000 70.000 Car alarm
+TJ58vMpSy1w_30.000_40.000.wav 30.000 40.000 Car alarm
+ToU1kRagUjY_0.000_10.000.wav 0.000 10.000 Car alarm
+TrQGIZqrW0s_30.000_40.000.wav 30.000 40.000 Car alarm
+ULFhHR0OLSE_30.000_40.000.wav 30.000 40.000 Car alarm
+ULS3ffQkCW4_30.000_40.000.wav 30.000 40.000 Car alarm
+U_9NuNORYQM_1.000_11.000.wav 1.000 11.000 Car alarm
+UkCEuwYUW8c_110.000_120.000.wav 110.000 120.000 Car alarm
+Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Car alarm
+XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car alarm
+Y-4dtrP-RNo_7.000_17.000.wav 7.000 17.000 Car alarm
+Zltlj0fDeS4_30.000_40.000.wav 30.000 40.000 Car alarm
+cB1jkzgH2es_150.000_160.000.wav 150.000 160.000 Car alarm
+eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Car alarm
+eL7s5CoW0UA_0.000_7.000.wav 0.000 7.000 Car alarm
+i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Car alarm
+iWl-5LNURFc_30.000_40.000.wav 30.000 40.000 Car alarm
+iX34nDCq9NU_10.000_20.000.wav 10.000 20.000 Car alarm
+ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car alarm
+l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car alarm
+lhedRVb85Fk_30.000_40.000.wav 30.000 40.000 Car alarm
+monelE7hnwI_20.000_30.000.wav 20.000 30.000 Car alarm
+o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car alarm
+pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car alarm
+stnVta2ip9g_30.000_40.000.wav 30.000 40.000 Car alarm
+uvuVg9Cl0n0_30.000_40.000.wav 30.000 40.000 Car alarm
+vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car alarm
+vN7dJyt-nj0_20.000_30.000.wav 20.000 30.000 Car alarm
+w8Md65mE5Vc_30.000_40.000.wav 30.000 40.000 Car alarm
+ySqfMcFk5LM_30.000_40.000.wav 30.000 40.000 Car alarm
+ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Car alarm
+za8KPcQ0dTw_30.000_40.000.wav 30.000 40.000 Car alarm
+-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-oSzD8P2BtU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-pzwalZ0ub0_5.000_15.000.wav 5.000 15.000 Reversing beeps
+-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
+10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+1n_s2Gb5R1Q_30.000_40.000.wav 30.000 40.000 Reversing beeps
+2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+2WTk_j_fivY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+38F6eeIR-s0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4MIHbR4QZhE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4XMY2IvVSf0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4t1VqRz4w2g_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4tKvAMmAUMM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+5-x2pk3YYAs_11.000_21.000.wav 11.000 21.000 Reversing beeps
+5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Reversing beeps
+5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Reversing beeps
+5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Reversing beeps
+7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Reversing beeps
+7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+96a4smrM_30_30.000_40.000.wav 30.000 40.000 Reversing beeps
+9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+9OcAwC8y-eQ_30.000_40.000.wav 30.000 40.000 Reversing beeps
+9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Reversing beeps
+9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Reversing beeps
+A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+AFwmMFq_xlc_390.000_400.000.wav 390.000 400.000 Reversing beeps
+AvhBRiwWJU4_30.000_40.000.wav 30.000 40.000 Reversing beeps
+CL5vkiMs2c0_10.000_20.000.wav 10.000 20.000 Reversing beeps
+DcU6AzN7imA_210.000_220.000.wav 210.000 220.000 Reversing beeps
+ISBJKY8hwnM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Reversing beeps
+NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Reversing beeps
+PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Reversing beeps
+Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Reversing beeps
+_gG0KNGD47M_30.000_40.000.wav 30.000 40.000 Reversing beeps
+ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Reversing beeps
+kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+mCJ0aqIygWE_24.000_34.000.wav 24.000 34.000 Reversing beeps
+nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Reversing beeps
+nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Reversing beeps
+saPU2JNoytU_0.000_10.000.wav 0.000 10.000 Reversing beeps
+tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Reversing beeps
+vzP6soELj2Q_0.000_10.000.wav 0.000 10.000 Reversing beeps
+0x82_HySIVU_30.000_40.000.wav 30.000 40.000 Bicycle
+1IQdvfm9SDY_30.000_40.000.wav 30.000 40.000 Bicycle
+1_hGvbEiYAs_30.000_40.000.wav 30.000 40.000 Bicycle
+26CM8IXODG4_2.000_12.000.wav 2.000 12.000 Bicycle
+2f7Ad-XpbnY_30.000_40.000.wav 30.000 40.000 Bicycle
+3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Bicycle
+7KiTXYwaD04_7.000_17.000.wav 7.000 17.000 Bicycle
+7gkjn-LLInI_30.000_40.000.wav 30.000 40.000 Bicycle
+84flVacRHUI_21.000_31.000.wav 21.000 31.000 Bicycle
+9VziOIkNXsE_30.000_40.000.wav 30.000 40.000 Bicycle
+ANofTuuN0W0_160.000_170.000.wav 160.000 170.000 Bicycle
+B6n0op0sLPA_30.000_40.000.wav 30.000 40.000 Bicycle
+D4_zTwsCRds_60.000_70.000.wav 60.000 70.000 Bicycle
+DEs_Sp9S1Nw_30.000_40.000.wav 30.000 40.000 Bicycle
+GjsxrMRRdfQ_3.000_13.000.wav 3.000 13.000 Bicycle
+GkpUU3VX4wQ_30.000_40.000.wav 30.000 40.000 Bicycle
+H9HNXYxRmv8_30.000_40.000.wav 30.000 40.000 Bicycle
+HPWRKwrs-rY_370.000_380.000.wav 370.000 380.000 Bicycle
+HrQxbNO5jXU_6.000_16.000.wav 6.000 16.000 Bicycle
+IYaEZkAO0LU_30.000_40.000.wav 30.000 40.000 Bicycle
+Idzfy0XbZRo_7.000_17.000.wav 7.000 17.000 Bicycle
+Iigfz_GeXVs_30.000_40.000.wav 30.000 40.000 Bicycle
+JWCtQ_94YoQ_30.000_40.000.wav 30.000 40.000 Bicycle
+JXmBrD4b4EI_30.000_40.000.wav 30.000 40.000 Bicycle
+LSZPNwZex9s_30.000_40.000.wav 30.000 40.000 Bicycle
+M5kwg1kx4q0_30.000_40.000.wav 30.000 40.000 Bicycle
+NrR1wmCpqAk_12.000_22.000.wav 12.000 22.000 Bicycle
+O1_Rw2dHb1I_2.000_12.000.wav 2.000 12.000 Bicycle
+OEN0TySl1Jw_10.000_20.000.wav 10.000 20.000 Bicycle
+PF7uY9ydMYc_30.000_40.000.wav 30.000 40.000 Bicycle
+SDl0tWf9Q44_30.000_40.000.wav 30.000 40.000 Bicycle
+SkXXjcw9sJI_30.000_40.000.wav 30.000 40.000 Bicycle
+Ssa1m5Mnllw_0.000_9.000.wav 0.000 9.000 Bicycle
+UB-A1oyNyyg_0.000_6.000.wav 0.000 6.000 Bicycle
+UqyvFyQthHo_30.000_40.000.wav 30.000 40.000 Bicycle
+Wg4ik5zZxBc_250.000_260.000.wav 250.000 260.000 Bicycle
+WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bicycle
+YIJBuXUi64U_30.000_40.000.wav 30.000 40.000 Bicycle
+aBHdl_TiseI_30.000_40.000.wav 30.000 40.000 Bicycle
+aeHCq6fFkNo_30.000_40.000.wav 30.000 40.000 Bicycle
+amKDjVcs1Vg_30.000_40.000.wav 30.000 40.000 Bicycle
+ehYwty_G2L4_13.000_23.000.wav 13.000 23.000 Bicycle
+jOlVJv7jAHg_30.000_40.000.wav 30.000 40.000 Bicycle
+lGFDQ-ZwUfk_30.000_40.000.wav 30.000 40.000 Bicycle
+lmTHvLGQy3g_50.000_60.000.wav 50.000 60.000 Bicycle
+nNHW3Uxlb-g_30.000_40.000.wav 30.000 40.000 Bicycle
+o98R4ruf8kw_30.000_40.000.wav 30.000 40.000 Bicycle
+oiLHBkHgkAo_0.000_8.000.wav 0.000 8.000 Bicycle
+qL0ESQcaPhM_30.000_40.000.wav 30.000 40.000 Bicycle
+qjz5t9M4YCw_30.000_40.000.wav 30.000 40.000 Bicycle
+qrCWPsqG9vA_30.000_40.000.wav 30.000 40.000 Bicycle
+r06tmeUDgc8_3.000_13.000.wav 3.000 13.000 Bicycle
+sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Bicycle
+tKdRlWz-1pg_30.000_40.000.wav 30.000 40.000 Bicycle
+uNpSMpqlkMA_0.000_10.000.wav 0.000 10.000 Bicycle
+vOYj9W7Jsxk_8.000_18.000.wav 8.000 18.000 Bicycle
+xBKrmKdjAIA_0.000_10.000.wav 0.000 10.000 Bicycle
+xfNeZaw4o3U_17.000_27.000.wav 17.000 27.000 Bicycle
+xgiJqbhhU3c_30.000_40.000.wav 30.000 40.000 Bicycle
+0vg9qxNKXOw_30.000_40.000.wav 30.000 40.000 Skateboard
+10YXuv9Go0E_140.000_150.000.wav 140.000 150.000 Skateboard
+3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Skateboard
+6kXUG1Zo6VA_0.000_10.000.wav 0.000 10.000 Skateboard
+84fDGWoRtsU_210.000_220.000.wav 210.000 220.000 Skateboard
+8kbHA22EWd0_330.000_340.000.wav 330.000 340.000 Skateboard
+8m-a_6wLTkU_230.000_240.000.wav 230.000 240.000 Skateboard
+9QwaP-cvdeU_360.000_370.000.wav 360.000 370.000 Skateboard
+9ZYj5toEbGA_0.000_10.000.wav 0.000 10.000 Skateboard
+9gkppwB5CXA_30.000_40.000.wav 30.000 40.000 Skateboard
+9hlXgXWXYXQ_0.000_6.000.wav 0.000 6.000 Skateboard
+ALxn5-2bVyI_30.000_40.000.wav 30.000 40.000 Skateboard
+ANPjV_rudog_30.000_40.000.wav 30.000 40.000 Skateboard
+ATAL-_Dblvg_0.000_7.000.wav 0.000 7.000 Skateboard
+An-4jPvUT14_60.000_70.000.wav 60.000 70.000 Skateboard
+BGR0QnX4k6w_30.000_40.000.wav 30.000 40.000 Skateboard
+BlhUt8AJJO8_30.000_40.000.wav 30.000 40.000 Skateboard
+CD7INyI79fM_170.000_180.000.wav 170.000 180.000 Skateboard
+CNcxzB9F-Q8_100.000_110.000.wav 100.000 110.000 Skateboard
+DqOGYyFVnKk_200.000_210.000.wav 200.000 210.000 Skateboard
+E0gBwPTHxqE_30.000_40.000.wav 30.000 40.000 Skateboard
+E3XIdP8kxwg_110.000_120.000.wav 110.000 120.000 Skateboard
+FQZnQhiM41U_0.000_6.000.wav 0.000 6.000 Skateboard
+FRwFfq3Tl1g_310.000_320.000.wav 310.000 320.000 Skateboard
+JJo971B_eDg_30.000_40.000.wav 30.000 40.000 Skateboard
+KXkxqxoCylc_30.000_40.000.wav 30.000 40.000 Skateboard
+L4Z7XkS6CtA_30.000_40.000.wav 30.000 40.000 Skateboard
+LjEqr0Z7xm0_0.000_6.000.wav 0.000 6.000 Skateboard
+MAbDEeLF4cQ_30.000_40.000.wav 30.000 40.000 Skateboard
+MUBbiivNYZs_30.000_40.000.wav 30.000 40.000 Skateboard
+Nq8GyBrTI8Y_30.000_40.000.wav 30.000 40.000 Skateboard
+PPq9QZmV7jc_25.000_35.000.wav 25.000 35.000 Skateboard
+PVgL5wFOKMs_30.000_40.000.wav 30.000 40.000 Skateboard
+Tcq_xAdCMr4_30.000_40.000.wav 30.000 40.000 Skateboard
+UtZofZjccBs_290.000_300.000.wav 290.000 300.000 Skateboard
+VZfrDZhI7BU_30.000_40.000.wav 30.000 40.000 Skateboard
+WxChkRrVOIs_0.000_7.000.wav 0.000 7.000 Skateboard
+YV0noe1sZAs_150.000_160.000.wav 150.000 160.000 Skateboard
+YjScrri_F7U_0.000_10.000.wav 0.000 10.000 Skateboard
+YrGQKTbiG1g_30.000_40.000.wav 30.000 40.000 Skateboard
+ZM67kt6G-d4_30.000_40.000.wav 30.000 40.000 Skateboard
+ZaUaqnLdg6k_30.000_40.000.wav 30.000 40.000 Skateboard
+ZhpkRcAEJzc_3.000_13.000.wav 3.000 13.000 Skateboard
+_43OOP6UEw0_30.000_40.000.wav 30.000 40.000 Skateboard
+_6Fyave4jqA_260.000_270.000.wav 260.000 270.000 Skateboard
+aOoZ0bCoaZw_30.000_40.000.wav 30.000 40.000 Skateboard
+gV6y9L24wWg_0.000_10.000.wav 0.000 10.000 Skateboard
+hHb0Eq1I7Fk_0.000_10.000.wav 0.000 10.000 Skateboard
+lGf_L6i6AZI_20.000_30.000.wav 20.000 30.000 Skateboard
+leOH87itNWM_30.000_40.000.wav 30.000 40.000 Skateboard
+mIkW7mWlnXw_30.000_40.000.wav 30.000 40.000 Skateboard
+qadmKrM0ppo_20.000_30.000.wav 20.000 30.000 Skateboard
+rLUIHCc4b9A_0.000_7.000.wav 0.000 7.000 Skateboard
+u3vBJgEVJvk_0.000_10.000.wav 0.000 10.000 Skateboard
+vHKBrtPDSvA_150.000_160.000.wav 150.000 160.000 Skateboard
+wWmydRt0Z-w_21.000_31.000.wav 21.000 31.000 Skateboard
+xeHt-R5ScmI_0.000_10.000.wav 0.000 10.000 Skateboard
+xqGtIVeeXY4_330.000_340.000.wav 330.000 340.000 Skateboard
+y_lfY0uzmr0_30.000_40.000.wav 30.000 40.000 Skateboard
+02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+7eeN-fXbso8_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+8qMHvgA9mGw_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+BGp9-Ro5h8Y_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+F9Dbcxr-lAI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+R8G5Y0HASxY_60.000_70.000.wav 60.000 70.000 Ambulance (siren)
+RVTKY5KR3ME_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+Sm0pPvXPA9U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+ZxlbI2Rj1VY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+bA8mt0JI0Ko_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+cR79KnWpiQA_70.000_80.000.wav 70.000 80.000 Ambulance (siren)
+dPcw4R5lczw_500.000_510.000.wav 500.000 510.000 Ambulance (siren)
+epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Ambulance (siren)
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+iSnWMz4FUAg_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ke35yF1LHs4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+oPR7tUEUptk_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+s0iddDFzL9s_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+tcKlq7_cOkw_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
+u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+zbiJEml563w_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Fire engine, fire truck (siren)
+-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+0K1mroXg8bs_9.000_19.000.wav 9.000 19.000 Fire engine, fire truck (siren)
+0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Fire engine, fire truck (siren)
+3h3_IZWhX0g_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
+5fjy_2ajEkg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Fire engine, fire truck (siren)
+ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Fire engine, fire truck (siren)
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Bs2KqqI9F_k_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+D4M3YT75ZrQ_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
+DWXQ_cSUW98_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+DpagxUQwXDo_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+FFSI6Bg2M-Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+GbIuxmaiCOk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Fire engine, fire truck (siren)
+VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Xggsbzzes3M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
+ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+k2a30--j37Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+pvYwIdGrS90_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+wD0P-doqkXo_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
+xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+zpzJKMG5iGc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0CJFt950vOk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0phl6nlC-n0_10.000_20.000.wav 10.000 20.000 Civil defense siren
+1jhbNtCWC9w_50.000_60.000.wav 50.000 60.000 Civil defense siren
+4Ukj2TTJxHM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+4XAVaSz_P7c_150.000_160.000.wav 150.000 160.000 Civil defense siren
+69AIBPnJN5E_0.000_10.000.wav 0.000 10.000 Civil defense siren
+8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Civil defense siren
+8ILgvaJVPCI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+9MWHXCLAX8I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+A5y-aZc0CiM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+AVBUh6qeHrQ_30.000_40.000.wav 30.000 40.000 Civil defense siren
+BhQPDafekdw_30.000_40.000.wav 30.000 40.000 Civil defense siren
+CJXNdudcJrs_30.000_40.000.wav 30.000 40.000 Civil defense siren
+CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Civil defense siren
+DdZw0XDv0JI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+DgWHUawAGnI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+Do9Dffb6vHA_30.000_40.000.wav 30.000 40.000 Civil defense siren
+GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Civil defense siren
+GeRgy4of730_30.000_40.000.wav 30.000 40.000 Civil defense siren
+IIypdzgZAaI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Civil defense siren
+JqHJ7015aWM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+K7a1P4RX_5w_30.000_40.000.wav 30.000 40.000 Civil defense siren
+KrTocA-I550_190.000_200.000.wav 190.000 200.000 Civil defense siren
+KumYcZVLOVU_350.000_360.000.wav 350.000 360.000 Civil defense siren
+L60HS_jbZu0_30.000_40.000.wav 30.000 40.000 Civil defense siren
+MZ1Yh6mRC-E_30.000_40.000.wav 30.000 40.000 Civil defense siren
+R8XUrRCFkzs_30.000_40.000.wav 30.000 40.000 Civil defense siren
+SyWbolNFst4_60.000_70.000.wav 60.000 70.000 Civil defense siren
+TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Civil defense siren
+Tx6eSkU2lKc_30.000_40.000.wav 30.000 40.000 Civil defense siren
+VcflBZLflSU_130.000_140.000.wav 130.000 140.000 Civil defense siren
+WXsTHg_DiYA_30.000_40.000.wav 30.000 40.000 Civil defense siren
+Wz5ffJxCElQ_10.000_20.000.wav 10.000 20.000 Civil defense siren
+X2MlmcY8UZU_30.000_40.000.wav 30.000 40.000 Civil defense siren
+XYLheTmlEYI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+YyxlD_FwZXM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+adCuLs-4nmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+cPjtrTq3F-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+eHDm93tI4Ok_30.000_40.000.wav 30.000 40.000 Civil defense siren
+etppP5Sdo14_30.000_40.000.wav 30.000 40.000 Civil defense siren
+fRKxUc1gQBw_50.000_60.000.wav 50.000 60.000 Civil defense siren
+feIue4LHzfM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+gr-Yen6Sj_Q_0.000_10.000.wav 0.000 10.000 Civil defense siren
+hl3Kqi9Wi_g_30.000_40.000.wav 30.000 40.000 Civil defense siren
+iKca2cbowd4_30.000_40.000.wav 30.000 40.000 Civil defense siren
+kzFyGWdj6MI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+m3LGopSVju4_30.000_40.000.wav 30.000 40.000 Civil defense siren
+ne4IMxs-hMk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+nuu2iNisoQc_6.000_16.000.wav 6.000 16.000 Civil defense siren
+oYeql9xE19k_30.000_40.000.wav 30.000 40.000 Civil defense siren
+rGUrM19BnJ8_110.000_120.000.wav 110.000 120.000 Civil defense siren
+u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+uCRAnDBXxgI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+vQG4HZR2KSk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+vjsG5b2yNzc_190.000_200.000.wav 190.000 200.000 Civil defense siren
+yO7guxGY-_k_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-9GUUhB3QV0_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Police car (siren)
+-UBVqmhbT50_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Police car (siren)
+145N68nh4m0_120.000_130.000.wav 120.000 130.000 Police car (siren)
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Police car (siren)
+31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Police car (siren)
+5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Police car (siren)
+6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Police car (siren)
+8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+9OFUd38sBNM_0.000_8.000.wav 0.000 8.000 Police car (siren)
+AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Police car (siren)
+CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Police car (siren)
+DK_6C29B2zs_14.000_24.000.wav 14.000 24.000 Police car (siren)
+GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Police car (siren)
+H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Police car (siren)
+JgDuU9kpHpM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Police car (siren)
+LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Police car (siren)
+QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Police car (siren)
+Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Police car (siren)
+YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Police car (siren)
+Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Police car (siren)
+ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Police car (siren)
+ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Police car (siren)
+ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Police car (siren)
+bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Police car (siren)
+eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Police car (siren)
+epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Police car (siren)
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+fNcrlqPrAqM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Police car (siren)
+gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Police car (siren)
+iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Police car (siren)
+iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Police car (siren)
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Police car (siren)
+lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Police car (siren)
+mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Police car (siren)
+mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Police car (siren)
+pzup58Eyhuo_30.000_40.000.wav 30.000 40.000 Police car (siren)
+rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Police car (siren)
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Police car (siren)
+u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Police car (siren)
+u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Police car (siren)
+uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Police car (siren)
+vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Police car (siren)
+z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-FKrYTj_eCU_0.000_10.000.wav 0.000 10.000 Screaming
+0G50t4FlbIA_60.000_70.000.wav 60.000 70.000 Screaming
+1LTxZ2aNytc_30.000_40.000.wav 30.000 40.000 Screaming
+2FEhG1UXb_E_370.000_380.000.wav 370.000 380.000 Screaming
+45vBbOhzS6g_50.000_60.000.wav 50.000 60.000 Screaming
+4PYTtp78Ig0_60.000_70.000.wav 60.000 70.000 Screaming
+5QNq0IEPICQ_30.000_40.000.wav 30.000 40.000 Screaming
+5YcIJuYQECc_0.000_6.000.wav 0.000 6.000 Screaming
+5kQF4r03yRI_0.000_6.000.wav 0.000 6.000 Screaming
+7ARVgI_wx5Y_30.000_40.000.wav 30.000 40.000 Screaming
+AIFvFuZPr68_30.000_40.000.wav 30.000 40.000 Screaming
+Aw43FUCkIb8_20.000_30.000.wav 20.000 30.000 Screaming
+AxM2BofYfPY_30.000_40.000.wav 30.000 40.000 Screaming
+BFqHyCoypfM_16.000_26.000.wav 16.000 26.000 Screaming
+Bk_xS_fKCpk_30.000_40.000.wav 30.000 40.000 Screaming
+C4YMjmJ7tt4_90.000_100.000.wav 90.000 100.000 Screaming
+CMWoAvgD0A0_9.000_19.000.wav 9.000 19.000 Screaming
+DZfYFhywhRs_30.000_40.000.wav 30.000 40.000 Screaming
+ElJFYwRtrH4_30.000_40.000.wav 30.000 40.000 Screaming
+FcUVtXJMkJs_30.000_40.000.wav 30.000 40.000 Screaming
+G--718JDmAQ_0.000_10.000.wav 0.000 10.000 Screaming
+GPJ1uQwmNHk_30.000_40.000.wav 30.000 40.000 Screaming
+H3vSRzkG82U_30.000_40.000.wav 30.000 40.000 Screaming
+HS28EUWt8dE_110.000_120.000.wav 110.000 120.000 Screaming
+KkGTB8ESMCM_0.000_10.000.wav 0.000 10.000 Screaming
+MQ0YasvMcuQ_1.000_11.000.wav 1.000 11.000 Screaming
+Msl9dI5yweA_90.000_100.000.wav 90.000 100.000 Screaming
+Ntn6YvZM3kA_0.000_10.000.wav 0.000 10.000 Screaming
+NwTHlpXdk4M_30.000_40.000.wav 30.000 40.000 Screaming
+OHjfSfqa804_0.000_10.000.wav 0.000 10.000 Screaming
+OzWJuqG2F3Y_30.000_40.000.wav 30.000 40.000 Screaming
+QDW_uCMnMMU_0.000_8.000.wav 0.000 8.000 Screaming
+SxI3Lnzzmkw_110.000_120.000.wav 110.000 120.000 Screaming
+TVvbfuGu9eM_70.000_80.000.wav 70.000 80.000 Screaming
+YCk9F0Uq3BE_70.000_80.000.wav 70.000 80.000 Screaming
+Z54pSnNw2iM_30.000_40.000.wav 30.000 40.000 Screaming
+a59ivTlYoNk_310.000_320.000.wav 310.000 320.000 Screaming
+auC_LgwFF8g_30.000_40.000.wav 30.000 40.000 Screaming
+bi8R9JbF2cc_80.000_90.000.wav 80.000 90.000 Screaming
+cdbYsoEasio_70.000_80.000.wav 70.000 80.000 Screaming
+dfsvT5xImNg_80.000_90.000.wav 80.000 90.000 Screaming
+e2AaF6siR1A_540.000_550.000.wav 540.000 550.000 Screaming
+gB1ytjgpcW4_190.000_200.000.wav 190.000 200.000 Screaming
+gE-0JxMtUh0_20.000_30.000.wav 20.000 30.000 Screaming
+hWiGgsuGnzs_100.000_110.000.wav 100.000 110.000 Screaming
+l-iIfi3SNpw_120.000_130.000.wav 120.000 130.000 Screaming
+mT-f0lGk-JM_30.000_40.000.wav 30.000 40.000 Screaming
+nApE_Biu13k_10.000_20.000.wav 10.000 20.000 Screaming
+nRMmafPUAEU_80.000_90.000.wav 80.000 90.000 Screaming
+nYAbLuyqPis_30.000_40.000.wav 30.000 40.000 Screaming
+nlYlNF30bVg_30.000_40.000.wav 30.000 40.000 Screaming
+sUp-UXzgmrA_0.000_10.000.wav 0.000 10.000 Screaming
+syIwNMo2TUA_0.000_7.000.wav 0.000 7.000 Screaming
+uTu0a1wd9-M_21.000_31.000.wav 21.000 31.000 Screaming
+xVG7dfH5DL0_320.000_330.000.wav 320.000 330.000 Screaming
+xvAQ44hx3_k_220.000_230.000.wav 220.000 230.000 Screaming
+yNTkb2zgA_M_70.000_80.000.wav 70.000 80.000 Screaming
+zCdOEvduBTo_30.000_40.000.wav 30.000 40.000 Screaming
+zMICvbCJ6zc_550.000_560.000.wav 550.000 560.000 Screaming
+-0RWZT-miFs_420.000_430.000.wav 420.000 430.000 Car
+-1pRmoJIGQc_11.000_21.000.wav 11.000 21.000 Car
+-7eDqv-6AKQ_30.000_40.000.wav 30.000 40.000 Car
+-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car
+-HWygXWSNRA_30.000_40.000.wav 30.000 40.000 Car
+-PVEno65928_30.000_40.000.wav 30.000 40.000 Car
+-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car
+0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Car
+0QwxnzHf_0E_30.000_40.000.wav 30.000 40.000 Car
+0bg1nzEVdgY_0.000_10.000.wav 0.000 10.000 Car
+0lpPdWvg7Eo_0.000_10.000.wav 0.000 10.000 Car
+11Pn3yJifSQ_4.000_14.000.wav 4.000 14.000 Car
+1BgqrhbyRFw_30.000_40.000.wav 30.000 40.000 Car
+1F9zCsJyw6k_430.000_440.000.wav 430.000 440.000 Car
+1HayoASR-54_80.000_90.000.wav 80.000 90.000 Car
+1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Car
+1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Car
+27m49pmJ8Og_370.000_380.000.wav 370.000 380.000 Car
+2E_N8lnoVKE_30.000_40.000.wav 30.000 40.000 Car
+2Fdau5KTEls_30.000_40.000.wav 30.000 40.000 Car
+2STASUlGAjs_30.000_40.000.wav 30.000 40.000 Car
+2fi0m8ei_B4_30.000_40.000.wav 30.000 40.000 Car
+2uMXfAIMeN0_180.000_190.000.wav 180.000 190.000 Car
+32V2zsK7GME_110.000_120.000.wav 110.000 120.000 Car
+3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car
+3_OLj6XChvM_30.000_40.000.wav 30.000 40.000 Car
+3hLxPQpmfQo_30.000_40.000.wav 30.000 40.000 Car
+3mDPQ_CPopw_30.000_40.000.wav 30.000 40.000 Car
+3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car
+3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Car
+40s88hEcn5I_170.000_180.000.wav 170.000 180.000 Car
+42P93B_GzGA_30.000_40.000.wav 30.000 40.000 Car
+4KZWpXlcpM4_60.000_70.000.wav 60.000 70.000 Car
+4TshFWSsrn8_290.000_300.000.wav 290.000 300.000 Car
+4WRgvRI06zc_30.000_40.000.wav 30.000 40.000 Car
+4aJfQpHt9lY_160.000_170.000.wav 160.000 170.000 Car
+4hd2CLrzCZs_30.000_40.000.wav 30.000 40.000 Car
+4zCHl7pRsNY_30.000_40.000.wav 30.000 40.000 Car
+5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Car
+5oirFKi6Sfo_190.000_200.000.wav 190.000 200.000 Car
+5vmxFp1r1ZM_30.000_40.000.wav 30.000 40.000 Car
+5z1rE_l-0Ow_0.000_8.000.wav 0.000 8.000 Car
+620GoTv5Ic8_30.000_40.000.wav 30.000 40.000 Car
+6BitLl5Bnxw_30.000_40.000.wav 30.000 40.000 Car
+6FVA4hqp1Ro_30.000_40.000.wav 30.000 40.000 Car
+6U942AYlcXA_30.000_40.000.wav 30.000 40.000 Car
+6b2ZMMrLTz8_5.000_15.000.wav 5.000 15.000 Car
+6ibh38autyA_30.000_40.000.wav 30.000 40.000 Car
+6kuESYFcEqw_30.000_40.000.wav 30.000 40.000 Car
+73cuZZq-J3w_20.000_30.000.wav 20.000 30.000 Car
+764IcMEMVUk_90.000_100.000.wav 90.000 100.000 Car
+7NH1WJlSiYI_30.000_40.000.wav 30.000 40.000 Car
+7lJu9wEsErY_220.000_230.000.wav 220.000 230.000 Car
+8CqqK9CzuXM_30.000_40.000.wav 30.000 40.000 Car
+8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car
+8Wk-ZmlsUqY_28.000_38.000.wav 28.000 38.000 Car
+8q8JrJNAa-Q_30.000_40.000.wav 30.000 40.000 Car
+8rMlNbKlp_s_0.000_10.000.wav 0.000 10.000 Car
+8sGJFPr2Nmc_30.000_40.000.wav 30.000 40.000 Car
+8yRROnG0-lA_30.000_40.000.wav 30.000 40.000 Car
+9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Car
+9fzAWj5YJ9c_30.000_40.000.wav 30.000 40.000 Car
+9rq8h4oMJ98_30.000_40.000.wav 30.000 40.000 Car
+9ye2Fn62xDc_60.000_70.000.wav 60.000 70.000 Car
+ACGuC6SH4V4_150.000_160.000.wav 150.000 160.000 Car
+AFz5TIs_Gug_30.000_40.000.wav 30.000 40.000 Car
+AedlWfHafgw_21.000_31.000.wav 21.000 31.000 Car
+AlsDSDTiaWI_30.000_40.000.wav 30.000 40.000 Car
+B3SkK0wuOhY_130.000_140.000.wav 130.000 140.000 Car
+B9n4a5ciI48_16.000_26.000.wav 16.000 26.000 Car
+BAekfGvUtFM_30.000_40.000.wav 30.000 40.000 Car
+BNLOvQbrPdc_290.000_300.000.wav 290.000 300.000 Car
+BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Car
+Bqx_SZgCzZw_10.000_20.000.wav 10.000 20.000 Car
+CZB6WXDuM1g_30.000_40.000.wav 30.000 40.000 Car
+C_pnsyNXphA_30.000_40.000.wav 30.000 40.000 Car
+Ck5ZjBf1nLM_30.000_40.000.wav 30.000 40.000 Car
+CqNyeZeHb8Y_30.000_40.000.wav 30.000 40.000 Car
+Cs1d7Ibk8CA_220.000_230.000.wav 220.000 230.000 Car
+CuS-ok0xG9g_0.000_10.000.wav 0.000 10.000 Car
+CuaBHNKycvI_30.000_40.000.wav 30.000 40.000 Car
+Cwur_jvxMzY_360.000_370.000.wav 360.000 370.000 Car
+DEGSyVygE98_110.000_120.000.wav 110.000 120.000 Car
+DLxTYAUifjU_30.000_40.000.wav 30.000 40.000 Car
+DkKpnvJk9u0_30.000_40.000.wav 30.000 40.000 Car
+DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Car
+Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car
+E8NgxTz1d90_30.000_40.000.wav 30.000 40.000 Car
+ExqedxdXuBc_70.000_80.000.wav 70.000 80.000 Car
+FCxEMSNSEuI_160.000_170.000.wav 160.000 170.000 Car
+FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Car
+FFSWmryaZ60_30.000_40.000.wav 30.000 40.000 Car
+FYk2paHPSdg_30.000_40.000.wav 30.000 40.000 Car
+Fo_FDiZhzDo_30.000_40.000.wav 30.000 40.000 Car
+GteozUDpJRc_30.000_40.000.wav 30.000 40.000 Car
+GwBS2NzjAvA_30.000_40.000.wav 30.000 40.000 Car
+H8d1mZOqb1c_110.000_120.000.wav 110.000 120.000 Car
+HFF_PpqLQ9w_30.000_40.000.wav 30.000 40.000 Car
+HHlb-h2Pc7o_30.000_40.000.wav 30.000 40.000 Car
+Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car
+I-HlrcP6Qg4_30.000_40.000.wav 30.000 40.000 Car
+I7vs2H-Htt8_480.000_490.000.wav 480.000 490.000 Car
+IblhEF_MiH8_400.000_410.000.wav 400.000 410.000 Car
+JgXnbgS_XBk_480.000_490.000.wav 480.000 490.000 Car
+Ju7Kg_H2iZQ_30.000_40.000.wav 30.000 40.000 Car
+KiCB6pP6EEo_100.000_110.000.wav 100.000 110.000 Car
+Kwpn3utYEHM_30.000_40.000.wav 30.000 40.000 Car
+Ky9Kw-0XwAs_30.000_40.000.wav 30.000 40.000 Car
+KzKDk-UgS54_30.000_40.000.wav 30.000 40.000 Car
+L1qC8DicAZE_70.000_80.000.wav 70.000 80.000 Car
+L4N0LOYZrFo_30.000_40.000.wav 30.000 40.000 Car
+L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car
+L9YtOeck3A0_0.000_10.000.wav 0.000 10.000 Car
+LEtkHiZZugk_30.000_40.000.wav 30.000 40.000 Car
+LLkNFGrrgUo_30.000_40.000.wav 30.000 40.000 Car
+LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Car
+M7NvD1WJQ7o_70.000_80.000.wav 70.000 80.000 Car
+M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car
+Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car
+NMqSBlEq14Q_30.000_40.000.wav 30.000 40.000 Car
+NoPbk9fy6uw_10.000_20.000.wav 10.000 20.000 Car
+O36torHptH4_30.000_40.000.wav 30.000 40.000 Car
+OBwh-KGukE8_30.000_40.000.wav 30.000 40.000 Car
+Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Car
+PfXdcsW8dJI_540.000_550.000.wav 540.000 550.000 Car
+QAWuHvVCI6g_30.000_40.000.wav 30.000 40.000 Car
+QBMDnMRwQCc_70.000_80.000.wav 70.000 80.000 Car
+QzrS-S7OerE_370.000_380.000.wav 370.000 380.000 Car
+R0BtkTm_CPI_30.000_40.000.wav 30.000 40.000 Car
+SEHxfje9Eio_30.000_40.000.wav 30.000 40.000 Car
+Sb3V17F8xU8_360.000_370.000.wav 360.000 370.000 Car
+SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car
+SqWkV-UQ6CI_30.000_40.000.wav 30.000 40.000 Car
+TWDytzefXXc_10.000_20.000.wav 10.000 20.000 Car
+Tv67JhZDAYs_30.000_40.000.wav 30.000 40.000 Car
+VTwVF3xRSWg_12.000_22.000.wav 12.000 22.000 Car
+VulCKZgWspc_570.000_580.000.wav 570.000 580.000 Car
+Vx6mttDHWfo_30.000_40.000.wav 30.000 40.000 Car
+W11cJ9HZNaY_30.000_40.000.wav 30.000 40.000 Car
+WLXQgcx8qTI_30.000_40.000.wav 30.000 40.000 Car
+WMbdMQ7rdFs_30.000_40.000.wav 30.000 40.000 Car
+WZoQD6cInx8_360.000_370.000.wav 360.000 370.000 Car
+WffmaOr2p8I_30.000_40.000.wav 30.000 40.000 Car
+WoynilrteLU_30.000_40.000.wav 30.000 40.000 Car
+WxrKq0aI0iM_130.000_140.000.wav 130.000 140.000 Car
+X60eVxecY3I_30.000_40.000.wav 30.000 40.000 Car
+X8fEzx-fA0U_80.000_90.000.wav 80.000 90.000 Car
+XVxlZqwWcBI_10.000_20.000.wav 10.000 20.000 Car
+Xnd8ERrynEo_120.000_130.000.wav 120.000 130.000 Car
+XqXLI7bDb-I_0.000_7.000.wav 0.000 7.000 Car
+XyCjByHuDIk_260.000_270.000.wav 260.000 270.000 Car
+XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car
+Y5e8BW513ww_20.000_30.000.wav 20.000 30.000 Car
+YJdBwuIn4Ec_30.000_40.000.wav 30.000 40.000 Car
+YTFJUFWcRns_30.000_40.000.wav 30.000 40.000 Car
+YY9aConw2QE_0.000_10.000.wav 0.000 10.000 Car
+Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car
+Ys_rO2Ieg1U_30.000_40.000.wav 30.000 40.000 Car
+Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Car
+Z8cigemT5_g_210.000_220.000.wav 210.000 220.000 Car
+ZJW7ymsioQc_16.000_26.000.wav 16.000 26.000 Car
+ZY6A9ZDkudg_130.000_140.000.wav 130.000 140.000 Car
+_Mw9lKigni4_30.000_40.000.wav 30.000 40.000 Car
+_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Car
+_yU0-fmspFY_210.000_220.000.wav 210.000 220.000 Car
+a5vTn5286-A_80.000_90.000.wav 80.000 90.000 Car
+aCX6vJhHO2c_30.000_40.000.wav 30.000 40.000 Car
+aHEAK0iWqKk_180.000_190.000.wav 180.000 190.000 Car
+aOVPHKqKjyQ_90.000_100.000.wav 90.000 100.000 Car
+aUq4glO5ryE_30.000_40.000.wav 30.000 40.000 Car
+aW3DY8XDrmw_22.000_32.000.wav 22.000 32.000 Car
+aa4uhPvKviY_30.000_40.000.wav 30.000 40.000 Car
+akgqVmFFDiY_30.000_40.000.wav 30.000 40.000 Car
+buOEFwXhoe0_310.000_320.000.wav 310.000 320.000 Car
+cHCIoXF7moA_30.000_40.000.wav 30.000 40.000 Car
+cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car
+cbYZQRz09bc_390.000_400.000.wav 390.000 400.000 Car
+d-do1XZ8f_E_30.000_40.000.wav 30.000 40.000 Car
+d3gMwtMK6Gs_30.000_40.000.wav 30.000 40.000 Car
+d6AioJ8CkTc_30.000_40.000.wav 30.000 40.000 Car
+dAud19zNZyw_190.000_200.000.wav 190.000 200.000 Car
+dC1TVxwiitc_30.000_40.000.wav 30.000 40.000 Car
+dFqOBLxhEl8_20.000_30.000.wav 20.000 30.000 Car
+dSfcznv4KLo_30.000_40.000.wav 30.000 40.000 Car
+dThSTe35jb0_50.000_60.000.wav 50.000 60.000 Car
+dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car
+dmJH84FnQa8_30.000_40.000.wav 30.000 40.000 Car
+e9xPBfEJni8_230.000_240.000.wav 230.000 240.000 Car
+eAl9WwRaWUE_30.000_40.000.wav 30.000 40.000 Car
+eAt6si6k65c_30.000_40.000.wav 30.000 40.000 Car
+eHiqCLHmoxI_0.000_8.000.wav 0.000 8.000 Car
+eV5JX81GzqA_150.000_160.000.wav 150.000 160.000 Car
+er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Car
+eyFPHlybqDg_30.000_40.000.wav 30.000 40.000 Car
+f70nsY7ThBA_220.000_230.000.wav 220.000 230.000 Car
+fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car
+fZMPDCNyQxE_30.000_40.000.wav 30.000 40.000 Car
+f__6chtFRM0_30.000_40.000.wav 30.000 40.000 Car
+fdDTuo_COG8_90.000_100.000.wav 90.000 100.000 Car
+gFJjYWXeBn0_30.000_40.000.wav 30.000 40.000 Car
+g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Car
+gaFQgJLQHtU_90.000_100.000.wav 90.000 100.000 Car
+gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car
+hN1ykzC8kZM_30.000_40.000.wav 30.000 40.000 Car
+hQ_yyPI46FI_11.000_21.000.wav 11.000 21.000 Car
+haiMRJEH-Aw_0.000_9.000.wav 0.000 9.000 Car
+hsC_sT0A4XM_30.000_40.000.wav 30.000 40.000 Car
+ihQDd1CqFBw_70.000_80.000.wav 70.000 80.000 Car
+ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car
+j2R1zurR39E_30.000_40.000.wav 30.000 40.000 Car
+j42ETHcp044_0.000_10.000.wav 0.000 10.000 Car
+j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car
+jCeUZwd8b2w_0.000_10.000.wav 0.000 10.000 Car
+jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car
+kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Car
+l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car
+lRrv5m9Xu4k_30.000_40.000.wav 30.000 40.000 Car
+lb1awXgoyQE_0.000_10.000.wav 0.000 10.000 Car
+llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car
+lu5teS1j1RQ_0.000_10.000.wav 0.000 10.000 Car
+mCmjh_EJtb4_30.000_40.000.wav 30.000 40.000 Car
+nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Car
+njodYtK0Hqg_30.000_40.000.wav 30.000 40.000 Car
+noymXcxyxis_30.000_40.000.wav 30.000 40.000 Car
+o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car
+oPJVdi0cqNE_30.000_40.000.wav 30.000 40.000 Car
+oxJYMzEmtk4_10.000_20.000.wav 10.000 20.000 Car
+pPnLErF3GOY_30.000_40.000.wav 30.000 40.000 Car
+pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car
+qC5M7BAsKOA_0.000_10.000.wav 0.000 10.000 Car
+qg4WxBm8h_w_510.000_520.000.wav 510.000 520.000 Car
+qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car
+rgeu0Gtf3Es_40.000_50.000.wav 40.000 50.000 Car
+s3-i5eUpe6c_30.000_40.000.wav 30.000 40.000 Car
+s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car
+syCQldBsAtg_30.000_40.000.wav 30.000 40.000 Car
+tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Car
+teoER4j9H14_290.000_300.000.wav 290.000 300.000 Car
+uFSkczD2i14_30.000_40.000.wav 30.000 40.000 Car
+uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car
+uYqlVTlSgbM_40.000_50.000.wav 40.000 50.000 Car
+v8Kry1CbTkM_310.000_320.000.wav 310.000 320.000 Car
+vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car
+vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Car
+vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car
+vW1nk4o9u5g_30.000_40.000.wav 30.000 40.000 Car
+vdFYBSlmsXw_30.000_40.000.wav 30.000 40.000 Car
+vtE1J8HsCUs_30.000_40.000.wav 30.000 40.000 Car
+w0vy1YvNcOg_30.000_40.000.wav 30.000 40.000 Car
+wDKrcZ7xLY8_80.000_90.000.wav 80.000 90.000 Car
+wM-sBzIDzok_30.000_40.000.wav 30.000 40.000 Car
+wUY4eWJt17w_30.000_40.000.wav 30.000 40.000 Car
+we66pU0MN1M_30.000_40.000.wav 30.000 40.000 Car
+wjfMWiYLDWA_30.000_40.000.wav 30.000 40.000 Car
+wu3-_VKULZU_30.000_40.000.wav 30.000 40.000 Car
+wwNIm8bgzKc_30.000_40.000.wav 30.000 40.000 Car
+xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car
+xsT5ZJUnBg0_160.000_170.000.wav 160.000 170.000 Car
+y9DFJEsiTLk_110.000_120.000.wav 110.000 120.000 Car
+yESwp_fg0Po_70.000_80.000.wav 70.000 80.000 Car
+yQg3eMb0QKU_30.000_40.000.wav 30.000 40.000 Car
+yQjnNR7fXKo_50.000_60.000.wav 50.000 60.000 Car
+zCuKYr_oMlE_60.000_70.000.wav 60.000 70.000 Car
+zz35Va7tYmA_30.000_40.000.wav 30.000 40.000 Car
+-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car passing by
+-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car passing by
+-iAAxJkoqcM_0.000_6.000.wav 0.000 6.000 Car passing by
+0mQcGLpc8to_30.000_40.000.wav 30.000 40.000 Car passing by
+1HtGgZnlKjU_30.000_40.000.wav 30.000 40.000 Car passing by
+2IsAlhq0XFc_30.000_40.000.wav 30.000 40.000 Car passing by
+2UvEmetE__I_30.000_40.000.wav 30.000 40.000 Car passing by
+2oHGIzH_XzA_30.000_40.000.wav 30.000 40.000 Car passing by
+3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car passing by
+8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car passing by
+8rzhhvS0tGc_30.000_40.000.wav 30.000 40.000 Car passing by
+8v377AXrgac_30.000_40.000.wav 30.000 40.000 Car passing by
+9lMtTDKyDEk_30.000_40.000.wav 30.000 40.000 Car passing by
+BWoL8oKoTFI_30.000_40.000.wav 30.000 40.000 Car passing by
+BsvD806qNM8_10.000_20.000.wav 10.000 20.000 Car passing by
+C3LLtToB2zA_30.000_40.000.wav 30.000 40.000 Car passing by
+Dk6b9dVD0i8_6.000_16.000.wav 6.000 16.000 Car passing by
+Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car passing by
+EqFuY_U0Yz0_30.000_40.000.wav 30.000 40.000 Car passing by
+FjpOboRcrNc_10.000_20.000.wav 10.000 20.000 Car passing by
+FjyZV8zIJ0k_30.000_40.000.wav 30.000 40.000 Car passing by
+Fn7eSPVvgCQ_30.000_40.000.wav 30.000 40.000 Car passing by
+G6A-sT2DOjY_30.000_40.000.wav 30.000 40.000 Car passing by
+GBXRuYIvhfM_30.000_40.000.wav 30.000 40.000 Car passing by
+HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Car passing by
+HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Car passing by
+If-V0XO-mpo_30.000_40.000.wav 30.000 40.000 Car passing by
+JtuNiusRRLk_30.000_40.000.wav 30.000 40.000 Car passing by
+M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car passing by
+NKPAwhwZmqs_30.000_40.000.wav 30.000 40.000 Car passing by
+Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car passing by
+QcLfJE-YfJY_30.000_40.000.wav 30.000 40.000 Car passing by
+SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car passing by
+VAiH1LX8guk_17.000_27.000.wav 17.000 27.000 Car passing by
+Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car passing by
+Yd10enP9ykM_30.000_40.000.wav 30.000 40.000 Car passing by
+_HGGCwtyNxM_30.000_40.000.wav 30.000 40.000 Car passing by
+a2U10_mi5as_30.000_40.000.wav 30.000 40.000 Car passing by
+aB6FDPKAPus_30.000_40.000.wav 30.000 40.000 Car passing by
+bDFQWubN4x4_30.000_40.000.wav 30.000 40.000 Car passing by
+cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car passing by
+dDTvjXXFkDg_30.000_40.000.wav 30.000 40.000 Car passing by
+dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car passing by
+fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car passing by
+gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car passing by
+gd_KjDM4fi8_0.000_10.000.wav 0.000 10.000 Car passing by
+j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car passing by
+jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car passing by
+llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car passing by
+m_dCO5bBCic_26.000_36.000.wav 26.000 36.000 Car passing by
+qDQX7Xi3GsQ_30.000_40.000.wav 30.000 40.000 Car passing by
+qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car passing by
+reP-OOWiLWU_30.000_40.000.wav 30.000 40.000 Car passing by
+s4jG5ZJYCvQ_30.000_40.000.wav 30.000 40.000 Car passing by
+s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car passing by
+uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car passing by
+vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car passing by
+wD4QouhX8zo_30.000_40.000.wav 30.000 40.000 Car passing by
+xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car passing by
+zd67ihUZ1u4_25.000_35.000.wav 25.000 35.000 Car passing by
+-3z5mFRgbxc_30.000_40.000.wav 30.000 40.000 Bus
+0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Bus
+0lPcHRhXlWk_30.000_40.000.wav 30.000 40.000 Bus
+1E1evA4T_Tk_30.000_40.000.wav 30.000 40.000 Bus
+1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Bus
+6-yQsEH2WYA_30.000_40.000.wav 30.000 40.000 Bus
+6Y8wSI1l-Lw_30.000_40.000.wav 30.000 40.000 Bus
+7T04388Ijk8_30.000_40.000.wav 30.000 40.000 Bus
+8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Bus
+8oEdgb8iXYA_1.000_11.000.wav 1.000 11.000 Bus
+AdpNSGX2_Pk_10.000_20.000.wav 10.000 20.000 Bus
+AwJ8orGuOXg_2.000_12.000.wav 2.000 12.000 Bus
+BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Bus
+CoFbRc1OxFU_9.000_19.000.wav 9.000 19.000 Bus
+DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Bus
+DYcXvyBFc5w_30.000_40.000.wav 30.000 40.000 Bus
+DYdalOQnx1Y_30.000_40.000.wav 30.000 40.000 Bus
+DkwFXd5nYLE_40.000_50.000.wav 40.000 50.000 Bus
+FBMR3pW9H9o_30.000_40.000.wav 30.000 40.000 Bus
+FEGa4e6RAlw_30.000_40.000.wav 30.000 40.000 Bus
+Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Bus
+HxMoMMrA6Eo_30.000_40.000.wav 30.000 40.000 Bus
+I7esm6vqqZ4_30.000_40.000.wav 30.000 40.000 Bus
+JLj11umr1CE_0.000_10.000.wav 0.000 10.000 Bus
+JwAhcHHF2qg_30.000_40.000.wav 30.000 40.000 Bus
+LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Bus
+LzZ_nxuZ8Co_30.000_40.000.wav 30.000 40.000 Bus
+LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Bus
+Nyi9_-u6-w0_30.000_40.000.wav 30.000 40.000 Bus
+O_SKumO328I_30.000_40.000.wav 30.000 40.000 Bus
+Owg_XU9XmRM_30.000_40.000.wav 30.000 40.000 Bus
+P94rcZSuTT8_30.000_40.000.wav 30.000 40.000 Bus
+PP741kd2vRM_30.000_40.000.wav 30.000 40.000 Bus
+Qna9qrV8_go_30.000_40.000.wav 30.000 40.000 Bus
+Qt7FJkuqWPE_30.000_40.000.wav 30.000 40.000 Bus
+UcQ7cVukaxY_21.000_31.000.wav 21.000 31.000 Bus
+W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Bus
+WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Bus
+WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bus
+a9B_HA3y8WQ_30.000_40.000.wav 30.000 40.000 Bus
+cEEoKQ38fHY_30.000_40.000.wav 30.000 40.000 Bus
+er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Bus
+fLvM4bbpg6w_0.000_10.000.wav 0.000 10.000 Bus
+fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Bus
+gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Bus
+jaSK_t8QP1E_30.000_40.000.wav 30.000 40.000 Bus
+ji_YCMygNHQ_8.000_18.000.wav 8.000 18.000 Bus
+kNKfoDp0uUw_30.000_40.000.wav 30.000 40.000 Bus
+kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Bus
+lHP0q2sQzPQ_30.000_40.000.wav 30.000 40.000 Bus
+mGG8rop4Jig_30.000_40.000.wav 30.000 40.000 Bus
+oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Bus
+tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Bus
+tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Bus
+ucICmff0K-Q_30.000_40.000.wav 30.000 40.000 Bus
+x-2Abohj8VY_30.000_40.000.wav 30.000 40.000 Bus
+xFr2xX6PulQ_70.000_80.000.wav 70.000 80.000 Bus
+yfSBqp5IZSM_10.000_20.000.wav 10.000 20.000 Bus
+-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Truck
+-BY64_p-vtM_30.000_40.000.wav 30.000 40.000 Truck
+-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Truck
+-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Truck
+-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Truck
+01WuUBxFBp4_30.000_40.000.wav 30.000 40.000 Truck
+077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Truck
+0Ga7T-2e490_17.000_27.000.wav 17.000 27.000 Truck
+0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Truck
+10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Truck
+2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Truck
+2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Truck
+2Tmi7EqpGZQ_0.000_10.000.wav 0.000 10.000 Truck
+4DlKNmVcoek_20.000_30.000.wav 20.000 30.000 Truck
+4MRzQbAIyV4_90.000_100.000.wav 90.000 100.000 Truck
+4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Truck
+4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Truck
+5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Truck
+5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Truck
+5QP1Tc3XbDc_30.000_40.000.wav 30.000 40.000 Truck
+5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Truck
+5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Truck
+6HL_DKWK-WA_10.000_20.000.wav 10.000 20.000 Truck
+6VQGk8IrV-4_30.000_40.000.wav 30.000 40.000 Truck
+6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Truck
+6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Truck
+6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Truck
+7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Truck
+7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Truck
+81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Truck
+84E9i9_ELBs_30.000_40.000.wav 30.000 40.000 Truck
+8jblPMBafKE_30.000_40.000.wav 30.000 40.000 Truck
+8k17D6qiuqI_30.000_40.000.wav 30.000 40.000 Truck
+9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Truck
+9LJnjmcRcb8_280.000_290.000.wav 280.000 290.000 Truck
+9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Truck
+A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Truck
+ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Truck
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Truck
+BQVXzH6YK8g_30.000_40.000.wav 30.000 40.000 Truck
+CnYWJp2bknU_50.000_60.000.wav 50.000 60.000 Truck
+DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Truck
+DXlTakKvLzg_30.000_40.000.wav 30.000 40.000 Truck
+DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Truck
+Dmy4EjohxxU_60.000_70.000.wav 60.000 70.000 Truck
+DvMFQ64YwcI_30.000_40.000.wav 30.000 40.000 Truck
+FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Truck
+GTk_6JDmtCY_230.000_240.000.wav 230.000 240.000 Truck
+HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Truck
+HQkLVac7z9Q_70.000_80.000.wav 70.000 80.000 Truck
+I4VDcVTE4YA_30.000_40.000.wav 30.000 40.000 Truck
+IxlvxvG8zOE_110.000_120.000.wav 110.000 120.000 Truck
+JLzD44Im1Ec_30.000_40.000.wav 30.000 40.000 Truck
+K4Hcb00hTTY_30.000_40.000.wav 30.000 40.000 Truck
+L2M3xanqQP8_30.000_40.000.wav 30.000 40.000 Truck
+LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Truck
+LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Truck
+MWTTe0M9vi4_30.000_40.000.wav 30.000 40.000 Truck
+Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Truck
+NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Truck
+OPd0cz1hRqc_30.000_40.000.wav 30.000 40.000 Truck
+PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Truck
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Truck
+PO1eaJ7tQOg_180.000_190.000.wav 180.000 190.000 Truck
+PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Truck
+Pef6g19i5iI_30.000_40.000.wav 30.000 40.000 Truck
+Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Truck
+SiBIYAiIajM_30.000_40.000.wav 30.000 40.000 Truck
+T6oYCFRafPs_30.000_40.000.wav 30.000 40.000 Truck
+WdubBeFntYQ_460.000_470.000.wav 460.000 470.000 Truck
+_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Truck
+_jfv_ziZWII_60.000_70.000.wav 60.000 70.000 Truck
+acvV6yYNc7Y_30.000_40.000.wav 30.000 40.000 Truck
+bQSaQ0iX_vk_30.000_40.000.wav 30.000 40.000 Truck
+bhxN5w03yS0_30.000_40.000.wav 30.000 40.000 Truck
+ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Truck
+eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Truck
+gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Truck
+hDVNQOJCvOk_30.000_40.000.wav 30.000 40.000 Truck
+ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Truck
+ikmE_kRvDAc_30.000_40.000.wav 30.000 40.000 Truck
+jwZTKNsbf58_70.000_80.000.wav 70.000 80.000 Truck
+kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Truck
+kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Truck
+lp66EaEOOoU_30.000_40.000.wav 30.000 40.000 Truck
+n4o1r8Ai66o_30.000_40.000.wav 30.000 40.000 Truck
+nDtrUUc2J2U_0.000_10.000.wav 0.000 10.000 Truck
+nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Truck
+p70IcMwsW9M_30.000_40.000.wav 30.000 40.000 Truck
+pJ1fore8JbQ_30.000_40.000.wav 30.000 40.000 Truck
+pt-J_L-OFI8_0.000_10.000.wav 0.000 10.000 Truck
+rdanJP7Usrg_30.000_40.000.wav 30.000 40.000 Truck
+srTX18ikXkE_10.000_20.000.wav 10.000 20.000 Truck
+tuplsUUDXKw_30.000_40.000.wav 30.000 40.000 Truck
+x6vuWsdeS3s_30.000_40.000.wav 30.000 40.000 Truck
+xMClk12ouB8_30.000_40.000.wav 30.000 40.000 Truck
+ycqDMKTrvLY_30.000_40.000.wav 30.000 40.000 Truck
+yk5LqHTtHLo_30.000_40.000.wav 30.000 40.000 Truck
+yrscqyUOIlI_30.000_40.000.wav 30.000 40.000 Truck
+zM3chsL-B7U_30.000_40.000.wav 30.000 40.000 Truck
+06si40RVDco_30.000_40.000.wav 30.000 40.000 Motorcycle
+0DzsPL-xElE_20.000_30.000.wav 20.000 30.000 Motorcycle
+145N68nh4m0_120.000_130.000.wav 120.000 130.000 Motorcycle
+16vw4K9qJnY_30.000_40.000.wav 30.000 40.000 Motorcycle
+21QlKF17ipc_30.000_40.000.wav 30.000 40.000 Motorcycle
+3LulQoOXNB0_30.000_40.000.wav 30.000 40.000 Motorcycle
+45JHcLU57B8_20.000_30.000.wav 20.000 30.000 Motorcycle
+4NZkW-XaIa4_30.000_40.000.wav 30.000 40.000 Motorcycle
+506I6LfdDuk_50.000_60.000.wav 50.000 60.000 Motorcycle
+6MCy1lh4qaw_20.000_30.000.wav 20.000 30.000 Motorcycle
+6R8cO4ARzkY_30.000_40.000.wav 30.000 40.000 Motorcycle
+6taAP7SFewI_30.000_40.000.wav 30.000 40.000 Motorcycle
+7g6aZTBe2xE_30.000_40.000.wav 30.000 40.000 Motorcycle
+9HcahqYUVoc_90.000_100.000.wav 90.000 100.000 Motorcycle
+9N1iw5Vdim8_20.000_30.000.wav 20.000 30.000 Motorcycle
+ANWU9Hiy_5k_40.000_50.000.wav 40.000 50.000 Motorcycle
+BTNz6NftP34_30.000_40.000.wav 30.000 40.000 Motorcycle
+BxnLAGsByCI_10.000_20.000.wav 10.000 20.000 Motorcycle
+CZgx_6XaEkg_30.000_40.000.wav 30.000 40.000 Motorcycle
+D3BJuOwltoI_10.000_20.000.wav 10.000 20.000 Motorcycle
+FgN9v1jYqjA_30.000_40.000.wav 30.000 40.000 Motorcycle
+HQ8eR2lvjSE_30.000_40.000.wav 30.000 40.000 Motorcycle
+Mb-GyQEKoEc_30.000_40.000.wav 30.000 40.000 Motorcycle
+Pair_NsHdTc_30.000_40.000.wav 30.000 40.000 Motorcycle
+UFIBEBkm7ao_30.000_40.000.wav 30.000 40.000 Motorcycle
+UWz5OIijWM4_30.000_40.000.wav 30.000 40.000 Motorcycle
+WLX3Db60418_20.000_30.000.wav 20.000 30.000 Motorcycle
+X5Xs8Y1cJK0_30.000_40.000.wav 30.000 40.000 Motorcycle
+ZGf0vrZStwI_30.000_40.000.wav 30.000 40.000 Motorcycle
+ZfkO1HlI0zM_30.000_40.000.wav 30.000 40.000 Motorcycle
+bhtB2Zgh9Q8_110.000_120.000.wav 110.000 120.000 Motorcycle
+d-m8eXCpeDg_30.000_40.000.wav 30.000 40.000 Motorcycle
+d21IwtH2oHI_30.000_40.000.wav 30.000 40.000 Motorcycle
+dhaKGPCgtfw_30.000_40.000.wav 30.000 40.000 Motorcycle
+ee-0JGvEIng_30.000_40.000.wav 30.000 40.000 Motorcycle
+epGDNMrsQb8_40.000_50.000.wav 40.000 50.000 Motorcycle
+ezUkPETm6cs_30.000_40.000.wav 30.000 40.000 Motorcycle
+f724u5z_UDw_30.000_40.000.wav 30.000 40.000 Motorcycle
+gGmWm1i6pVo_30.000_40.000.wav 30.000 40.000 Motorcycle
+i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Motorcycle
+iMp8nODaotA_580.000_590.000.wav 580.000 590.000 Motorcycle
+lVW2CqsHJ4Y_30.000_40.000.wav 30.000 40.000 Motorcycle
+lj7hzmz19-M_30.000_40.000.wav 30.000 40.000 Motorcycle
+mX45CiTjf8I_30.000_40.000.wav 30.000 40.000 Motorcycle
+mbLiZ_jpgeY_20.000_30.000.wav 20.000 30.000 Motorcycle
+owZDBEq6WdU_30.000_40.000.wav 30.000 40.000 Motorcycle
+pNMBIqvbyB4_30.000_40.000.wav 30.000 40.000 Motorcycle
+po-tnKZAzdg_40.000_50.000.wav 40.000 50.000 Motorcycle
+qAQuljp-atA_30.000_40.000.wav 30.000 40.000 Motorcycle
+r0Oll28wmXs_30.000_40.000.wav 30.000 40.000 Motorcycle
+sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Motorcycle
+vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Motorcycle
+wPfv8ifzzyg_30.000_40.000.wav 30.000 40.000 Motorcycle
+wyhurCZbKQU_30.000_40.000.wav 30.000 40.000 Motorcycle
+xQTPEQDb0Gg_30.000_40.000.wav 30.000 40.000 Motorcycle
+xTPmoYwgKf4_30.000_40.000.wav 30.000 40.000 Motorcycle
+xXGIKM4daMU_30.000_40.000.wav 30.000 40.000 Motorcycle
+xZ8hQliZqhg_160.000_170.000.wav 160.000 170.000 Motorcycle
+xuMBy2NoROI_30.000_40.000.wav 30.000 40.000 Motorcycle
+z_8yGVO1qws_30.000_40.000.wav 30.000 40.000 Motorcycle
+-BaVEk1zS2g_50.000_60.000.wav 50.000 60.000 Train
+-Q4fBQ4egrs_0.000_10.000.wav 0.000 10.000 Train
+-QxSFr1cYuQ_20.000_30.000.wav 20.000 30.000 Train
+-ZdReI9dL6M_530.000_540.000.wav 530.000 540.000 Train
+0YIyGEM0yG0_550.000_560.000.wav 550.000 560.000 Train
+1Mk2MJDhLJQ_20.000_30.000.wav 20.000 30.000 Train
+2nejPPEWqJ8_320.000_330.000.wav 320.000 330.000 Train
+3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train
+3RfrTU1p5SA_500.000_510.000.wav 500.000 510.000 Train
+3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train
+3ZZDuYU2HM4_150.000_160.000.wav 150.000 160.000 Train
+3fPX1LaGwJo_60.000_70.000.wav 60.000 70.000 Train
+4_gyCWuPxRg_170.000_180.000.wav 170.000 180.000 Train
+4l4vGrMD4Tw_550.000_560.000.wav 550.000 560.000 Train
+4oT0bxldS80_30.000_40.000.wav 30.000 40.000 Train
+4t7Mi3pnSA4_210.000_220.000.wav 210.000 220.000 Train
+53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train
+6OgSNQOTw2U_30.000_40.000.wav 30.000 40.000 Train
+6_TGlFO0DCk_10.000_20.000.wav 10.000 20.000 Train
+7KdSGBzXvz8_420.000_430.000.wav 420.000 430.000 Train
+7W_kcu0CJqI_310.000_320.000.wav 310.000 320.000 Train
+8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train
+8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train
+9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train
+9NT6gEiqpWA_30.000_40.000.wav 30.000 40.000 Train
+AFhll08KM98_30.000_40.000.wav 30.000 40.000 Train
+AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train
+AK0kZUDk294_2.000_12.000.wav 2.000 12.000 Train
+AKPC4rEGoyI_30.000_40.000.wav 30.000 40.000 Train
+APsvUzw7bWA_60.000_70.000.wav 60.000 70.000 Train
+AshwkKUV07s_23.000_33.000.wav 23.000 33.000 Train
+BI2Tol64na0_30.000_40.000.wav 30.000 40.000 Train
+BmS2NiuT2c0_160.000_170.000.wav 160.000 170.000 Train
+CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train
+D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train
+F-JFxERdA2w_30.000_40.000.wav 30.000 40.000 Train
+FoIBRxw0tyE_30.000_40.000.wav 30.000 40.000 Train
+G958vjLYBcI_110.000_120.000.wav 110.000 120.000 Train
+GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train
+GKc8PCTen8Q_310.000_320.000.wav 310.000 320.000 Train
+I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train
+IIIxN_ziy_I_60.000_70.000.wav 60.000 70.000 Train
+IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train
+K-i81KrH8BQ_30.000_40.000.wav 30.000 40.000 Train
+K9pSRLw6FNc_40.000_50.000.wav 40.000 50.000 Train
+KPyYUly5xCc_90.000_100.000.wav 90.000 100.000 Train
+L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train
+LK4b2eJpy24_30.000_40.000.wav 30.000 40.000 Train
+LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train
+MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train
+MDF2vsjm8jU_10.000_20.000.wav 10.000 20.000 Train
+MMfiWJVftMA_60.000_70.000.wav 60.000 70.000 Train
+MYzVHespZ-E_30.000_40.000.wav 30.000 40.000 Train
+Mbe4rlNiM84_0.000_7.000.wav 0.000 7.000 Train
+MczH_PWBNeI_360.000_370.000.wav 360.000 370.000 Train
+Mfkif49LLc4_30.000_40.000.wav 30.000 40.000 Train
+MwSbYICrYj8_290.000_300.000.wav 290.000 300.000 Train
+PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train
+QDTbchu0LrU_30.000_40.000.wav 30.000 40.000 Train
+QZJ5WAYIUh8_70.000_80.000.wav 70.000 80.000 Train
+QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train
+RN-_agT8_Cg_0.000_10.000.wav 0.000 10.000 Train
+R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train
+Rhvy7V4F95Q_40.000_50.000.wav 40.000 50.000 Train
+Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train
+RrlgSfQrqQc_20.000_30.000.wav 20.000 30.000 Train
+RwBKGPEg6uA_340.000_350.000.wav 340.000 350.000 Train
+T73runykdnE_25.000_35.000.wav 25.000 35.000 Train
+T8M6W4yOzI4_30.000_40.000.wav 30.000 40.000 Train
+Tmm4H6alHCE_30.000_40.000.wav 30.000 40.000 Train
+TyTORMEourg_270.000_280.000.wav 270.000 280.000 Train
+UQx0EMXtLZA_60.000_70.000.wav 60.000 70.000 Train
+UZx7OAgRMRY_90.000_100.000.wav 90.000 100.000 Train
+UerX5Bv2hcs_70.000_80.000.wav 70.000 80.000 Train
+UxSUGCvpskM_340.000_350.000.wav 340.000 350.000 Train
+V2hln47cP78_130.000_140.000.wav 130.000 140.000 Train
+VIe_Qkg5RJI_130.000_140.000.wav 130.000 140.000 Train
+WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Train
+WFdpQCtpBB4_30.000_40.000.wav 30.000 40.000 Train
+XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train
+XDTlBb3aYqo_30.000_40.000.wav 30.000 40.000 Train
+XKvLkIM8dck_40.000_50.000.wav 40.000 50.000 Train
+XQbeLJYzY9k_90.000_100.000.wav 90.000 100.000 Train
+XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train
+XeYiNanFS_M_120.000_130.000.wav 120.000 130.000 Train
+Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train
+YDGf-razgyU_250.000_260.000.wav 250.000 260.000 Train
+YFD1Qrlskrg_60.000_70.000.wav 60.000 70.000 Train
+Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train
+Y_ynIwm3qm0_370.000_380.000.wav 370.000 380.000 Train
+Zy0goYEHPHU_30.000_40.000.wav 30.000 40.000 Train
+_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train
+aNO2KEXBCOk_30.000_40.000.wav 30.000 40.000 Train
+aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train
+ahct5yzUtdE_20.000_30.000.wav 20.000 30.000 Train
+arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train
+bCGtzspNbNo_30.000_40.000.wav 30.000 40.000 Train
+bI6wPI9kAm8_70.000_80.000.wav 70.000 80.000 Train
+bpdCMWWiB_0_30.000_40.000.wav 30.000 40.000 Train
+cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Train
+d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train
+dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train
+eRclX9l0F_c_150.000_160.000.wav 150.000 160.000 Train
+fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Train
+fWVfi9pAh_4_10.000_20.000.wav 10.000 20.000 Train
+fztkF47lVQg_0.000_10.000.wav 0.000 10.000 Train
+g0ICxHjC9Uc_30.000_40.000.wav 30.000 40.000 Train
+g2scd3YVgwQ_30.000_40.000.wav 30.000 40.000 Train
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train
+g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train
+gKMpowHeyKc_30.000_40.000.wav 30.000 40.000 Train
+gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train
+gU0mD2fSh4c_500.000_510.000.wav 500.000 510.000 Train
+gkH_Zxasn8o_40.000_50.000.wav 40.000 50.000 Train
+gvnM4kK4r70_10.000_20.000.wav 10.000 20.000 Train
+hH_M56EnnDk_30.000_40.000.wav 30.000 40.000 Train
+hVvtTC9AmNs_30.000_40.000.wav 30.000 40.000 Train
+hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train
+hdYQzH2E-e4_310.000_320.000.wav 310.000 320.000 Train
+iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train
+j9Z63H5hvrQ_0.000_10.000.wav 0.000 10.000 Train
+jbW2ew8VMfU_50.000_60.000.wav 50.000 60.000 Train
+jlz7r-NSUuA_50.000_60.000.wav 50.000 60.000 Train
+k0vRZm7ZnQk_280.000_290.000.wav 280.000 290.000 Train
+k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train
+kbfkq3TuAe0_470.000_480.000.wav 470.000 480.000 Train
+lf1Sblrda3A_560.000_570.000.wav 560.000 570.000 Train
+m4DS9-5Gkds_30.000_40.000.wav 30.000 40.000 Train
+m5HeCy87QYY_380.000_390.000.wav 380.000 390.000 Train
+nKM4MUAsVzg_100.000_110.000.wav 100.000 110.000 Train
+nY1gcEMzsWI_10.000_20.000.wav 10.000 20.000 Train
+nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train
+oogrnx-_LBA_60.000_70.000.wav 60.000 70.000 Train
+pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train
+pbOZLMrJy0A_0.000_10.000.wav 0.000 10.000 Train
+pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train
+q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train
+qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train
+r6mHSfFkY_8_30.000_40.000.wav 30.000 40.000 Train
+rNNPQ9DD4no_30.000_40.000.wav 30.000 40.000 Train
+rSrBDAgLUoI_460.000_470.000.wav 460.000 470.000 Train
+stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train
+t_lFhyZaZR0_150.000_160.000.wav 150.000 160.000 Train
+txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train
+uZfsEDo3elY_20.000_30.000.wav 20.000 30.000 Train
+umcnfA9veOw_160.000_170.000.wav 160.000 170.000 Train
+uysTr0SfhLI_10.000_20.000.wav 10.000 20.000 Train
+wM9wNgY8d4g_150.000_160.000.wav 150.000 160.000 Train
+xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train
+xshKOSEF_6o_0.000_10.000.wav 0.000 10.000 Train
+yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train
+yH1r2Bblluw_240.000_250.000.wav 240.000 250.000 Train
+yywGJu6jp8U_30.000_40.000.wav 30.000 40.000 Train
+z5uKFGeTtNg_30.000_40.000.wav 30.000 40.000 Train
diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d98569b2bb2a47882ab09081c204bc66823b5053
--- /dev/null
+++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv
@@ -0,0 +1,606 @@
+-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn
+-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn
+-GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn
+-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn
+-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn
+-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn
+-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn
+-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn
+-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train horn
+-u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn
+-zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn
+02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn
+0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn
+0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn
+0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn
+0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn
+0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn
+10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn
+1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn
+1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn
+1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn
+1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn
+1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn
+1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn
+1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn
+26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn
+2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn
+2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn
+2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn
+2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn
+-8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn
+-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
+-v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
+04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn
+0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn
+0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
+26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn
+2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn
+2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn
+2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn
+35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
+3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
+3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn
+4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm
+-hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm
+-rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm
+0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm
+0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm
+0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car alarm
+0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm
+17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm
+3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm
+3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm
+4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm
+4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm
+4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm
+5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm
+54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm
+5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm
+5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm
+7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm
+7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm
+7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm
+7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm
+7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm
+8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm
+9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm
+9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm
+9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm
+A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm
+A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm
+APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm
+AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm
+-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps
+03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps
+0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps
+0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle
+-D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle
+-F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle
+-FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle
+-FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle
+-Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle
+-Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle
+-Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle
+-KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle
+-NCcqKWiGus_30.000_40.000.wav 30.000 40.000 Bicycle
+-NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle
+-OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle
+-RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle
+-XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle
+-XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle
+-bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle
+-e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle
+-fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle
+-fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle
+-fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle
+-ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle
+-k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle
+-k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle
+-lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle
+-mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle
+-ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle
+-nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle
+-oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle
+-p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle
+-qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle
+-0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard
+-53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard
+-5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard
+-81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard
+-9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard
+-9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard
+-AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard
+-DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard
+-DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard
+-E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard
+-E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard
+-McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard
+-N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard
+-O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard
+-Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard
+-Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard
+-aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard
+-cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard
+-esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard
+-iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard
+-lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard
+-mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard
+-nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard
+-oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard
+-s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard
+-sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard
+-saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard
+-sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard
+-tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard
+-x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard
+--ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+-LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
+-Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+-d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren)
+-dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
+-fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren)
+-fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+-jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren)
+-lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren)
+-xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren)
+-yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
+06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren)
+-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
+-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
+-8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren)
+-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren)
+-QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren)
+-RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren)
+-USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
+-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren)
+-eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
+-hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
+-nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren)
+-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren
+-6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren
+06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren)
+-6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren)
+-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren)
+-AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren)
+-F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren)
+-Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren)
+-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren)
+-MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren)
+-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren)
+-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren)
+-XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren)
+-XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren)
+-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren)
+-bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren)
+-haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren)
+-rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren)
+-rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren)
+-20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming
+-3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming
+-4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming
+-7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming
+-7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming
+-GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming
+-MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming
+-WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming
+-YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming
+-g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming
+-gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming
+-iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming
+-nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming
+-pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming
+-pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming
+-sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming
+-uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming
+-vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming
+0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming
+09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming
+0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming
+0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming
+0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming
+0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming
+0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming
+0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming
+0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming
+0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming
+0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming
+0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming
+---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car
+--330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car
+--8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car
+--9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car
+--F70LWypIg_30.000_40.000.wav 30.000 40.000 Car
+--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car
+--QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car
+--SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car
+--Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car
+--e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car
+--iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car
+--jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car
+--jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car
+--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car
+--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car
+--yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car
+--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car
+-0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car
+-03ld83JliM_29.000_39.000.wav 29.000 39.000 Car
+-0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car
+-0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car
+-0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car
+-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car
+-0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car
+-0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car
+-0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car
+-0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car
+-0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car
+-0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car
+-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car
+---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by
+--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by
+--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by
+--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by
+--zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by
+-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by
+-0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by
+-0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by
+-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by
+-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by
+-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by
+-15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by
+-19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by
+-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by
+-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by
+-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by
+-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by
+-2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by
+-21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by
+-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by
+-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by
+-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by
+-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by
+-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by
+-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by
+-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by
+-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by
+-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by
+-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by
+-3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by
+--0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus
+-0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus
+-5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus
+-5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus
+-79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus
+-7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus
+-8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus
+-93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus
+-9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus
+-9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus
+-AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus
+-AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus
+-BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus
+-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus
+-CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus
+-CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus
+-DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus
+-FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus
+-Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus
+-GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus
+-Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus
+-H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus
+-HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus
+-HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus
+-I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus
+-II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus
+-LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus
+-OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus
+-P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus
+-PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus
+--12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck
+--73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck
+--J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck
+--bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck
+--ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck
+--wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck
+-0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck
+-0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck
+-0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck
+-0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck
+-0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck
+-0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck
+-14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck
+-1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck
+-1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck
+-1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck
+-1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck
+-1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck
+-26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck
+-2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck
+-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck
+-2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck
+-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck
+-2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck
+-2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck
+-39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck
+-3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck
+-3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck
+-3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck
+-3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck
+--p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle
+-1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle
+-1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle
+-3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle
+-4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle
+-4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle
+-5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle
+-6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle
+-6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle
+-7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle
+-7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle
+-85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle
+-9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle
+-9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle
+-BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle
+-DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle
+-HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle
+-H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle
+-HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle
+-IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle
+-Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle
+-J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle
+-KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle
+-KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle
+-P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle
+-QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle
+-S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle
+-SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle
+-US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle
+-VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle
+--H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train
+-1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train
+-1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train
+-3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train
+-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train
+-6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train
+-97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train
+-9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train
+-CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train
+-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train
+-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train
+-JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train
+-K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train
+-KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train
+-NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train
+-PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train
+-P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train
+-Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train
+-QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train
+-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train
+-RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train
+-VH414svzI0_30.000_40.000.wav 30.000 40.000 Train
+-WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train
+-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train
+-XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train
+-Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train
+-ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train
+-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train
+-aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train
+-abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train
+1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren)
+-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren)
+4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren)
+0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren)
+4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car
+-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car
+-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car
+-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car
+-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car
+-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car
+-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car
+-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car
+-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car
+-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car
+-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car
+-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car
+-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car
+-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car
+-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car
+-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car
+-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car
+-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car
+-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car
+-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car
+-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car
+-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car
+06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car
+0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car
+0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car
+4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car
+5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car
+7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car
+9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car
+9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car
+-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by
+9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by
+-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus
+-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck
+-4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck
+-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck
+-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck
+-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck
+-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck
+-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck
+-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck
+-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck
+-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck
+-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck
+-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck
+-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck
+-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck
+-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck
+-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck
+-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck
+-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck
+-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck
+-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck
+-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck
+-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck
+-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck
+-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck
+0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck
+0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck
+0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck
+0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck
+0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck
+3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck
+-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train
+02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train
+0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train
+0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train
+0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train
+0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train
+0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train
+10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train
+1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train
+1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train
+1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train
+1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train
+1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train
+1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train
+26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train
+2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train
+2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train
+2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train
+2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train
+3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train
diff --git a/audio_detection/audio_infer/metadata/class_labels_indices.csv b/audio_detection/audio_infer/metadata/class_labels_indices.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3a2767e81114adecde59992cf6607f31c1862f4c
--- /dev/null
+++ b/audio_detection/audio_infer/metadata/class_labels_indices.csv
@@ -0,0 +1,528 @@
+index,mid,display_name
+0,/m/09x0r,"Speech"
+1,/m/05zppz,"Male speech, man speaking"
+2,/m/02zsn,"Female speech, woman speaking"
+3,/m/0ytgt,"Child speech, kid speaking"
+4,/m/01h8n0,"Conversation"
+5,/m/02qldy,"Narration, monologue"
+6,/m/0261r1,"Babbling"
+7,/m/0brhx,"Speech synthesizer"
+8,/m/07p6fty,"Shout"
+9,/m/07q4ntr,"Bellow"
+10,/m/07rwj3x,"Whoop"
+11,/m/07sr1lc,"Yell"
+12,/m/04gy_2,"Battle cry"
+13,/t/dd00135,"Children shouting"
+14,/m/03qc9zr,"Screaming"
+15,/m/02rtxlg,"Whispering"
+16,/m/01j3sz,"Laughter"
+17,/t/dd00001,"Baby laughter"
+18,/m/07r660_,"Giggle"
+19,/m/07s04w4,"Snicker"
+20,/m/07sq110,"Belly laugh"
+21,/m/07rgt08,"Chuckle, chortle"
+22,/m/0463cq4,"Crying, sobbing"
+23,/t/dd00002,"Baby cry, infant cry"
+24,/m/07qz6j3,"Whimper"
+25,/m/07qw_06,"Wail, moan"
+26,/m/07plz5l,"Sigh"
+27,/m/015lz1,"Singing"
+28,/m/0l14jd,"Choir"
+29,/m/01swy6,"Yodeling"
+30,/m/02bk07,"Chant"
+31,/m/01c194,"Mantra"
+32,/t/dd00003,"Male singing"
+33,/t/dd00004,"Female singing"
+34,/t/dd00005,"Child singing"
+35,/t/dd00006,"Synthetic singing"
+36,/m/06bxc,"Rapping"
+37,/m/02fxyj,"Humming"
+38,/m/07s2xch,"Groan"
+39,/m/07r4k75,"Grunt"
+40,/m/01w250,"Whistling"
+41,/m/0lyf6,"Breathing"
+42,/m/07mzm6,"Wheeze"
+43,/m/01d3sd,"Snoring"
+44,/m/07s0dtb,"Gasp"
+45,/m/07pyy8b,"Pant"
+46,/m/07q0yl5,"Snort"
+47,/m/01b_21,"Cough"
+48,/m/0dl9sf8,"Throat clearing"
+49,/m/01hsr_,"Sneeze"
+50,/m/07ppn3j,"Sniff"
+51,/m/06h7j,"Run"
+52,/m/07qv_x_,"Shuffle"
+53,/m/07pbtc8,"Walk, footsteps"
+54,/m/03cczk,"Chewing, mastication"
+55,/m/07pdhp0,"Biting"
+56,/m/0939n_,"Gargling"
+57,/m/01g90h,"Stomach rumble"
+58,/m/03q5_w,"Burping, eructation"
+59,/m/02p3nc,"Hiccup"
+60,/m/02_nn,"Fart"
+61,/m/0k65p,"Hands"
+62,/m/025_jnm,"Finger snapping"
+63,/m/0l15bq,"Clapping"
+64,/m/01jg02,"Heart sounds, heartbeat"
+65,/m/01jg1z,"Heart murmur"
+66,/m/053hz1,"Cheering"
+67,/m/028ght,"Applause"
+68,/m/07rkbfh,"Chatter"
+69,/m/03qtwd,"Crowd"
+70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
+71,/t/dd00013,"Children playing"
+72,/m/0jbk,"Animal"
+73,/m/068hy,"Domestic animals, pets"
+74,/m/0bt9lr,"Dog"
+75,/m/05tny_,"Bark"
+76,/m/07r_k2n,"Yip"
+77,/m/07qf0zm,"Howl"
+78,/m/07rc7d9,"Bow-wow"
+79,/m/0ghcn6,"Growling"
+80,/t/dd00136,"Whimper (dog)"
+81,/m/01yrx,"Cat"
+82,/m/02yds9,"Purr"
+83,/m/07qrkrw,"Meow"
+84,/m/07rjwbb,"Hiss"
+85,/m/07r81j2,"Caterwaul"
+86,/m/0ch8v,"Livestock, farm animals, working animals"
+87,/m/03k3r,"Horse"
+88,/m/07rv9rh,"Clip-clop"
+89,/m/07q5rw0,"Neigh, whinny"
+90,/m/01xq0k1,"Cattle, bovinae"
+91,/m/07rpkh9,"Moo"
+92,/m/0239kh,"Cowbell"
+93,/m/068zj,"Pig"
+94,/t/dd00018,"Oink"
+95,/m/03fwl,"Goat"
+96,/m/07q0h5t,"Bleat"
+97,/m/07bgp,"Sheep"
+98,/m/025rv6n,"Fowl"
+99,/m/09b5t,"Chicken, rooster"
+100,/m/07st89h,"Cluck"
+101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
+102,/m/01rd7k,"Turkey"
+103,/m/07svc2k,"Gobble"
+104,/m/09ddx,"Duck"
+105,/m/07qdb04,"Quack"
+106,/m/0dbvp,"Goose"
+107,/m/07qwf61,"Honk"
+108,/m/01280g,"Wild animals"
+109,/m/0cdnk,"Roaring cats (lions, tigers)"
+110,/m/04cvmfc,"Roar"
+111,/m/015p6,"Bird"
+112,/m/020bb7,"Bird vocalization, bird call, bird song"
+113,/m/07pggtn,"Chirp, tweet"
+114,/m/07sx8x_,"Squawk"
+115,/m/0h0rv,"Pigeon, dove"
+116,/m/07r_25d,"Coo"
+117,/m/04s8yn,"Crow"
+118,/m/07r5c2p,"Caw"
+119,/m/09d5_,"Owl"
+120,/m/07r_80w,"Hoot"
+121,/m/05_wcq,"Bird flight, flapping wings"
+122,/m/01z5f,"Canidae, dogs, wolves"
+123,/m/06hps,"Rodents, rats, mice"
+124,/m/04rmv,"Mouse"
+125,/m/07r4gkf,"Patter"
+126,/m/03vt0,"Insect"
+127,/m/09xqv,"Cricket"
+128,/m/09f96,"Mosquito"
+129,/m/0h2mp,"Fly, housefly"
+130,/m/07pjwq1,"Buzz"
+131,/m/01h3n,"Bee, wasp, etc."
+132,/m/09ld4,"Frog"
+133,/m/07st88b,"Croak"
+134,/m/078jl,"Snake"
+135,/m/07qn4z3,"Rattle"
+136,/m/032n05,"Whale vocalization"
+137,/m/04rlf,"Music"
+138,/m/04szw,"Musical instrument"
+139,/m/0fx80y,"Plucked string instrument"
+140,/m/0342h,"Guitar"
+141,/m/02sgy,"Electric guitar"
+142,/m/018vs,"Bass guitar"
+143,/m/042v_gx,"Acoustic guitar"
+144,/m/06w87,"Steel guitar, slide guitar"
+145,/m/01glhc,"Tapping (guitar technique)"
+146,/m/07s0s5r,"Strum"
+147,/m/018j2,"Banjo"
+148,/m/0jtg0,"Sitar"
+149,/m/04rzd,"Mandolin"
+150,/m/01bns_,"Zither"
+151,/m/07xzm,"Ukulele"
+152,/m/05148p4,"Keyboard (musical)"
+153,/m/05r5c,"Piano"
+154,/m/01s0ps,"Electric piano"
+155,/m/013y1f,"Organ"
+156,/m/03xq_f,"Electronic organ"
+157,/m/03gvt,"Hammond organ"
+158,/m/0l14qv,"Synthesizer"
+159,/m/01v1d8,"Sampler"
+160,/m/03q5t,"Harpsichord"
+161,/m/0l14md,"Percussion"
+162,/m/02hnl,"Drum kit"
+163,/m/0cfdd,"Drum machine"
+164,/m/026t6,"Drum"
+165,/m/06rvn,"Snare drum"
+166,/m/03t3fj,"Rimshot"
+167,/m/02k_mr,"Drum roll"
+168,/m/0bm02,"Bass drum"
+169,/m/011k_j,"Timpani"
+170,/m/01p970,"Tabla"
+171,/m/01qbl,"Cymbal"
+172,/m/03qtq,"Hi-hat"
+173,/m/01sm1g,"Wood block"
+174,/m/07brj,"Tambourine"
+175,/m/05r5wn,"Rattle (instrument)"
+176,/m/0xzly,"Maraca"
+177,/m/0mbct,"Gong"
+178,/m/016622,"Tubular bells"
+179,/m/0j45pbj,"Mallet percussion"
+180,/m/0dwsp,"Marimba, xylophone"
+181,/m/0dwtp,"Glockenspiel"
+182,/m/0dwt5,"Vibraphone"
+183,/m/0l156b,"Steelpan"
+184,/m/05pd6,"Orchestra"
+185,/m/01kcd,"Brass instrument"
+186,/m/0319l,"French horn"
+187,/m/07gql,"Trumpet"
+188,/m/07c6l,"Trombone"
+189,/m/0l14_3,"Bowed string instrument"
+190,/m/02qmj0d,"String section"
+191,/m/07y_7,"Violin, fiddle"
+192,/m/0d8_n,"Pizzicato"
+193,/m/01xqw,"Cello"
+194,/m/02fsn,"Double bass"
+195,/m/085jw,"Wind instrument, woodwind instrument"
+196,/m/0l14j_,"Flute"
+197,/m/06ncr,"Saxophone"
+198,/m/01wy6,"Clarinet"
+199,/m/03m5k,"Harp"
+200,/m/0395lw,"Bell"
+201,/m/03w41f,"Church bell"
+202,/m/027m70_,"Jingle bell"
+203,/m/0gy1t2s,"Bicycle bell"
+204,/m/07n_g,"Tuning fork"
+205,/m/0f8s22,"Chime"
+206,/m/026fgl,"Wind chime"
+207,/m/0150b9,"Change ringing (campanology)"
+208,/m/03qjg,"Harmonica"
+209,/m/0mkg,"Accordion"
+210,/m/0192l,"Bagpipes"
+211,/m/02bxd,"Didgeridoo"
+212,/m/0l14l2,"Shofar"
+213,/m/07kc_,"Theremin"
+214,/m/0l14t7,"Singing bowl"
+215,/m/01hgjl,"Scratching (performance technique)"
+216,/m/064t9,"Pop music"
+217,/m/0glt670,"Hip hop music"
+218,/m/02cz_7,"Beatboxing"
+219,/m/06by7,"Rock music"
+220,/m/03lty,"Heavy metal"
+221,/m/05r6t,"Punk rock"
+222,/m/0dls3,"Grunge"
+223,/m/0dl5d,"Progressive rock"
+224,/m/07sbbz2,"Rock and roll"
+225,/m/05w3f,"Psychedelic rock"
+226,/m/06j6l,"Rhythm and blues"
+227,/m/0gywn,"Soul music"
+228,/m/06cqb,"Reggae"
+229,/m/01lyv,"Country"
+230,/m/015y_n,"Swing music"
+231,/m/0gg8l,"Bluegrass"
+232,/m/02x8m,"Funk"
+233,/m/02w4v,"Folk music"
+234,/m/06j64v,"Middle Eastern music"
+235,/m/03_d0,"Jazz"
+236,/m/026z9,"Disco"
+237,/m/0ggq0m,"Classical music"
+238,/m/05lls,"Opera"
+239,/m/02lkt,"Electronic music"
+240,/m/03mb9,"House music"
+241,/m/07gxw,"Techno"
+242,/m/07s72n,"Dubstep"
+243,/m/0283d,"Drum and bass"
+244,/m/0m0jc,"Electronica"
+245,/m/08cyft,"Electronic dance music"
+246,/m/0fd3y,"Ambient music"
+247,/m/07lnk,"Trance music"
+248,/m/0g293,"Music of Latin America"
+249,/m/0ln16,"Salsa music"
+250,/m/0326g,"Flamenco"
+251,/m/0155w,"Blues"
+252,/m/05fw6t,"Music for children"
+253,/m/02v2lh,"New-age music"
+254,/m/0y4f8,"Vocal music"
+255,/m/0z9c,"A capella"
+256,/m/0164x2,"Music of Africa"
+257,/m/0145m,"Afrobeat"
+258,/m/02mscn,"Christian music"
+259,/m/016cjb,"Gospel music"
+260,/m/028sqc,"Music of Asia"
+261,/m/015vgc,"Carnatic music"
+262,/m/0dq0md,"Music of Bollywood"
+263,/m/06rqw,"Ska"
+264,/m/02p0sh1,"Traditional music"
+265,/m/05rwpb,"Independent music"
+266,/m/074ft,"Song"
+267,/m/025td0t,"Background music"
+268,/m/02cjck,"Theme music"
+269,/m/03r5q_,"Jingle (music)"
+270,/m/0l14gg,"Soundtrack music"
+271,/m/07pkxdp,"Lullaby"
+272,/m/01z7dr,"Video game music"
+273,/m/0140xf,"Christmas music"
+274,/m/0ggx5q,"Dance music"
+275,/m/04wptg,"Wedding music"
+276,/t/dd00031,"Happy music"
+277,/t/dd00032,"Funny music"
+278,/t/dd00033,"Sad music"
+279,/t/dd00034,"Tender music"
+280,/t/dd00035,"Exciting music"
+281,/t/dd00036,"Angry music"
+282,/t/dd00037,"Scary music"
+283,/m/03m9d0z,"Wind"
+284,/m/09t49,"Rustling leaves"
+285,/t/dd00092,"Wind noise (microphone)"
+286,/m/0jb2l,"Thunderstorm"
+287,/m/0ngt1,"Thunder"
+288,/m/0838f,"Water"
+289,/m/06mb1,"Rain"
+290,/m/07r10fb,"Raindrop"
+291,/t/dd00038,"Rain on surface"
+292,/m/0j6m2,"Stream"
+293,/m/0j2kx,"Waterfall"
+294,/m/05kq4,"Ocean"
+295,/m/034srq,"Waves, surf"
+296,/m/06wzb,"Steam"
+297,/m/07swgks,"Gurgling"
+298,/m/02_41,"Fire"
+299,/m/07pzfmf,"Crackle"
+300,/m/07yv9,"Vehicle"
+301,/m/019jd,"Boat, Water vehicle"
+302,/m/0hsrw,"Sailboat, sailing ship"
+303,/m/056ks2,"Rowboat, canoe, kayak"
+304,/m/02rlv9,"Motorboat, speedboat"
+305,/m/06q74,"Ship"
+306,/m/012f08,"Motor vehicle (road)"
+307,/m/0k4j,"Car"
+308,/m/0912c9,"Vehicle horn, car horn, honking"
+309,/m/07qv_d5,"Toot"
+310,/m/02mfyn,"Car alarm"
+311,/m/04gxbd,"Power windows, electric windows"
+312,/m/07rknqz,"Skidding"
+313,/m/0h9mv,"Tire squeal"
+314,/t/dd00134,"Car passing by"
+315,/m/0ltv,"Race car, auto racing"
+316,/m/07r04,"Truck"
+317,/m/0gvgw0,"Air brake"
+318,/m/05x_td,"Air horn, truck horn"
+319,/m/02rhddq,"Reversing beeps"
+320,/m/03cl9h,"Ice cream truck, ice cream van"
+321,/m/01bjv,"Bus"
+322,/m/03j1ly,"Emergency vehicle"
+323,/m/04qvtq,"Police car (siren)"
+324,/m/012n7d,"Ambulance (siren)"
+325,/m/012ndj,"Fire engine, fire truck (siren)"
+326,/m/04_sv,"Motorcycle"
+327,/m/0btp2,"Traffic noise, roadway noise"
+328,/m/06d_3,"Rail transport"
+329,/m/07jdr,"Train"
+330,/m/04zmvq,"Train whistle"
+331,/m/0284vy3,"Train horn"
+332,/m/01g50p,"Railroad car, train wagon"
+333,/t/dd00048,"Train wheels squealing"
+334,/m/0195fx,"Subway, metro, underground"
+335,/m/0k5j,"Aircraft"
+336,/m/014yck,"Aircraft engine"
+337,/m/04229,"Jet engine"
+338,/m/02l6bg,"Propeller, airscrew"
+339,/m/09ct_,"Helicopter"
+340,/m/0cmf2,"Fixed-wing aircraft, airplane"
+341,/m/0199g,"Bicycle"
+342,/m/06_fw,"Skateboard"
+343,/m/02mk9,"Engine"
+344,/t/dd00065,"Light engine (high frequency)"
+345,/m/08j51y,"Dental drill, dentist's drill"
+346,/m/01yg9g,"Lawn mower"
+347,/m/01j4z9,"Chainsaw"
+348,/t/dd00066,"Medium engine (mid frequency)"
+349,/t/dd00067,"Heavy engine (low frequency)"
+350,/m/01h82_,"Engine knocking"
+351,/t/dd00130,"Engine starting"
+352,/m/07pb8fc,"Idling"
+353,/m/07q2z82,"Accelerating, revving, vroom"
+354,/m/02dgv,"Door"
+355,/m/03wwcy,"Doorbell"
+356,/m/07r67yg,"Ding-dong"
+357,/m/02y_763,"Sliding door"
+358,/m/07rjzl8,"Slam"
+359,/m/07r4wb8,"Knock"
+360,/m/07qcpgn,"Tap"
+361,/m/07q6cd_,"Squeak"
+362,/m/0642b4,"Cupboard open or close"
+363,/m/0fqfqc,"Drawer open or close"
+364,/m/04brg2,"Dishes, pots, and pans"
+365,/m/023pjk,"Cutlery, silverware"
+366,/m/07pn_8q,"Chopping (food)"
+367,/m/0dxrf,"Frying (food)"
+368,/m/0fx9l,"Microwave oven"
+369,/m/02pjr4,"Blender"
+370,/m/02jz0l,"Water tap, faucet"
+371,/m/0130jx,"Sink (filling or washing)"
+372,/m/03dnzn,"Bathtub (filling or washing)"
+373,/m/03wvsk,"Hair dryer"
+374,/m/01jt3m,"Toilet flush"
+375,/m/012xff,"Toothbrush"
+376,/m/04fgwm,"Electric toothbrush"
+377,/m/0d31p,"Vacuum cleaner"
+378,/m/01s0vc,"Zipper (clothing)"
+379,/m/03v3yw,"Keys jangling"
+380,/m/0242l,"Coin (dropping)"
+381,/m/01lsmm,"Scissors"
+382,/m/02g901,"Electric shaver, electric razor"
+383,/m/05rj2,"Shuffling cards"
+384,/m/0316dw,"Typing"
+385,/m/0c2wf,"Typewriter"
+386,/m/01m2v,"Computer keyboard"
+387,/m/081rb,"Writing"
+388,/m/07pp_mv,"Alarm"
+389,/m/07cx4,"Telephone"
+390,/m/07pp8cl,"Telephone bell ringing"
+391,/m/01hnzm,"Ringtone"
+392,/m/02c8p,"Telephone dialing, DTMF"
+393,/m/015jpf,"Dial tone"
+394,/m/01z47d,"Busy signal"
+395,/m/046dlr,"Alarm clock"
+396,/m/03kmc9,"Siren"
+397,/m/0dgbq,"Civil defense siren"
+398,/m/030rvx,"Buzzer"
+399,/m/01y3hg,"Smoke detector, smoke alarm"
+400,/m/0c3f7m,"Fire alarm"
+401,/m/04fq5q,"Foghorn"
+402,/m/0l156k,"Whistle"
+403,/m/06hck5,"Steam whistle"
+404,/t/dd00077,"Mechanisms"
+405,/m/02bm9n,"Ratchet, pawl"
+406,/m/01x3z,"Clock"
+407,/m/07qjznt,"Tick"
+408,/m/07qjznl,"Tick-tock"
+409,/m/0l7xg,"Gears"
+410,/m/05zc1,"Pulleys"
+411,/m/0llzx,"Sewing machine"
+412,/m/02x984l,"Mechanical fan"
+413,/m/025wky1,"Air conditioning"
+414,/m/024dl,"Cash register"
+415,/m/01m4t,"Printer"
+416,/m/0dv5r,"Camera"
+417,/m/07bjf,"Single-lens reflex camera"
+418,/m/07k1x,"Tools"
+419,/m/03l9g,"Hammer"
+420,/m/03p19w,"Jackhammer"
+421,/m/01b82r,"Sawing"
+422,/m/02p01q,"Filing (rasp)"
+423,/m/023vsd,"Sanding"
+424,/m/0_ksk,"Power tool"
+425,/m/01d380,"Drill"
+426,/m/014zdl,"Explosion"
+427,/m/032s66,"Gunshot, gunfire"
+428,/m/04zjc,"Machine gun"
+429,/m/02z32qm,"Fusillade"
+430,/m/0_1c,"Artillery fire"
+431,/m/073cg4,"Cap gun"
+432,/m/0g6b5,"Fireworks"
+433,/g/122z_qxw,"Firecracker"
+434,/m/07qsvvw,"Burst, pop"
+435,/m/07pxg6y,"Eruption"
+436,/m/07qqyl4,"Boom"
+437,/m/083vt,"Wood"
+438,/m/07pczhz,"Chop"
+439,/m/07pl1bw,"Splinter"
+440,/m/07qs1cx,"Crack"
+441,/m/039jq,"Glass"
+442,/m/07q7njn,"Chink, clink"
+443,/m/07rn7sz,"Shatter"
+444,/m/04k94,"Liquid"
+445,/m/07rrlb6,"Splash, splatter"
+446,/m/07p6mqd,"Slosh"
+447,/m/07qlwh6,"Squish"
+448,/m/07r5v4s,"Drip"
+449,/m/07prgkl,"Pour"
+450,/m/07pqc89,"Trickle, dribble"
+451,/t/dd00088,"Gush"
+452,/m/07p7b8y,"Fill (with liquid)"
+453,/m/07qlf79,"Spray"
+454,/m/07ptzwd,"Pump (liquid)"
+455,/m/07ptfmf,"Stir"
+456,/m/0dv3j,"Boiling"
+457,/m/0790c,"Sonar"
+458,/m/0dl83,"Arrow"
+459,/m/07rqsjt,"Whoosh, swoosh, swish"
+460,/m/07qnq_y,"Thump, thud"
+461,/m/07rrh0c,"Thunk"
+462,/m/0b_fwt,"Electronic tuner"
+463,/m/02rr_,"Effects unit"
+464,/m/07m2kt,"Chorus effect"
+465,/m/018w8,"Basketball bounce"
+466,/m/07pws3f,"Bang"
+467,/m/07ryjzk,"Slap, smack"
+468,/m/07rdhzs,"Whack, thwack"
+469,/m/07pjjrj,"Smash, crash"
+470,/m/07pc8lb,"Breaking"
+471,/m/07pqn27,"Bouncing"
+472,/m/07rbp7_,"Whip"
+473,/m/07pyf11,"Flap"
+474,/m/07qb_dv,"Scratch"
+475,/m/07qv4k0,"Scrape"
+476,/m/07pdjhy,"Rub"
+477,/m/07s8j8t,"Roll"
+478,/m/07plct2,"Crushing"
+479,/t/dd00112,"Crumpling, crinkling"
+480,/m/07qcx4z,"Tearing"
+481,/m/02fs_r,"Beep, bleep"
+482,/m/07qwdck,"Ping"
+483,/m/07phxs1,"Ding"
+484,/m/07rv4dm,"Clang"
+485,/m/07s02z0,"Squeal"
+486,/m/07qh7jl,"Creak"
+487,/m/07qwyj0,"Rustle"
+488,/m/07s34ls,"Whir"
+489,/m/07qmpdm,"Clatter"
+490,/m/07p9k1k,"Sizzle"
+491,/m/07qc9xj,"Clicking"
+492,/m/07rwm0c,"Clickety-clack"
+493,/m/07phhsh,"Rumble"
+494,/m/07qyrcz,"Plop"
+495,/m/07qfgpx,"Jingle, tinkle"
+496,/m/07rcgpl,"Hum"
+497,/m/07p78v5,"Zing"
+498,/t/dd00121,"Boing"
+499,/m/07s12q4,"Crunch"
+500,/m/028v0c,"Silence"
+501,/m/01v_m0,"Sine wave"
+502,/m/0b9m1,"Harmonic"
+503,/m/0hdsk,"Chirp tone"
+504,/m/0c1dj,"Sound effect"
+505,/m/07pt_g0,"Pulse"
+506,/t/dd00125,"Inside, small room"
+507,/t/dd00126,"Inside, large room or hall"
+508,/t/dd00127,"Inside, public space"
+509,/t/dd00128,"Outside, urban or manmade"
+510,/t/dd00129,"Outside, rural or natural"
+511,/m/01b9nn,"Reverberation"
+512,/m/01jnbd,"Echo"
+513,/m/096m7z,"Noise"
+514,/m/06_y0by,"Environmental noise"
+515,/m/07rgkc5,"Static"
+516,/m/06xkwv,"Mains hum"
+517,/m/0g12c5,"Distortion"
+518,/m/08p9q4,"Sidetone"
+519,/m/07szfh9,"Cacophony"
+520,/m/0chx_,"White noise"
+521,/m/0cj0r,"Pink noise"
+522,/m/07p_0gm,"Throbbing"
+523,/m/01jwx6,"Vibration"
+524,/m/07c52,"Television"
+525,/m/06bz3,"Radio"
+526,/m/07hvw1,"Field recording"
diff --git a/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc b/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4699888fb861c2ffee9c8575b4116eba8e7a41b6
Binary files /dev/null and b/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc b/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4489e8b01c6cced77a08735295746c01e8f831
Binary files /dev/null and b/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/pytorch/evaluate.py b/audio_detection/audio_infer/pytorch/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f1fa38eedd9e9cd2580143ceb92aba8f81becf3
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/evaluate.py
@@ -0,0 +1,42 @@
+from sklearn import metrics
+
+from pytorch_utils import forward
+
+
+class Evaluator(object):
+ def __init__(self, model):
+ """Evaluator.
+
+ Args:
+ model: object
+ """
+ self.model = model
+
+ def evaluate(self, data_loader):
+ """Forward evaluation data and calculate statistics.
+
+ Args:
+ data_loader: object
+
+ Returns:
+ statistics: dict,
+ {'average_precision': (classes_num,), 'auc': (classes_num,)}
+ """
+
+ # Forward
+ output_dict = forward(
+ model=self.model,
+ generator=data_loader,
+ return_target=True)
+
+ clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num)
+ target = output_dict['target'] # (audios_num, classes_num)
+
+ average_precision = metrics.average_precision_score(
+ target, clipwise_output, average=None)
+
+ auc = metrics.roc_auc_score(target, clipwise_output, average=None)
+
+ statistics = {'average_precision': average_precision, 'auc': auc}
+
+ return statistics
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/finetune_template.py b/audio_detection/audio_infer/pytorch/finetune_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd43e462c47857f805b1ef4d345711354a1cff3d
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/finetune_template.py
@@ -0,0 +1,127 @@
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
+import numpy as np
+import argparse
+import h5py
+import math
+import time
+import logging
+import matplotlib.pyplot as plt
+
+import torch
+torch.backends.cudnn.benchmark=True
+torch.manual_seed(0)
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data
+
+from utilities import get_filename
+from models import *
+import config
+
+
+class Transfer_Cnn14(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num, freeze_base):
+ """Classifier for a new task using pretrained Cnn14 as a sub module.
+ """
+ super(Transfer_Cnn14, self).__init__()
+ audioset_classes_num = 527
+
+ self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, audioset_classes_num)
+
+ # Transfer to another task layer
+ self.fc_transfer = nn.Linear(2048, classes_num, bias=True)
+
+ if freeze_base:
+ # Freeze AudioSet pretrained layers
+ for param in self.base.parameters():
+ param.requires_grad = False
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_layer(self.fc_transfer)
+
+ def load_from_pretrain(self, pretrained_checkpoint_path):
+ checkpoint = torch.load(pretrained_checkpoint_path)
+ self.base.load_state_dict(checkpoint['model'])
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, data_length)
+ """
+ output_dict = self.base(input, mixup_lambda)
+ embedding = output_dict['embedding']
+
+ clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1)
+ output_dict['clipwise_output'] = clipwise_output
+
+ return output_dict
+
+
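+# A minimal usage sketch (the hyperparameter values and the checkpoint filename
+# below are illustrative placeholders, not recommended settings):
+#
+#     model = Transfer_Cnn14(sample_rate=32000, window_size=1024, hop_size=320,
+#         mel_bins=64, fmin=50, fmax=14000, classes_num=10, freeze_base=True)
+#     model.load_from_pretrain('<pretrained_checkpoint.pth>')
+#     output_dict = model(torch.zeros(1, 32000))
+
+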
+def train(args):
+
+    # Arguments & parameters
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ pretrained_checkpoint_path = args.pretrained_checkpoint_path
+ freeze_base = args.freeze_base
+ device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu'
+
+ classes_num = config.classes_num
+ pretrain = True if pretrained_checkpoint_path else False
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax,
+ classes_num, freeze_base)
+
+ # Load pretrained model
+ if pretrain:
+ logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path))
+ model.load_from_pretrain(pretrained_checkpoint_path)
+
+ # Parallel
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+
+ if 'cuda' in device:
+ model.to(device)
+
+    print('Loaded pretrained model successfully!')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Example of parser. ')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ # Train
+ parser_train = subparsers.add_parser('train')
+ parser_train.add_argument('--sample_rate', type=int, required=True)
+ parser_train.add_argument('--window_size', type=int, required=True)
+ parser_train.add_argument('--hop_size', type=int, required=True)
+ parser_train.add_argument('--mel_bins', type=int, required=True)
+ parser_train.add_argument('--fmin', type=int, required=True)
+ parser_train.add_argument('--fmax', type=int, required=True)
+ parser_train.add_argument('--model_type', type=str, required=True)
+ parser_train.add_argument('--pretrained_checkpoint_path', type=str)
+ parser_train.add_argument('--freeze_base', action='store_true', default=False)
+ parser_train.add_argument('--cuda', action='store_true', default=False)
+
+ # Parse arguments
+ args = parser.parse_args()
+ args.filename = get_filename(__file__)
+
+ if args.mode == 'train':
+ train(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/inference.py b/audio_detection/audio_infer/pytorch/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..49dc75f740aec7be287eab70bae1f7677ccc4662
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/inference.py
@@ -0,0 +1,206 @@
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
+import numpy as np
+import argparse
+import librosa
+import matplotlib.pyplot as plt
+import torch
+
+from utilities import create_folder, get_filename
+from models import *
+from pytorch_utils import move_data_to_device
+import config
+
+def audio_tagging(args):
+ """Inference audio tagging result of an audio clip.
+ """
+
+    # Arguments & parameters
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ checkpoint_path = args.checkpoint_path
+ audio_path = args.audio_path
+ device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
+
+ classes_num = config.classes_num
+ labels = config.labels
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate=sample_rate, window_size=window_size,
+ hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
+ classes_num=classes_num)
+
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+ model.load_state_dict(checkpoint['model'])
+
+ # Parallel
+ if 'cuda' in str(device):
+ model.to(device)
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+ else:
+ print('Using CPU.')
+
+ # Load audio
+ (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+
+ waveform = waveform[None, :] # (1, audio_length)
+ waveform = move_data_to_device(waveform, device)
+
+ # Forward
+ with torch.no_grad():
+ model.eval()
+ batch_output_dict = model(waveform, None)
+
+ clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0]
+ """(classes_num,)"""
+
+ sorted_indexes = np.argsort(clipwise_output)[::-1]
+
+ # Print audio tagging top probabilities
+ for k in range(10):
+ print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
+ clipwise_output[sorted_indexes[k]]))
+
+ # Print embedding
+ if 'embedding' in batch_output_dict.keys():
+ embedding = batch_output_dict['embedding'].data.cpu().numpy()[0]
+ print('embedding: {}'.format(embedding.shape))
+
+ return clipwise_output, labels
+
+
+def sound_event_detection(args):
+ """Inference sound event detection result of an audio clip.
+ """
+
+    # Arguments & parameters
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ checkpoint_path = args.checkpoint_path
+ audio_path = args.audio_path
+ device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
+
+ classes_num = config.classes_num
+ labels = config.labels
+ frames_per_second = sample_rate // hop_size
+
+ # Paths
+ fig_path = os.path.join('results', '{}.png'.format(get_filename(audio_path)))
+ create_folder(os.path.dirname(fig_path))
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate=sample_rate, window_size=window_size,
+ hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
+ classes_num=classes_num)
+
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+ model.load_state_dict(checkpoint['model'])
+
+ # Parallel
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+
+ if 'cuda' in str(device):
+ model.to(device)
+
+ # Load audio
+ (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+
+ waveform = waveform[None, :] # (1, audio_length)
+ waveform = move_data_to_device(waveform, device)
+
+ # Forward
+ with torch.no_grad():
+ model.eval()
+ batch_output_dict = model(waveform, None)
+
+ framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
+ """(time_steps, classes_num)"""
+
+ print('Sound event detection result (time_steps x classes_num): {}'.format(
+ framewise_output.shape))
+
+ sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]
+
+ top_k = 10 # Show top results
+ top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]]
+ """(time_steps, top_k)"""
+
+ # Plot result
+ stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=window_size,
+ hop_length=hop_size, window='hann', center=True)
+ frames_num = stft.shape[-1]
+
+ fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
+ axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
+ axs[0].set_ylabel('Frequency bins')
+ axs[0].set_title('Log spectrogram')
+ axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
+ axs[1].xaxis.set_ticks(np.arange(0, frames_num, frames_per_second))
+ axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / frames_per_second))
+ axs[1].yaxis.set_ticks(np.arange(0, top_k))
+ axs[1].yaxis.set_ticklabels(np.array(labels)[sorted_indexes[0 : top_k]])
+ axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
+ axs[1].set_xlabel('Seconds')
+ axs[1].xaxis.set_ticks_position('bottom')
+
+ plt.tight_layout()
+ plt.savefig(fig_path)
+ print('Save sound event detection visualization to {}'.format(fig_path))
+
+ return framewise_output, labels
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='Example of parser. ')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_at = subparsers.add_parser('audio_tagging')
+ parser_at.add_argument('--sample_rate', type=int, default=32000)
+ parser_at.add_argument('--window_size', type=int, default=1024)
+ parser_at.add_argument('--hop_size', type=int, default=320)
+ parser_at.add_argument('--mel_bins', type=int, default=64)
+ parser_at.add_argument('--fmin', type=int, default=50)
+ parser_at.add_argument('--fmax', type=int, default=14000)
+ parser_at.add_argument('--model_type', type=str, required=True)
+ parser_at.add_argument('--checkpoint_path', type=str, required=True)
+ parser_at.add_argument('--audio_path', type=str, required=True)
+ parser_at.add_argument('--cuda', action='store_true', default=False)
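+
+    # Example invocation for this sub-command (checkpoint and audio paths are
+    # placeholders; the model type can be any class defined in models.py):
+    #   python inference.py audio_tagging --model_type=Cnn14 \
+    #       --checkpoint_path=<checkpoint.pth> --audio_path=<clip.wav> --cuda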
+
+ parser_sed = subparsers.add_parser('sound_event_detection')
+ parser_sed.add_argument('--sample_rate', type=int, default=32000)
+ parser_sed.add_argument('--window_size', type=int, default=1024)
+ parser_sed.add_argument('--hop_size', type=int, default=320)
+ parser_sed.add_argument('--mel_bins', type=int, default=64)
+ parser_sed.add_argument('--fmin', type=int, default=50)
+ parser_sed.add_argument('--fmax', type=int, default=14000)
+ parser_sed.add_argument('--model_type', type=str, required=True)
+ parser_sed.add_argument('--checkpoint_path', type=str, required=True)
+ parser_sed.add_argument('--audio_path', type=str, required=True)
+ parser_sed.add_argument('--cuda', action='store_true', default=False)
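+
+    # Example invocation (placeholders as above; the chosen model should expose
+    # a 'framewise_output', e.g. a decision-level variant such as
+    # Cnn14_DecisionLevelMax):
+    #   python inference.py sound_event_detection --model_type=Cnn14_DecisionLevelMax \
+    #       --checkpoint_path=<checkpoint.pth> --audio_path=<clip.wav> --cuda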
+
+ args = parser.parse_args()
+
+ if args.mode == 'audio_tagging':
+ audio_tagging(args)
+
+ elif args.mode == 'sound_event_detection':
+ sound_event_detection(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/losses.py b/audio_detection/audio_infer/pytorch/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..587e8a64f2593e4a72c1a29cf374c1e24e20c366
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/losses.py
@@ -0,0 +1,14 @@
+import torch
+import torch.nn.functional as F
+
+
+def clip_bce(output_dict, target_dict):
+ """Binary crossentropy loss.
+ """
+ return F.binary_cross_entropy(
+ output_dict['clipwise_output'], target_dict['target'])
+
+
+def get_loss_func(loss_type):
+ if loss_type == 'clip_bce':
+ return clip_bce
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/main.py b/audio_detection/audio_infer/pytorch/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..358293521706ff525f6f1b1274085a08236394ff
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/main.py
@@ -0,0 +1,378 @@
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
+import numpy as np
+import argparse
+import time
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data
+
+from utilities import (create_folder, get_filename, create_logging, Mixup,
+ StatisticsContainer)
+from models import (PVT, PVT2, PVT_lr, PVT_nopretrain, PVT_2layer, Cnn14, Cnn14_no_specaug, Cnn14_no_dropout,
+ Cnn6, Cnn10, ResNet22, ResNet38, ResNet54, Cnn14_emb512, Cnn14_emb128,
+ Cnn14_emb32, MobileNetV1, MobileNetV2, LeeNet11, LeeNet24, DaiNet19,
+ Res1dNet31, Res1dNet51, Wavegram_Cnn14, Wavegram_Logmel_Cnn14,
+ Wavegram_Logmel128_Cnn14, Cnn14_16k, Cnn14_8k, Cnn14_mel32, Cnn14_mel128,
+ Cnn14_mixup_time_domain, Cnn14_DecisionLevelMax, Cnn14_DecisionLevelAtt, Cnn6_Transformer, GLAM, GLAM2, GLAM3, Cnn4, EAT)
+#from models_test import (PVT_test)
+#from models1 import (PVT1)
+#from models_vig import (VIG, VIG2)
+#from models_vvt import (VVT)
+#from models2 import (MPVIT, MPVIT2)
+#from models_reshape import (PVT_reshape, PVT_tscam)
+#from models_swin import (Swin, Swin_nopretrain)
+#from models_swin2 import (Swin2)
+#from models_van import (Van, Van_tiny)
+#from models_focal import (Focal)
+#from models_cross import (Cross)
+#from models_cov import (Cov)
+#from models_cnn import (Cnn_light)
+#from models_twins import (Twins)
+#from models_cmt import (Cmt, Cmt1)
+#from models_shunted import (Shunted)
+#from models_quadtree import (Quadtree, Quadtree2, Quadtree_nopretrain)
+#from models_davit import (Davit_tscam, Davit, Davit_nopretrain)
+from pytorch_utils import (move_data_to_device, count_parameters, count_flops,
+ do_mixup)
+from data_generator import (AudioSetDataset, TrainSampler, BalancedTrainSampler,
+ AlternateTrainSampler, EvaluateSampler, collate_fn)
+from evaluate import Evaluator
+import config
+from losses import get_loss_func
+
+
+def train(args):
+ """Train AudioSet tagging model.
+
+ Args:
+ dataset_dir: str
+ workspace: str
+ data_type: 'balanced_train' | 'full_train'
+ window_size: int
+ hop_size: int
+ mel_bins: int
+ model_type: str
+ loss_type: 'clip_bce'
+ balanced: 'none' | 'balanced' | 'alternate'
+ augmentation: 'none' | 'mixup'
+ batch_size: int
+ learning_rate: float
+ resume_iteration: int
+ early_stop: int
+ accumulation_steps: int
+ cuda: bool
+ """
+
+    # Arguments & parameters
+ workspace = args.workspace
+ data_type = args.data_type
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ loss_type = args.loss_type
+ balanced = args.balanced
+ augmentation = args.augmentation
+ batch_size = args.batch_size
+ learning_rate = args.learning_rate
+ resume_iteration = args.resume_iteration
+ early_stop = args.early_stop
+ device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
+ filename = args.filename
+
+ num_workers = 8
+ clip_samples = config.clip_samples
+ classes_num = config.classes_num
+ loss_func = get_loss_func(loss_type)
+
+ # Paths
+ black_list_csv = None
+
+ train_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
+ '{}.h5'.format(data_type))
+
+ eval_bal_indexes_hdf5_path = os.path.join(workspace,
+ 'hdf5s', 'indexes', 'balanced_train.h5')
+
+ eval_test_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
+ 'eval.h5')
+
+ checkpoints_dir = os.path.join(workspace, 'checkpoints', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
+ create_folder(checkpoints_dir)
+
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+ create_folder(os.path.dirname(statistics_path))
+
+ logs_dir = os.path.join(workspace, 'logs', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
+
+ create_logging(logs_dir, filemode='w')
+ logging.info(args)
+
+ if 'cuda' in str(device):
+ logging.info('Using GPU.')
+ device = 'cuda'
+ else:
+ logging.info('Using CPU. Set --cuda flag to use GPU.')
+ device = 'cpu'
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate=sample_rate, window_size=window_size,
+ hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
+ classes_num=classes_num)
+ total = sum(p.numel() for p in model.parameters())
+ print("Total params: %.2fM" % (total/1e6))
+ logging.info("Total params: %.2fM" % (total/1e6))
+ #params_num = count_parameters(model)
+ # flops_num = count_flops(model, clip_samples)
+ #logging.info('Parameters num: {}'.format(params_num))
+ # logging.info('Flops num: {:.3f} G'.format(flops_num / 1e9))
+
+    # Dataset will be used by DataLoader later. Dataset takes a meta dict as
+    # input and returns a waveform and a target.
+ dataset = AudioSetDataset(sample_rate=sample_rate)
+
+ # Train sampler
+ if balanced == 'none':
+ Sampler = TrainSampler
+ elif balanced == 'balanced':
+ Sampler = BalancedTrainSampler
+ elif balanced == 'alternate':
+ Sampler = AlternateTrainSampler
+
+ train_sampler = Sampler(
+ indexes_hdf5_path=train_indexes_hdf5_path,
+ batch_size=batch_size * 2 if 'mixup' in augmentation else batch_size,
+ black_list_csv=black_list_csv)
+
+ # Evaluate sampler
+ eval_bal_sampler = EvaluateSampler(
+ indexes_hdf5_path=eval_bal_indexes_hdf5_path, batch_size=batch_size)
+
+ eval_test_sampler = EvaluateSampler(
+ indexes_hdf5_path=eval_test_indexes_hdf5_path, batch_size=batch_size)
+
+ # Data loader
+ train_loader = torch.utils.data.DataLoader(dataset=dataset,
+ batch_sampler=train_sampler, collate_fn=collate_fn,
+ num_workers=num_workers, pin_memory=True)
+
+ eval_bal_loader = torch.utils.data.DataLoader(dataset=dataset,
+ batch_sampler=eval_bal_sampler, collate_fn=collate_fn,
+ num_workers=num_workers, pin_memory=True)
+
+ eval_test_loader = torch.utils.data.DataLoader(dataset=dataset,
+ batch_sampler=eval_test_sampler, collate_fn=collate_fn,
+ num_workers=num_workers, pin_memory=True)
+    mix = 0.5
+ if 'mixup' in augmentation:
+ mixup_augmenter = Mixup(mixup_alpha=mix)
+ print(mix)
+ logging.info(mix)
+
+ # Evaluator
+ evaluator = Evaluator(model=model)
+
+ # Statistics
+ statistics_container = StatisticsContainer(statistics_path)
+
+ # Optimizer
+ optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.05, amsgrad=True)
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, min_lr=1e-06, verbose=True)
+ train_bgn_time = time.time()
+
+ # Resume training
+ if resume_iteration > 0:
+ resume_checkpoint_path = os.path.join(workspace, 'checkpoints', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ '{}_iterations.pth'.format(resume_iteration))
+
+ logging.info('Loading checkpoint {}'.format(resume_checkpoint_path))
+ checkpoint = torch.load(resume_checkpoint_path)
+ model.load_state_dict(checkpoint['model'])
+ train_sampler.load_state_dict(checkpoint['sampler'])
+ statistics_container.load_state_dict(resume_iteration)
+ iteration = checkpoint['iteration']
+
+ else:
+ iteration = 0
+
+ # Parallel
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+
+ if 'cuda' in str(device):
+ model.to(device)
+
+ if resume_iteration:
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ scheduler.load_state_dict(checkpoint['scheduler'])
+ print(optimizer.state_dict()['param_groups'][0]['lr'])
+
+ time1 = time.time()
+
+ for batch_data_dict in train_loader:
+ """batch_data_dict: {
+ 'audio_name': (batch_size [*2 if mixup],),
+ 'waveform': (batch_size [*2 if mixup], clip_samples),
+ 'target': (batch_size [*2 if mixup], classes_num),
+        (if exists) 'mixup_lambda': (batch_size * 2,)}
+ """
+
+ # Evaluate
+ if (iteration % 2000 == 0 and iteration >= resume_iteration) or (iteration == 0):
+ train_fin_time = time.time()
+
+ bal_statistics = evaluator.evaluate(eval_bal_loader)
+ test_statistics = evaluator.evaluate(eval_test_loader)
+
+ logging.info('Validate bal mAP: {:.3f}'.format(
+ np.mean(bal_statistics['average_precision'])))
+
+ logging.info('Validate test mAP: {:.3f}'.format(
+ np.mean(test_statistics['average_precision'])))
+
+ statistics_container.append(iteration, bal_statistics, data_type='bal')
+ statistics_container.append(iteration, test_statistics, data_type='test')
+ statistics_container.dump()
+
+ train_time = train_fin_time - train_bgn_time
+ validate_time = time.time() - train_fin_time
+
+ logging.info(
+ 'iteration: {}, train time: {:.3f} s, validate time: {:.3f} s'
+ ''.format(iteration, train_time, validate_time))
+
+ logging.info('------------------------------------')
+
+ train_bgn_time = time.time()
+
+ # Save model
+ if iteration % 2000 == 0:
+ checkpoint = {
+ 'iteration': iteration,
+ 'model': model.module.state_dict(),
+ 'sampler': train_sampler.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'scheduler': scheduler.state_dict()}
+
+ checkpoint_path = os.path.join(
+ checkpoints_dir, '{}_iterations.pth'.format(iteration))
+
+ torch.save(checkpoint, checkpoint_path)
+ logging.info('Model saved to {}'.format(checkpoint_path))
+
+ # Mixup lambda
+ if 'mixup' in augmentation:
+ batch_data_dict['mixup_lambda'] = mixup_augmenter.get_lambda(
+ batch_size=len(batch_data_dict['waveform']))
+
+ # Move data to device
+ for key in batch_data_dict.keys():
+ batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device)
+
+ # Forward
+ model.train()
+
+ if 'mixup' in augmentation:
+ batch_output_dict = model(batch_data_dict['waveform'],
+ batch_data_dict['mixup_lambda'])
+ """{'clipwise_output': (batch_size, classes_num), ...}"""
+
+ batch_target_dict = {'target': do_mixup(batch_data_dict['target'],
+ batch_data_dict['mixup_lambda'])}
+ """{'target': (batch_size, classes_num)}"""
+ else:
+ batch_output_dict = model(batch_data_dict['waveform'], None)
+ """{'clipwise_output': (batch_size, classes_num), ...}"""
+
+ batch_target_dict = {'target': batch_data_dict['target']}
+ """{'target': (batch_size, classes_num)}"""
+
+ # Loss
+ loss = loss_func(batch_output_dict, batch_target_dict)
+ # Backward
+ loss.backward()
+
+ optimizer.step()
+ optimizer.zero_grad()
+
+ if iteration % 10 == 0:
+ print(iteration, loss)
+ #print('--- Iteration: {}, train time: {:.3f} s / 10 iterations ---'\
+ # .format(iteration, time.time() - time1))
+ #time1 = time.time()
+
+ if iteration % 2000 == 0:
+ scheduler.step(np.mean(test_statistics['average_precision']))
+ print(optimizer.state_dict()['param_groups'][0]['lr'])
+ logging.info(optimizer.state_dict()['param_groups'][0]['lr'])
+
+ # Stop learning
+ if iteration == early_stop:
+ break
+
+ iteration += 1
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='Example of parser. ')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_train = subparsers.add_parser('train')
+ parser_train.add_argument('--workspace', type=str, required=True)
+ parser_train.add_argument('--data_type', type=str, default='full_train', choices=['balanced_train', 'full_train'])
+ parser_train.add_argument('--sample_rate', type=int, default=32000)
+ parser_train.add_argument('--window_size', type=int, default=1024)
+ parser_train.add_argument('--hop_size', type=int, default=320)
+ parser_train.add_argument('--mel_bins', type=int, default=64)
+ parser_train.add_argument('--fmin', type=int, default=50)
+ parser_train.add_argument('--fmax', type=int, default=14000)
+ parser_train.add_argument('--model_type', type=str, required=True)
+ parser_train.add_argument('--loss_type', type=str, default='clip_bce', choices=['clip_bce'])
+ parser_train.add_argument('--balanced', type=str, default='balanced', choices=['none', 'balanced', 'alternate'])
+ parser_train.add_argument('--augmentation', type=str, default='mixup', choices=['none', 'mixup'])
+ parser_train.add_argument('--batch_size', type=int, default=32)
+ parser_train.add_argument('--learning_rate', type=float, default=1e-3)
+ parser_train.add_argument('--resume_iteration', type=int, default=0)
+ parser_train.add_argument('--early_stop', type=int, default=1000000)
+ parser_train.add_argument('--cuda', action='store_true', default=False)
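+
+    # Example invocation (the workspace path is a placeholder; model_type must
+    # be one of the classes imported from models above, e.g. PVT or Cnn14):
+    #   python main.py train --workspace=<workspace_dir> --model_type=PVT \
+    #       --balanced=balanced --augmentation=mixup --batch_size=32 --cuda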
+
+ args = parser.parse_args()
+ args.filename = get_filename(__file__)
+
+ if args.mode == 'train':
+ train(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/models.py b/audio_detection/audio_infer/pytorch/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cf5456d1ee9a26a4afe58cea2b11ad78033e01e
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/models.py
@@ -0,0 +1,951 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+from torchlibrosa.augmentation import SpecAugmentation
+
+from audio_infer.pytorch.pytorch_utils import do_mixup, interpolate, pad_framewise_output
+import os
+import sys
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+from torchlibrosa.augmentation import SpecAugmentation
+from audio_infer.pytorch.pytorch_utils import do_mixup
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+import warnings
+from functools import partial
+#from mmdet.models.builder import BACKBONES
+from mmdet.utils import get_root_logger
+from mmcv.runner import load_checkpoint
+os.environ['TORCH_HOME'] = '../pretrained_models'
+from copy import deepcopy
+from timm.models.helpers import load_pretrained
+from torch.cuda.amp import autocast
+from collections import OrderedDict
+import io
+import re
+from mmcv.runner import _load_checkpoint, load_state_dict
+import mmcv.runner
+import copy
+import random
+from einops import rearrange
+from einops.layers.torch import Rearrange, Reduce
+from torch import nn, einsum
+
+
+def load_checkpoint(model,
+ filename,
+ map_location=None,
+ strict=False,
+ logger=None,
+ revise_keys=[(r'^module\.', '')]):
+ """Load checkpoint from a file or URI.
+
+ Args:
+ model (Module): Module to load checkpoint.
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str): Same as :func:`torch.load`.
+ strict (bool): Whether to allow different params for the model and
+ checkpoint.
+ logger (:mod:`logging.Logger` or None): The logger for error message.
+ revise_keys (list): A list of customized keywords to modify the
+ state_dict in checkpoint. Each item is a (pattern, replacement)
+ pair of the regular expression operations. Default: strip
+ the prefix 'module.' by [(r'^module\\.', '')].
+
+ Returns:
+ dict or OrderedDict: The loaded checkpoint.
+ """
+
+ checkpoint = _load_checkpoint(filename, map_location, logger)
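+    # Adapt the ImageNet-pretrained 3-channel patch embedding to the 1-channel
+    # log-mel input by summing the kernel weights over the RGB channel axis.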
+ new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ # OrderedDict is a subclass of dict
+ if not isinstance(checkpoint, dict):
+ raise RuntimeError(
+ f'No state_dict found in checkpoint file {filename}')
+ # get state_dict from checkpoint
+ if 'state_dict' in checkpoint:
+ state_dict = checkpoint['state_dict']
+ else:
+ state_dict = checkpoint
+
+ # strip prefix of state_dict
+ metadata = getattr(state_dict, '_metadata', OrderedDict())
+ for p, r in revise_keys:
+ state_dict = OrderedDict(
+ {re.sub(p, r, k): v
+ for k, v in state_dict.items()})
+ state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()})
+ # Keep metadata in state_dict
+ state_dict._metadata = metadata
+
+ # load state_dict
+ load_state_dict(model, state_dict, strict, logger)
+ return checkpoint
+
+def init_layer(layer):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.xavier_uniform_(layer.weight)
+
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+
+def init_bn(bn):
+ """Initialize a Batchnorm layer. """
+ bn.bias.data.fill_(0.)
+ bn.weight.data.fill_(1.)
+
+
+
+
+class TimeShift(nn.Module):
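+    """Randomly roll the input along the time axis (dims=2) by an integer
+    number of frames drawn from N(mean, std); applied only in training mode."""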
+ def __init__(self, mean, std):
+ super().__init__()
+ self.mean = mean
+ self.std = std
+
+ def forward(self, x):
+ if self.training:
+ shift = torch.empty(1).normal_(self.mean, self.std).int().item()
+ x = torch.roll(x, shift, dims=2)
+ return x
+
+class LinearSoftPool(nn.Module):
+ """LinearSoftPool
+    Linear softmax pooling: takes frame-level probabilities and returns a
+    clip-level probability close to the actual maximum value.
+ Taken from the paper:
+ A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
+ https://arxiv.org/abs/1810.09050
+ """
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, time_decision):
+ return (time_decision**2).sum(self.pooldim) / time_decision.sum(
+ self.pooldim)
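+
+# A minimal sketch of the pooling behaviour (shapes are illustrative): per
+# class it computes sum_t p_t**2 / sum_t p_t over the pooling dimension, so
+# each frame is weighted by its own probability.
+#
+#     pool = LinearSoftPool(pooldim=1)
+#     clip_prob = pool(None, torch.rand(4, 100, 527))  # -> (4, 527)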
+
+class PVT(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ #self.temp_pool = LinearSoftPool()
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
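+        # interpolate_ratio is assumed to match the overall time downsampling
+        # of the PVT backbone, so the frame-wise predictions are upsampled by
+        # the same factor below to realign them with the input frames.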
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ #clipwise_output = torch.mean(framewise_output, dim=1)
+ #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ x = framewise_output.transpose(1, 2).contiguous()
+ x = self.avgpool(x)
+ clipwise_output = torch.flatten(x, 1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+class PVT2(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT2, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ #self.temp_pool = LinearSoftPool()
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ #x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ clipwise_output = torch.mean(framewise_output, dim=1)
+ #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+class PVT_2layer(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT_2layer, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128],
+ depths=[3, 4],
+ num_heads=[1, 2],
+ mlp_ratios=[8, 8],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=2,
+ pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ #self.temp_pool = LinearSoftPool()
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
+ self.fc_audioset = nn.Linear(128, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 8
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ #clipwise_output = torch.mean(framewise_output, dim=1)
+ #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ x = framewise_output.transpose(1, 2).contiguous()
+ x = self.avgpool(x)
+ clipwise_output = torch.flatten(x, 1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+class PVT_lr(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT_lr, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ self.temp_pool = LinearSoftPool()
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+
+class PVT_nopretrain(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT_nopretrain, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ self.temp_pool = LinearSoftPool()
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.dwconv = DWConv(hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+ self.linear = linear
+ if self.linear:
+ self.relu = nn.ReLU()
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x, H, W):
+ x = self.fc1(x)
+ if self.linear:
+ x = self.relu(x)
+ x = self.dwconv(x, H, W)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False):
+ super().__init__()
+ assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+
+ self.dim = dim
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
+ self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ self.linear = linear
+ self.sr_ratio = sr_ratio
+ if not linear:
+ if sr_ratio > 1:
+ self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+ self.norm = nn.LayerNorm(dim)
+ else:
+ self.pool = nn.AdaptiveAvgPool2d(7)
+ self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
+ self.norm = nn.LayerNorm(dim)
+ self.act = nn.GELU()
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x, H, W):
+ B, N, C = x.shape
+ q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+
+ if not self.linear:
+ if self.sr_ratio > 1:
+ x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+ x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
+ x_ = self.norm(x_)
+ kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ else:
+ kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ else:
+ x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+ x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
+ x_ = self.norm(x_)
+ x_ = self.act(x_)
+ kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ k, v = kv[0], kv[1]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+
+ return x
+
+
+class Pooling(nn.Module):
+ """
+ Implementation of pooling for PoolFormer
+ --pool_size: pooling size
+ """
+ def __init__(self, pool_size=3):
+ super().__init__()
+ self.pool = nn.AvgPool2d(
+ pool_size, stride=1, padding=pool_size//2, count_include_pad=False)
+
+ def forward(self, x):
+ return self.pool(x) - x
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
+ #self.norm3 = norm_layer(dim)
+ #self.token_mixer = Pooling(pool_size=3)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x, H, W):
+ x = x + self.drop_path(self.attn(self.norm1(x), H, W))
+ x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
+ return x
+
+
+class OverlapPatchEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+
+ def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = (tdim, fdim)
+ patch_size = to_2tuple(patch_size)
+
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.H, self.W = img_size[0] // stride, img_size[1] // stride
+ self.num_patches = self.H * self.W
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
+ padding=(patch_size[0] // 3, patch_size[1] // 3))
+ self.norm = nn.LayerNorm(embed_dim)
+
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x):
+ x = self.proj(x)
+ _, _, H, W = x.shape
+ x = x.flatten(2).transpose(1, 2)
+ x = self.norm(x)
+
+ return x, H, W
+
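+# Editorial note: H and W returned by OverlapPatchEmbed.forward follow the
+# standard Conv2d size rule, out = floor((in + 2*padding - kernel) / stride) + 1.
+# For the 1001 x 64 log-mel input used by the audio models above, the first
+# stage (kernel 7, stride 4, padding 7 // 3 = 2) yields roughly 250 x 16.
+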
+
+class PyramidVisionTransformerV2(nn.Module):
+ def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
+ num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
+ attn_drop_rate=0., drop_path_rate=0.1, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3],
+ sr_ratios=[8, 4, 2, 1], num_stages=2, linear=False, pretrained=None):
+ super().__init__()
+ # self.num_classes = num_classes
+ self.depths = depths
+ self.num_stages = num_stages
+ self.linear = linear
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
+ cur = 0
+
+ for i in range(num_stages):
+ patch_embed = OverlapPatchEmbed(tdim=tdim if i == 0 else tdim // (2 ** (i + 1)),
+                                            fdim=fdim if i == 0 else fdim // (2 ** (i + 1)),
+ patch_size=7 if i == 0 else 3,
+ stride=stride if i == 0 else 2,
+ in_chans=in_chans if i == 0 else embed_dims[i - 1],
+ embed_dim=embed_dims[i])
+ block = nn.ModuleList([Block(
+ dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
+ sr_ratio=sr_ratios[i], linear=linear)
+ for j in range(depths[i])])
+ norm = norm_layer(embed_dims[i])
+ cur += depths[i]
+
+ setattr(self, f"patch_embed{i + 1}", patch_embed)
+ setattr(self, f"block{i + 1}", block)
+ setattr(self, f"norm{i + 1}", norm)
+ #self.n = nn.Linear(125, 250, bias=True)
+ # classification head
+ # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
+ self.apply(self._init_weights)
+ self.init_weights(pretrained)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def init_weights(self, pretrained=None):
+ if isinstance(pretrained, str):
+ logger = get_root_logger()
+ load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
+
+ def freeze_patch_emb(self):
+ self.patch_embed1.requires_grad = False
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ B = x.shape[0]
+
+ for i in range(self.num_stages):
+ patch_embed = getattr(self, f"patch_embed{i + 1}")
+ block = getattr(self, f"block{i + 1}")
+ norm = getattr(self, f"norm{i + 1}")
+ x, H, W = patch_embed(x)
+ #print(x.shape)
+ for blk in block:
+ x = blk(x, H, W)
+ #print(x.shape)
+ x = norm(x)
+ #if i != self.num_stages - 1:
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
+ #print(x.shape)
+ return x
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ # x = self.head(x)
+
+ return x
+
+class DWConv(nn.Module):
+ def __init__(self, dim=768):
+ super(DWConv, self).__init__()
+ self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+
+ def forward(self, x, H, W):
+ B, N, C = x.shape
+ x = x.transpose(1, 2).view(B, C, H, W)
+ x = self.dwconv(x)
+ x = x.flatten(2).transpose(1, 2)
+
+ return x
+
+
+def _conv_filter(state_dict, patch_size=16):
+ """ convert patch embedding weight from manual patchify + linear proj to conv"""
+ out_dict = {}
+ for k, v in state_dict.items():
+ if 'patch_embed.proj.weight' in k:
+ v = v.reshape((v.shape[0], 3, patch_size, patch_size))
+ out_dict[k] = v
+
+ return out_dict
diff --git a/audio_detection/audio_infer/pytorch/pytorch_utils.py b/audio_detection/audio_infer/pytorch/pytorch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a135b336866acc61e834e42e5aa9e9db3f7998ff
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/pytorch_utils.py
@@ -0,0 +1,251 @@
+import numpy as np
+import time
+import torch
+import torch.nn as nn
+
+
+def move_data_to_device(x, device):
+ if 'float' in str(x.dtype):
+ x = torch.Tensor(x)
+ elif 'int' in str(x.dtype):
+ x = torch.LongTensor(x)
+ else:
+ return x
+
+ return x.to(device)
+
+
+def do_mixup(x, mixup_lambda):
+ """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
+ (1, 3, 5, ...).
+
+ Args:
+ x: (batch_size * 2, ...)
+ mixup_lambda: (batch_size * 2,)
+
+ Returns:
+ out: (batch_size, ...)
+ """
+ out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \
+ x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1)
+ return out
+
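+# Illustrative example (not part of the original code): with a doubled batch of
+# four waveforms and mixup_lambda = [0.6, 0.4, 0.3, 0.7], do_mixup returns two
+# waveforms: 0.6 * x[0] + 0.4 * x[1] and 0.3 * x[2] + 0.7 * x[3].
+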
+
+def append_to_dict(dict, key, value):
+ if key in dict.keys():
+ dict[key].append(value)
+ else:
+ dict[key] = [value]
+
+
+def forward(model, generator, return_input=False,
+ return_target=False):
+ """Forward data to a model.
+
+ Args:
+ model: object
+ generator: object
+ return_input: bool
+ return_target: bool
+
+ Returns:
+ audio_name: (audios_num,)
+ clipwise_output: (audios_num, classes_num)
+      (if exists) segmentwise_output: (audios_num, segments_num, classes_num)
+      (if exists) framewise_output: (audios_num, frames_num, classes_num)
+ (optional) return_input: (audios_num, segment_samples)
+ (optional) return_target: (audios_num, classes_num)
+ """
+ output_dict = {}
+ device = next(model.parameters()).device
+ time1 = time.time()
+
+ # Forward data to a model in mini-batches
+ for n, batch_data_dict in enumerate(generator):
+ print(n)
+ batch_waveform = move_data_to_device(batch_data_dict['waveform'], device)
+
+ with torch.no_grad():
+ model.eval()
+ batch_output = model(batch_waveform)
+
+ append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name'])
+
+ append_to_dict(output_dict, 'clipwise_output',
+ batch_output['clipwise_output'].data.cpu().numpy())
+
+ if 'segmentwise_output' in batch_output.keys():
+ append_to_dict(output_dict, 'segmentwise_output',
+ batch_output['segmentwise_output'].data.cpu().numpy())
+
+ if 'framewise_output' in batch_output.keys():
+ append_to_dict(output_dict, 'framewise_output',
+ batch_output['framewise_output'].data.cpu().numpy())
+
+ if return_input:
+ append_to_dict(output_dict, 'waveform', batch_data_dict['waveform'])
+
+ if return_target:
+ if 'target' in batch_data_dict.keys():
+ append_to_dict(output_dict, 'target', batch_data_dict['target'])
+
+ if n % 10 == 0:
+ print(' --- Inference time: {:.3f} s / 10 iterations ---'.format(
+ time.time() - time1))
+ time1 = time.time()
+
+ for key in output_dict.keys():
+ output_dict[key] = np.concatenate(output_dict[key], axis=0)
+
+ return output_dict
+
+
+def interpolate(x, ratio):
+ """Interpolate data in time domain. This is used to compensate the
+ resolution reduction in downsampling of a CNN.
+
+ Args:
+ x: (batch_size, time_steps, classes_num)
+ ratio: int, ratio to interpolate
+
+ Returns:
+ upsampled: (batch_size, time_steps * ratio, classes_num)
+ """
+ (batch_size, time_steps, classes_num) = x.shape
+ upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
+ upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
+ return upsampled
+
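+# Illustrative example (not part of the original code): for a 10 s clip at
+# 32 kHz with hop size 320 there are about 1000 STFT frames; the two-stage PVT
+# model in this repository emits roughly 125 framewise predictions, so
+# interpolate(x, ratio=8) repeats each prediction 8 times to recover
+# approximately frame-level resolution.
+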
+
+def pad_framewise_output(framewise_output, frames_num):
+ """Pad framewise_output to the same length as input frames. The pad value
+ is the same as the value of the last frame.
+
+ Args:
+ framewise_output: (batch_size, frames_num, classes_num)
+ frames_num: int, number of frames to pad
+
+ Outputs:
+ output: (batch_size, frames_num, classes_num)
+ """
+ pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
+ """tensor for padding"""
+
+ output = torch.cat((framewise_output, pad), dim=1)
+ """(batch_size, frames_num, classes_num)"""
+
+ return output
+
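+# Editorial note: pad_framewise_output is meant to be applied after
+# interpolate() (see the commented-out calls in the models) so that the
+# framewise output is padded from time_steps * ratio up to exactly frames_num.
+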
+
+def count_parameters(model):
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def count_flops(model, audio_length):
+ """Count flops. Code modified from others' implementation.
+ """
+ multiply_adds = True
+ list_conv2d=[]
+ def conv2d_hook(self, input, output):
+ batch_size, input_channels, input_height, input_width = input[0].size()
+ output_channels, output_height, output_width = output[0].size()
+
+ kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
+ bias_ops = 1 if self.bias is not None else 0
+
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_height * output_width
+
+ list_conv2d.append(flops)
+
+ list_conv1d=[]
+ def conv1d_hook(self, input, output):
+ batch_size, input_channels, input_length = input[0].size()
+ output_channels, output_length = output[0].size()
+
+ kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
+ bias_ops = 1 if self.bias is not None else 0
+
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_length
+
+ list_conv1d.append(flops)
+
+ list_linear=[]
+ def linear_hook(self, input, output):
+ batch_size = input[0].size(0) if input[0].dim() == 2 else 1
+
+ weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
+ bias_ops = self.bias.nelement()
+
+ flops = batch_size * (weight_ops + bias_ops)
+ list_linear.append(flops)
+
+ list_bn=[]
+ def bn_hook(self, input, output):
+ list_bn.append(input[0].nelement() * 2)
+
+ list_relu=[]
+ def relu_hook(self, input, output):
+ list_relu.append(input[0].nelement() * 2)
+
+ list_pooling2d=[]
+ def pooling2d_hook(self, input, output):
+ batch_size, input_channels, input_height, input_width = input[0].size()
+ output_channels, output_height, output_width = output[0].size()
+
+ kernel_ops = self.kernel_size * self.kernel_size
+ bias_ops = 0
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_height * output_width
+
+ list_pooling2d.append(flops)
+
+ list_pooling1d=[]
+ def pooling1d_hook(self, input, output):
+ batch_size, input_channels, input_length = input[0].size()
+ output_channels, output_length = output[0].size()
+
+ kernel_ops = self.kernel_size[0]
+ bias_ops = 0
+
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_length
+
+        list_pooling1d.append(flops)
+
+ def foo(net):
+ childrens = list(net.children())
+ if not childrens:
+ if isinstance(net, nn.Conv2d):
+ net.register_forward_hook(conv2d_hook)
+ elif isinstance(net, nn.Conv1d):
+ net.register_forward_hook(conv1d_hook)
+ elif isinstance(net, nn.Linear):
+ net.register_forward_hook(linear_hook)
+ elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d):
+ net.register_forward_hook(bn_hook)
+ elif isinstance(net, nn.ReLU):
+ net.register_forward_hook(relu_hook)
+ elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d):
+ net.register_forward_hook(pooling2d_hook)
+ elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d):
+ net.register_forward_hook(pooling1d_hook)
+ else:
+ print('Warning: flop of module {} is not counted!'.format(net))
+ return
+ for c in childrens:
+ foo(c)
+
+ # Register hook
+ foo(model)
+
+    device = next(model.parameters()).device
+ input = torch.rand(1, audio_length).to(device)
+
+ out = model(input)
+
+ total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \
+ sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d)
+
+ return total_flops
\ No newline at end of file
diff --git a/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c2b5d8cceac7f40a4bdba8bd1a75d590b4382ee
Binary files /dev/null and b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png differ
diff --git a/audio_detection/audio_infer/useful_ckpts/audio_detection.pth b/audio_detection/audio_infer/useful_ckpts/audio_detection.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8bc6c65802de022080d76fc07bb68a563c6d87bf
--- /dev/null
+++ b/audio_detection/audio_infer/useful_ckpts/audio_detection.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f909808f17d424dc29063a21953ff2be103489518a4f60a6c649d2e3e7d3e81
+size 441042195
diff --git a/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc b/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..724d543c0b401c546e16e5db5c7be6d7b1b78c8a
Binary files /dev/null and b/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/utils/config.py b/audio_detection/audio_infer/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..934be1c68f4e1562e5fcef81d2f8db131cb39b9f
--- /dev/null
+++ b/audio_detection/audio_infer/utils/config.py
@@ -0,0 +1,94 @@
+import numpy as np
+import csv
+
+sample_rate = 32000
+clip_samples = sample_rate * 10     # Audio clips are 10 seconds long
+
+# Load label
+with open('./audio_detection/audio_infer/metadata/class_labels_indices.csv', 'r') as f:
+ reader = csv.reader(f, delimiter=',')
+ lines = list(reader)
+
+labels = []
+ids = [] # Each label has a unique id such as "/m/068hy"
+for i1 in range(1, len(lines)):
+ id = lines[i1][1]
+ label = lines[i1][2]
+ ids.append(id)
+ labels.append(label)
+
+classes_num = len(labels)
+
+lb_to_ix = {label : i for i, label in enumerate(labels)}
+ix_to_lb = {i : label for i, label in enumerate(labels)}
+
+id_to_ix = {id : i for i, id in enumerate(ids)}
+ix_to_id = {i : id for i, id in enumerate(ids)}
+
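+# Editorial note: full_samples_per_class appears to list the number of clips
+# per class in the full AudioSet training set; in this repository it is used
+# for analysis and plotting (e.g. utils/plot_for_paper.py sorts classes by it).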
+full_samples_per_class = np.array([
+ 937432, 16344, 7822, 10271, 2043, 14420, 733, 1511,
+ 1258, 424, 1751, 704, 369, 590, 1063, 1375,
+ 5026, 743, 853, 1648, 714, 1497, 1251, 2139,
+ 1093, 133, 224, 39469, 6423, 407, 1559, 4546,
+ 6826, 7464, 2468, 549, 4063, 334, 587, 238,
+ 1766, 691, 114, 2153, 236, 209, 421, 740,
+ 269, 959, 137, 4192, 485, 1515, 655, 274,
+ 69, 157, 1128, 807, 1022, 346, 98, 680,
+ 890, 352, 4169, 2061, 1753, 9883, 1339, 708,
+ 37857, 18504, 12864, 2475, 2182, 757, 3624, 677,
+ 1683, 3583, 444, 1780, 2364, 409, 4060, 3097,
+ 3143, 502, 723, 600, 230, 852, 1498, 1865,
+ 1879, 2429, 5498, 5430, 2139, 1761, 1051, 831,
+ 2401, 2258, 1672, 1711, 987, 646, 794, 25061,
+ 5792, 4256, 96, 8126, 2740, 752, 513, 554,
+ 106, 254, 1592, 556, 331, 615, 2841, 737,
+ 265, 1349, 358, 1731, 1115, 295, 1070, 972,
+ 174, 937780, 112337, 42509, 49200, 11415, 6092, 13851,
+ 2665, 1678, 13344, 2329, 1415, 2244, 1099, 5024,
+ 9872, 10948, 4409, 2732, 1211, 1289, 4807, 5136,
+ 1867, 16134, 14519, 3086, 19261, 6499, 4273, 2790,
+ 8820, 1228, 1575, 4420, 3685, 2019, 664, 324,
+ 513, 411, 436, 2997, 5162, 3806, 1389, 899,
+ 8088, 7004, 1105, 3633, 2621, 9753, 1082, 26854,
+ 3415, 4991, 2129, 5546, 4489, 2850, 1977, 1908,
+ 1719, 1106, 1049, 152, 136, 802, 488, 592,
+ 2081, 2712, 1665, 1128, 250, 544, 789, 2715,
+ 8063, 7056, 2267, 8034, 6092, 3815, 1833, 3277,
+ 8813, 2111, 4662, 2678, 2954, 5227, 1472, 2591,
+ 3714, 1974, 1795, 4680, 3751, 6585, 2109, 36617,
+ 6083, 16264, 17351, 3449, 5034, 3931, 2599, 4134,
+ 3892, 2334, 2211, 4516, 2766, 2862, 3422, 1788,
+ 2544, 2403, 2892, 4042, 3460, 1516, 1972, 1563,
+ 1579, 2776, 1647, 4535, 3921, 1261, 6074, 2922,
+ 3068, 1948, 4407, 712, 1294, 1019, 1572, 3764,
+ 5218, 975, 1539, 6376, 1606, 6091, 1138, 1169,
+ 7925, 3136, 1108, 2677, 2680, 1383, 3144, 2653,
+ 1986, 1800, 1308, 1344, 122231, 12977, 2552, 2678,
+ 7824, 768, 8587, 39503, 3474, 661, 430, 193,
+ 1405, 1442, 3588, 6280, 10515, 785, 710, 305,
+ 206, 4990, 5329, 3398, 1771, 3022, 6907, 1523,
+ 8588, 12203, 666, 2113, 7916, 434, 1636, 5185,
+ 1062, 664, 952, 3490, 2811, 2749, 2848, 15555,
+ 363, 117, 1494, 1647, 5886, 4021, 633, 1013,
+ 5951, 11343, 2324, 243, 372, 943, 734, 242,
+ 3161, 122, 127, 201, 1654, 768, 134, 1467,
+ 642, 1148, 2156, 1368, 1176, 302, 1909, 61,
+ 223, 1812, 287, 422, 311, 228, 748, 230,
+ 1876, 539, 1814, 737, 689, 1140, 591, 943,
+ 353, 289, 198, 490, 7938, 1841, 850, 457,
+ 814, 146, 551, 728, 1627, 620, 648, 1621,
+ 2731, 535, 88, 1736, 736, 328, 293, 3170,
+ 344, 384, 7640, 433, 215, 715, 626, 128,
+ 3059, 1833, 2069, 3732, 1640, 1508, 836, 567,
+ 2837, 1151, 2068, 695, 1494, 3173, 364, 88,
+ 188, 740, 677, 273, 1533, 821, 1091, 293,
+ 647, 318, 1202, 328, 532, 2847, 526, 721,
+ 370, 258, 956, 1269, 1641, 339, 1322, 4485,
+ 286, 1874, 277, 757, 1393, 1330, 380, 146,
+ 377, 394, 318, 339, 1477, 1886, 101, 1435,
+ 284, 1425, 686, 621, 221, 117, 87, 1340,
+ 201, 1243, 1222, 651, 1899, 421, 712, 1016,
+ 1279, 124, 351, 258, 7043, 368, 666, 162,
+ 7664, 137, 70159, 26179, 6321, 32236, 33320, 771,
+ 1169, 269, 1103, 444, 364, 2710, 121, 751,
+ 1609, 855, 1141, 2287, 1940, 3943, 289])
diff --git a/audio_detection/audio_infer/utils/crash.py b/audio_detection/audio_infer/utils/crash.py
new file mode 100644
index 0000000000000000000000000000000000000000..98a06e20bc793687ec259e23c8b9e503887b34f5
--- /dev/null
+++ b/audio_detection/audio_infer/utils/crash.py
@@ -0,0 +1,12 @@
+import sys
+
+class ExceptionHook:
+ instance = None
+ def __call__(self, *args, **kwargs):
+ if self.instance is None:
+ from IPython.core import ultratb
+ self.instance = ultratb.FormattedTB(mode='Plain',
+ color_scheme='Linux', call_pdb=1)
+ return self.instance(*args, **kwargs)
+
+sys.excepthook = ExceptionHook()
diff --git a/audio_detection/audio_infer/utils/create_black_list.py b/audio_detection/audio_infer/utils/create_black_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadbe94599997e3476f37f8c4cdd30ca86a8720e
--- /dev/null
+++ b/audio_detection/audio_infer/utils/create_black_list.py
@@ -0,0 +1,64 @@
+import argparse
+import csv
+import os
+
+from utilities import create_folder
+
+
+def dcase2017task4(args):
+ """Create black list. Black list is a list of audio ids that will be
+ skipped in training.
+ """
+
+    # Arguments & parameters
+ workspace = args.workspace
+
+ # Black list from DCASE 2017 Task 4
+ test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv'
+ evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv'
+
+ black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv')
+ create_folder(os.path.dirname(black_list_csv))
+
+ def get_id_sets(csv_path):
+ with open(csv_path, 'r') as fr:
+ reader = csv.reader(fr, delimiter='\t')
+ lines = list(reader)
+
+ ids_set = []
+
+ for line in lines:
+ """line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']"""
+ ids_set.append(line[0][0 : 11])
+
+ ids_set = list(set(ids_set))
+ return ids_set
+
+ test_ids_set = get_id_sets(test_weak_csv)
+ evaluation_ids_set = get_id_sets(evaluation_weak_csv)
+
+ full_ids_set = test_ids_set + evaluation_ids_set
+
+ # Write black list
+ fw = open(black_list_csv, 'w')
+
+ for id in full_ids_set:
+ fw.write('{}\n'.format(id))
+
+ print('Write black list to {}'.format(black_list_csv))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_dcase2017task4 = subparsers.add_parser('dcase2017task4')
+ parser_dcase2017task4.add_argument('--workspace', type=str, required=True)
+
+ args = parser.parse_args()
+
+ if args.mode == 'dcase2017task4':
+ dcase2017task4(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/create_indexes.py b/audio_detection/audio_infer/utils/create_indexes.py
new file mode 100644
index 0000000000000000000000000000000000000000..78be38cb3c693fa9ef7b44c52c407640e9e32aab
--- /dev/null
+++ b/audio_detection/audio_infer/utils/create_indexes.py
@@ -0,0 +1,126 @@
+import numpy as np
+import argparse
+import csv
+import os
+import glob
+import datetime
+import time
+import logging
+import h5py
+import librosa
+
+from utilities import create_folder, get_sub_filepaths
+import config
+
+
+def create_indexes(args):
+ """Create indexes a for dataloader to read for training. When users have
+ a new task and their own data, they need to create similar indexes. The
+ indexes contain meta information of "where to find the data for training".
+ """
+
+ # Arguments & parameters
+ waveforms_hdf5_path = args.waveforms_hdf5_path
+ indexes_hdf5_path = args.indexes_hdf5_path
+
+ # Paths
+ create_folder(os.path.dirname(indexes_hdf5_path))
+
+ with h5py.File(waveforms_hdf5_path, 'r') as hr:
+ with h5py.File(indexes_hdf5_path, 'w') as hw:
+ audios_num = len(hr['audio_name'])
+ hw.create_dataset('audio_name', data=hr['audio_name'][:], dtype='S20')
+            hw.create_dataset('target', data=hr['target'][:], dtype=bool)
+ hw.create_dataset('hdf5_path', data=[waveforms_hdf5_path.encode()] * audios_num, dtype='S200')
+ hw.create_dataset('index_in_hdf5', data=np.arange(audios_num), dtype=np.int32)
+
+ print('Write to {}'.format(indexes_hdf5_path))
+
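+# Example usage (illustrative placeholder paths, not part of the original code):
+#   python create_indexes.py create_indexes \
+#       --waveforms_hdf5_path=<packed_waveforms.h5> \
+#       --indexes_hdf5_path=<output_indexes.h5>
+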
+
+def combine_full_indexes(args):
+ """Combine all balanced and unbalanced indexes hdf5s to a single hdf5. This
+ combined indexes hdf5 is used for training with full data (~20k balanced
+ audio clips + ~1.9m unbalanced audio clips).
+ """
+
+ # Arguments & parameters
+ indexes_hdf5s_dir = args.indexes_hdf5s_dir
+ full_indexes_hdf5_path = args.full_indexes_hdf5_path
+
+ classes_num = config.classes_num
+
+ # Paths
+ paths = get_sub_filepaths(indexes_hdf5s_dir)
+ paths = [path for path in paths if (
+ 'train' in path and 'full_train' not in path and 'mini' not in path)]
+
+ print('Total {} hdf5 to combine.'.format(len(paths)))
+
+ with h5py.File(full_indexes_hdf5_path, 'w') as full_hf:
+ full_hf.create_dataset(
+ name='audio_name',
+ shape=(0,),
+ maxshape=(None,),
+ dtype='S20')
+
+ full_hf.create_dataset(
+ name='target',
+ shape=(0, classes_num),
+ maxshape=(None, classes_num),
+            dtype=bool)
+
+ full_hf.create_dataset(
+ name='hdf5_path',
+ shape=(0,),
+ maxshape=(None,),
+ dtype='S200')
+
+ full_hf.create_dataset(
+ name='index_in_hdf5',
+ shape=(0,),
+ maxshape=(None,),
+ dtype=np.int32)
+
+ for path in paths:
+ with h5py.File(path, 'r') as part_hf:
+ print(path)
+ n = len(full_hf['audio_name'][:])
+ new_n = n + len(part_hf['audio_name'][:])
+
+ full_hf['audio_name'].resize((new_n,))
+ full_hf['audio_name'][n : new_n] = part_hf['audio_name'][:]
+
+ full_hf['target'].resize((new_n, classes_num))
+ full_hf['target'][n : new_n] = part_hf['target'][:]
+
+ full_hf['hdf5_path'].resize((new_n,))
+ full_hf['hdf5_path'][n : new_n] = part_hf['hdf5_path'][:]
+
+ full_hf['index_in_hdf5'].resize((new_n,))
+ full_hf['index_in_hdf5'][n : new_n] = part_hf['index_in_hdf5'][:]
+
+ print('Write combined full hdf5 to {}'.format(full_indexes_hdf5_path))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_create_indexes = subparsers.add_parser('create_indexes')
+ parser_create_indexes.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path of packed waveforms hdf5.')
+ parser_create_indexes.add_argument('--indexes_hdf5_path', type=str, required=True, help='Path to write out indexes hdf5.')
+
+ parser_combine_full_indexes = subparsers.add_parser('combine_full_indexes')
+ parser_combine_full_indexes.add_argument('--indexes_hdf5s_dir', type=str, required=True, help='Directory containing indexes hdf5s to be combined.')
+ parser_combine_full_indexes.add_argument('--full_indexes_hdf5_path', type=str, required=True, help='Path to write out full indexes hdf5 file.')
+
+ args = parser.parse_args()
+
+ if args.mode == 'create_indexes':
+ create_indexes(args)
+
+ elif args.mode == 'combine_full_indexes':
+ combine_full_indexes(args)
+
+ else:
+ raise Exception('Incorrect arguments!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/data_generator.py b/audio_detection/audio_infer/utils/data_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94b6d990b6726c791cbb4cb660abdb93233f965
--- /dev/null
+++ b/audio_detection/audio_infer/utils/data_generator.py
@@ -0,0 +1,421 @@
+import numpy as np
+import h5py
+import csv
+import time
+import logging
+
+from utilities import int16_to_float32
+
+
+def read_black_list(black_list_csv):
+ """Read audio names from black list.
+ """
+ with open(black_list_csv, 'r') as fr:
+ reader = csv.reader(fr)
+ lines = list(reader)
+
+ black_list_names = ['Y{}.wav'.format(line[0]) for line in lines]
+ return black_list_names
+
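+# Editorial note: the 'Y{}' prefix matches the audio file naming used when
+# packing waveforms (see utils/dataset.py, which writes 'Y<audio_id>.wav').
+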
+
+class AudioSetDataset(object):
+ def __init__(self, sample_rate=32000):
+ """This class takes the meta of an audio clip as input, and return
+ the waveform and target of the audio clip. This class is used by DataLoader.
+ """
+ self.sample_rate = sample_rate
+
+ def __getitem__(self, meta):
+ """Load waveform and target of an audio clip.
+
+ Args:
+ meta: {
+ 'hdf5_path': str,
+ 'index_in_hdf5': int}
+
+ Returns:
+ data_dict: {
+ 'audio_name': str,
+ 'waveform': (clip_samples,),
+ 'target': (classes_num,)}
+ """
+ hdf5_path = meta['hdf5_path']
+ index_in_hdf5 = meta['index_in_hdf5']
+ with h5py.File(hdf5_path, 'r') as hf:
+ audio_name = hf['audio_name'][index_in_hdf5].decode()
+ waveform = int16_to_float32(hf['waveform'][index_in_hdf5])
+ waveform = self.resample(waveform)
+ target = hf['target'][index_in_hdf5].astype(np.float32)
+
+ data_dict = {
+ 'audio_name': audio_name, 'waveform': waveform, 'target': target}
+
+ return data_dict
+
+ def resample(self, waveform):
+ """Resample.
+
+ Args:
+ waveform: (clip_samples,)
+
+ Returns:
+ (resampled_clip_samples,)
+ """
+ if self.sample_rate == 32000:
+ return waveform
+ elif self.sample_rate == 16000:
+ return waveform[0 :: 2]
+ elif self.sample_rate == 8000:
+ return waveform[0 :: 4]
+ else:
+ raise Exception('Incorrect sample rate!')
+
+
+class Base(object):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, random_seed):
+ """Base class of train sampler.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ self.batch_size = batch_size
+ self.random_state = np.random.RandomState(random_seed)
+
+ # Black list
+ if black_list_csv:
+ self.black_list_names = read_black_list(black_list_csv)
+ else:
+ self.black_list_names = []
+
+ logging.info('Black list samples: {}'.format(len(self.black_list_names)))
+
+ # Load target
+ load_time = time.time()
+
+ with h5py.File(indexes_hdf5_path, 'r') as hf:
+ self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]]
+ self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]]
+ self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
+ self.targets = hf['target'][:].astype(np.float32)
+
+ (self.audios_num, self.classes_num) = self.targets.shape
+ logging.info('Training number: {}'.format(self.audios_num))
+ logging.info('Load target time: {:.3f} s'.format(time.time() - load_time))
+
+
+class TrainSampler(Base):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
+ random_seed=1234):
+ """Balanced sampler. Generate batch meta for training.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ super(TrainSampler, self).__init__(indexes_hdf5_path, batch_size,
+ black_list_csv, random_seed)
+
+ self.indexes = np.arange(self.audios_num)
+
+ # Shuffle indexes
+ self.random_state.shuffle(self.indexes)
+
+ self.pointer = 0
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string, 'index_in_hdf5': int},
+ ...]
+ """
+ batch_size = self.batch_size
+
+ while True:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ index = self.indexes[self.pointer]
+ self.pointer += 1
+
+ # Shuffle indexes and reset pointer
+ if self.pointer >= self.audios_num:
+ self.pointer = 0
+ self.random_state.shuffle(self.indexes)
+
+ # If audio in black list then continue
+ if self.audio_names[index] in self.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.hdf5_paths[index],
+ 'index_in_hdf5': self.indexes_in_hdf5[index]})
+ i += 1
+
+ yield batch_meta
+
+ def state_dict(self):
+ state = {
+ 'indexes': self.indexes,
+ 'pointer': self.pointer}
+ return state
+
+ def load_state_dict(self, state):
+ self.indexes = state['indexes']
+ self.pointer = state['pointer']
+
+
+class BalancedTrainSampler(Base):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
+ random_seed=1234):
+ """Balanced sampler. Generate batch meta for training. Data are equally
+ sampled from different sound classes.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ super(BalancedTrainSampler, self).__init__(indexes_hdf5_path,
+ batch_size, black_list_csv, random_seed)
+
+ self.samples_num_per_class = np.sum(self.targets, axis=0)
+ logging.info('samples_num_per_class: {}'.format(
+ self.samples_num_per_class.astype(np.int32)))
+
+ # Training indexes of all sound classes. E.g.:
+ # [[0, 11, 12, ...], [3, 4, 15, 16, ...], [7, 8, ...], ...]
+ self.indexes_per_class = []
+
+ for k in range(self.classes_num):
+ self.indexes_per_class.append(
+ np.where(self.targets[:, k] == 1)[0])
+
+ # Shuffle indexes
+ for k in range(self.classes_num):
+ self.random_state.shuffle(self.indexes_per_class[k])
+
+ self.queue = []
+ self.pointers_of_classes = [0] * self.classes_num
+
+ def expand_queue(self, queue):
+ classes_set = np.arange(self.classes_num).tolist()
+ self.random_state.shuffle(classes_set)
+ queue += classes_set
+ return queue
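+
+    # Editorial note: expand_queue appends one randomly shuffled pass over all
+    # class ids to the queue, so every class is drawn equally often regardless
+    # of how many clips it has.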
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string, 'index_in_hdf5': int},
+ ...]
+ """
+ batch_size = self.batch_size
+
+ while True:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ if len(self.queue) == 0:
+ self.queue = self.expand_queue(self.queue)
+
+ class_id = self.queue.pop(0)
+ pointer = self.pointers_of_classes[class_id]
+ self.pointers_of_classes[class_id] += 1
+ index = self.indexes_per_class[class_id][pointer]
+
+                # When one epoch of a sound class finishes, shuffle its indexes and reset the pointer
+ if self.pointers_of_classes[class_id] >= self.samples_num_per_class[class_id]:
+ self.pointers_of_classes[class_id] = 0
+ self.random_state.shuffle(self.indexes_per_class[class_id])
+
+ # If audio in black list then continue
+ if self.audio_names[index] in self.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.hdf5_paths[index],
+ 'index_in_hdf5': self.indexes_in_hdf5[index]})
+ i += 1
+
+ yield batch_meta
+
+ def state_dict(self):
+ state = {
+ 'indexes_per_class': self.indexes_per_class,
+ 'queue': self.queue,
+ 'pointers_of_classes': self.pointers_of_classes}
+ return state
+
+ def load_state_dict(self, state):
+ self.indexes_per_class = state['indexes_per_class']
+ self.queue = state['queue']
+ self.pointers_of_classes = state['pointers_of_classes']
+
+
+class AlternateTrainSampler(Base):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
+ random_seed=1234):
+ """AlternateSampler is a combination of Sampler and Balanced Sampler.
+ AlternateSampler alternately sample data from Sampler and Blanced Sampler.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ self.sampler1 = TrainSampler(indexes_hdf5_path, batch_size,
+ black_list_csv, random_seed)
+
+ self.sampler2 = BalancedTrainSampler(indexes_hdf5_path, batch_size,
+ black_list_csv, random_seed)
+
+ self.batch_size = batch_size
+ self.count = 0
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string, 'index_in_hdf5': int},
+ ...]
+ """
+ batch_size = self.batch_size
+
+ while True:
+ self.count += 1
+
+ if self.count % 2 == 0:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ index = self.sampler1.indexes[self.sampler1.pointer]
+ self.sampler1.pointer += 1
+
+ # Shuffle indexes and reset pointer
+ if self.sampler1.pointer >= self.sampler1.audios_num:
+ self.sampler1.pointer = 0
+ self.sampler1.random_state.shuffle(self.sampler1.indexes)
+
+ # If audio in black list then continue
+ if self.sampler1.audio_names[index] in self.sampler1.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.sampler1.hdf5_paths[index],
+ 'index_in_hdf5': self.sampler1.indexes_in_hdf5[index]})
+ i += 1
+
+ elif self.count % 2 == 1:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ if len(self.sampler2.queue) == 0:
+ self.sampler2.queue = self.sampler2.expand_queue(self.sampler2.queue)
+
+ class_id = self.sampler2.queue.pop(0)
+ pointer = self.sampler2.pointers_of_classes[class_id]
+ self.sampler2.pointers_of_classes[class_id] += 1
+ index = self.sampler2.indexes_per_class[class_id][pointer]
+
+                    # When one epoch of a sound class finishes, shuffle its indexes and reset the pointer
+ if self.sampler2.pointers_of_classes[class_id] >= self.sampler2.samples_num_per_class[class_id]:
+ self.sampler2.pointers_of_classes[class_id] = 0
+ self.sampler2.random_state.shuffle(self.sampler2.indexes_per_class[class_id])
+
+ # If audio in black list then continue
+ if self.sampler2.audio_names[index] in self.sampler2.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.sampler2.hdf5_paths[index],
+ 'index_in_hdf5': self.sampler2.indexes_in_hdf5[index]})
+ i += 1
+
+ yield batch_meta
+
+ def state_dict(self):
+ state = {
+ 'sampler1': self.sampler1.state_dict(),
+ 'sampler2': self.sampler2.state_dict()}
+ return state
+
+ def load_state_dict(self, state):
+ self.sampler1.load_state_dict(state['sampler1'])
+ self.sampler2.load_state_dict(state['sampler2'])
+
+
+class EvaluateSampler(object):
+ def __init__(self, indexes_hdf5_path, batch_size):
+ """Evaluate sampler. Generate batch meta for evaluation.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ """
+ self.batch_size = batch_size
+
+ with h5py.File(indexes_hdf5_path, 'r') as hf:
+ self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]]
+ self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]]
+ self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
+ self.targets = hf['target'][:].astype(np.float32)
+
+ self.audios_num = len(self.audio_names)
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string,
+ 'index_in_hdf5': int}
+ ...]
+ """
+ batch_size = self.batch_size
+ pointer = 0
+
+ while pointer < self.audios_num:
+ batch_indexes = np.arange(pointer,
+ min(pointer + batch_size, self.audios_num))
+
+ batch_meta = []
+
+ for index in batch_indexes:
+ batch_meta.append({
+ 'audio_name': self.audio_names[index],
+ 'hdf5_path': self.hdf5_paths[index],
+ 'index_in_hdf5': self.indexes_in_hdf5[index],
+ 'target': self.targets[index]})
+
+ pointer += batch_size
+ yield batch_meta
+
+
+def collate_fn(list_data_dict):
+ """Collate data.
+ Args:
+ list_data_dict, e.g., [{'audio_name': str, 'waveform': (clip_samples,), ...},
+ {'audio_name': str, 'waveform': (clip_samples,), ...},
+ ...]
+ Returns:
+ np_data_dict, dict, e.g.,
+ {'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...}
+ """
+ np_data_dict = {}
+
+ for key in list_data_dict[0].keys():
+ np_data_dict[key] = np.array([data_dict[key] for data_dict in list_data_dict])
+
+ return np_data_dict
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/dataset.py b/audio_detection/audio_infer/utils/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f11755a027de97d236e447e576c4b1ed4e8a36
--- /dev/null
+++ b/audio_detection/audio_infer/utils/dataset.py
@@ -0,0 +1,224 @@
+import numpy as np
+import argparse
+import csv
+import os
+import glob
+import datetime
+import time
+import logging
+import h5py
+import librosa
+
+from utilities import (create_folder, get_filename, create_logging,
+ float32_to_int16, pad_or_truncate, read_metadata)
+import config
+
+
+def split_unbalanced_csv_to_partial_csvs(args):
+ """Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids.
+ """
+
+ unbalanced_csv_path = args.unbalanced_csv
+ unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir
+
+ create_folder(unbalanced_partial_csvs_dir)
+
+ with open(unbalanced_csv_path, 'r') as f:
+ lines = f.readlines()
+
+ lines = lines[3:] # Remove head info
+ audios_num_per_file = 50000
+
+ files_num = int(np.ceil(len(lines) / float(audios_num_per_file)))
+
+ for r in range(files_num):
+ lines_per_file = lines[r * audios_num_per_file :
+ (r + 1) * audios_num_per_file]
+
+ out_csv_path = os.path.join(unbalanced_partial_csvs_dir,
+ 'unbalanced_train_segments_part{:02d}.csv'.format(r))
+
+ with open(out_csv_path, 'w') as f:
+ f.write('empty\n')
+ f.write('empty\n')
+ f.write('empty\n')
+ for line in lines_per_file:
+ f.write(line)
+
+ print('Write out csv to {}'.format(out_csv_path))
+
+
+def download_wavs(args):
+ """Download videos and extract audio in wav format.
+ """
+
+ # Paths
+ csv_path = args.csv_path
+ audios_dir = args.audios_dir
+ mini_data = args.mini_data
+
+ if mini_data:
+        logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path))
+    else:
+        logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path))
+
+ create_folder(audios_dir)
+ create_folder(logs_dir)
+ create_logging(logs_dir, filemode='w')
+ logging.info('Download log is saved to {}'.format(logs_dir))
+
+ # Read csv
+ with open(csv_path, 'r') as f:
+ lines = f.readlines()
+
+ lines = lines[3:] # Remove csv head info
+
+ if mini_data:
+ lines = lines[0 : 10] # Download partial data for debug
+
+ download_time = time.time()
+
+ # Download
+ for (n, line) in enumerate(lines):
+
+ items = line.split(', ')
+ audio_id = items[0]
+ start_time = float(items[1])
+ end_time = float(items[2])
+ duration = end_time - start_time
+
+ logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format(
+ n, audio_id, start_time, end_time))
+
+ # Download full video of whatever format
+ video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id))
+ os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\
+ .format(video_name, audio_id))
+
+ video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*'))
+
+ # If download successful
+ if len(video_paths) > 0:
+ video_path = video_paths[0] # Choose one video
+
+            # Prepend 'Y' because some video ids start with '-',
+            # which can cause problems
+ audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav')
+
+ # Extract audio in wav format
+ os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\
+ .format(video_path,
+ str(datetime.timedelta(seconds=start_time)), duration,
+ audio_path))
+
+ # Remove downloaded video
+ os.system("rm {}".format(video_path))
+
+ logging.info("Download and convert to {}".format(audio_path))
+
+ logging.info('Download finished! Time spent: {:.3f} s'.format(
+ time.time() - download_time))
+
+ logging.info('Logs can be viewed in {}'.format(logs_dir))
+
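+# Example usage (illustrative placeholder paths; assumes youtube-dl and ffmpeg
+# are available on PATH, as invoked via os.system above):
+#   python dataset.py download_wavs --csv_path=<segments_csv> --audios_dir=<out_dir>
+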
+
+def pack_waveforms_to_hdf5(args):
+ """Pack waveform and target of several audio clips to a single hdf5 file.
+ This can speed up loading and training.
+ """
+
+ # Arguments & parameters
+ audios_dir = args.audios_dir
+ csv_path = args.csv_path
+ waveforms_hdf5_path = args.waveforms_hdf5_path
+ mini_data = args.mini_data
+
+ clip_samples = config.clip_samples
+ classes_num = config.classes_num
+ sample_rate = config.sample_rate
+ id_to_ix = config.id_to_ix
+
+ # Paths
+ if mini_data:
+ prefix = 'mini_'
+ waveforms_hdf5_path += '.mini'
+ else:
+ prefix = ''
+
+ create_folder(os.path.dirname(waveforms_hdf5_path))
+
+ logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path))
+ create_folder(logs_dir)
+ create_logging(logs_dir, filemode='w')
+ logging.info('Write logs to {}'.format(logs_dir))
+
+ # Read csv file
+ meta_dict = read_metadata(csv_path, classes_num, id_to_ix)
+
+ if mini_data:
+ mini_num = 10
+ for key in meta_dict.keys():
+ meta_dict[key] = meta_dict[key][0 : mini_num]
+
+ audios_num = len(meta_dict['audio_name'])
+
+ # Pack waveform to hdf5
+ total_time = time.time()
+
+ with h5py.File(waveforms_hdf5_path, 'w') as hf:
+        hf.create_dataset('audio_name', shape=(audios_num,), dtype='S20')
+        hf.create_dataset('waveform', shape=(audios_num, clip_samples), dtype=np.int16)
+        hf.create_dataset('target', shape=(audios_num, classes_num), dtype=bool)
+ hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)
+
+ # Pack waveform & target of several audio clips to a single hdf5 file
+ for n in range(audios_num):
+ audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])
+
+ if os.path.isfile(audio_path):
+ logging.info('{} {}'.format(n, audio_path))
+ (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+ audio = pad_or_truncate(audio, clip_samples)
+
+ hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
+ hf['waveform'][n] = float32_to_int16(audio)
+ hf['target'][n] = meta_dict['target'][n]
+ else:
+ logging.info('{} File does not exist! {}'.format(n, audio_path))
+
+ logging.info('Write to {}'.format(waveforms_hdf5_path))
+ logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs')
+ parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.')
+ parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.')
+
+ parser_download_wavs = subparsers.add_parser('download_wavs')
+ parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
+ parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
+    parser_download_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
+
+ parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5')
+ parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
+ parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
+ parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.')
+ parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
+
+ args = parser.parse_args()
+
+ if args.mode == 'split_unbalanced_csv_to_partial_csvs':
+ split_unbalanced_csv_to_partial_csvs(args)
+
+ elif args.mode == 'download_wavs':
+ download_wavs(args)
+
+ elif args.mode == 'pack_waveforms_to_hdf5':
+ pack_waveforms_to_hdf5(args)
+
+ else:
+ raise Exception('Incorrect arguments!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/plot_for_paper.py b/audio_detection/audio_infer/utils/plot_for_paper.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e799a7e7eea9ffc5bced214a8beb0a558842eb
--- /dev/null
+++ b/audio_detection/audio_infer/utils/plot_for_paper.py
@@ -0,0 +1,565 @@
+import os
+import sys
+import numpy as np
+import argparse
+import h5py
+import time
+import pickle
+import matplotlib.pyplot as plt
+import csv
+from sklearn import metrics
+
+from utilities import (create_folder, get_filename, d_prime)
+import config
+
+
+def load_statistics(statistics_path):
+ statistics_dict = pickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+
+ return bal_map, test_map
+
+
+def crop_label(label):
+ max_len = 16
+ if len(label) <= max_len:
+ return label
+ else:
+ words = label.split(' ')
+ cropped_label = ''
+ for w in words:
+ if len(cropped_label + ' ' + w) > max_len:
+ break
+ else:
+ cropped_label += ' {}'.format(w)
+ return cropped_label
+
+
+def add_comma(integer):
+ """E.g., 1234567 -> 1,234,567
+ """
+    integer = int(integer)
+    return '{:,}'.format(integer)
+
+
+def plot_classwise_iteration_map(args):
+
+ # Paths
+ save_out_path = 'results/classwise_iteration_map.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Load statistics
+ statistics_dict = pickle.load(open('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl', 'rb'))
+
+ mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']])
+ mAP_mat = mAP_mat[0 : 300, :] # 300 * 2000 = 600k iterations
+ sorted_indexes = np.argsort(config.full_samples_per_class)[::-1]
+
+ fig, axs = plt.subplots(1, 3, figsize=(20, 5))
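+    # Plot per-class AP curves for three groups of 10 classes, taken from the 527
+    # AudioSet classes sorted by number of training samples: the most frequent
+    # (ranks 0-9), mid-frequency (250-259) and rarest (517-526) classes.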
+ ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)]
+ axs[0].set_ylabel('AP')
+
+ for col in range(0, 3):
+ axs[col].set_ylim(0, 1.)
+ axs[col].set_xlim(0, 301)
+ axs[col].set_xlabel('Iterations')
+ axs[col].set_ylabel('AP')
+ axs[col].xaxis.set_ticks(np.arange(0, 301, 100))
+ axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k'])
+ lines = []
+ for _ix in ranges[col]:
+ _label = crop_label(config.labels[sorted_indexes[_ix]]) + \
+ ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]]))
+ line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label)
+ lines.append(line)
+ box = axs[col].get_position()
+ axs[col].set_position([box.x0, box.y0, box.width * 1., box.height])
+ axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.))
+ axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+ plt.tight_layout(pad=4, w_pad=1, h_pad=1)
+ plt.savefig(save_out_path)
+ print(save_out_path)
+
+
+def plot_six_figures(args):
+
+ # Arguments & parameters
+ classes_num = config.classes_num
+ labels = config.labels
+ max_plot_iteration = 540000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ # Paths
+ class_labels_indices_path = os.path.join('metadata', 'class_labels_indices.csv')
+ save_out_path = 'results/six_figures.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Plot
+ fig, ax = plt.subplots(2, 3, figsize=(14, 7))
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ linewidth = 1.
+
+ # (a) Comparison of architectures
+ if True:
+ lines = []
+
+ # Wavegram-Logmel-CNN
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl')
+ line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # MobileNetV1
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_MobileNetV1_balanced_mixup_bs32.pkl')
+ line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 0].legend(handles=lines, loc=2)
+ ax[0, 0].set_title('(a) Comparison of architectures')
+
+    # (b) Comparison of training data and augmentation
+ if True:
+ lines = []
+
+ # Full data + balanced sampler + mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Full data + balanced sampler + mixup in time domain
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_timedomain_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Full data + balanced sampler + no mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_nomixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Full data + uniform sampler + no mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_nobalanced_nomixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Balanced data + balanced sampler + mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Balanced data + balanced sampler + no mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_nomixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 1].legend(handles=lines, loc=2, fontsize=8)
+ ax[0, 1].set_title('(b) Comparison of training data and augmentation')
+
+ # (c) Comparison of embedding size
+ if True:
+ lines = []
+
+ # Embedding size 2048
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Embedding size 128
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb128_balanced_mixup_bs32.pkl')
+ line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Embedding size 32
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb32_balanced_mixup_bs32.pkl')
+ line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 2].legend(handles=lines, loc=2)
+ ax[0, 2].set_title('(c) Comparison of embedding size')
+
+ # (d) Comparison of amount of training data
+ if True:
+ lines = []
+
+ # 100% of full training data
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # 80% of full training data
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.8full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # 50% of full training data
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.5full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+        line, = ax[1, 0].plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 0].legend(handles=lines, loc=2)
+ ax[1, 0].set_title('(d) Comparison of amount of training data')
+
+ # (e) Comparison of sampling rate
+ if True:
+ lines = []
+
+ # Cnn14 + 32 kHz
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 16 kHz
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_16k_balanced_mixup_bs32.pkl')
+ line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 8 kHz
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_8k_balanced_mixup_bs32.pkl')
+ line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 1].legend(handles=lines, loc=2)
+ ax[1, 1].set_title('(e) Comparison of sampling rate')
+
+ # (f) Comparison of mel bins number
+ if True:
+ lines = []
+
+ # Cnn14 + 128 mel bins
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel128_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+        line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 64 mel bins
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 32 mel bins
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel32_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+        line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 2].legend(handles=lines, loc=2)
+ ax[1, 2].set_title('(f) Comparison of mel bins number')
+
+ for i in range(2):
+ for j in range(3):
+ ax[i, j].set_ylim(0, 0.8)
+ ax[i, j].set_xlim(0, len(iterations))
+ ax[i, j].set_xlabel('Iterations')
+ ax[i, j].set_ylabel('mAP')
+ ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50))
+ ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
+ ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05))
+ ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3',
+ '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
+ ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+    plt.tight_layout(pad=0, h_pad=1, w_pad=0)
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def plot_complexity_map(args):
+
+ # Paths
+ save_out_path = 'results/complexity_mAP.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
+
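+    # Hard-coded per-architecture complexity (multi-adds) and mAP numbers; these
+    # appear to be the evaluation results reported for the PANNs models.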
+ model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
+ 'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
+ 'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
+ flops = np.array([21.986, 28.166, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
+ 30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
+ mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
+ 0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
+
+    ax.scatter(flops, mAPs)
+
+ shift = [[-5.5, -0.004], [1, -0.004], [-1, -0.014], [-2, 0.006], [-7, 0.006],
+ [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
+ [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
+
+ for i, model_type in enumerate(model_types):
+ ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
+
+ ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
+ ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
+ ax.plot(flops[[6, 7]], mAPs[[6, 7]])
+ ax.plot(flops[[9, 10]], mAPs[[9, 10]])
+ ax.plot(flops[[11, 12]], mAPs[[11, 12]])
+ ax.plot(flops[[13, 14]], mAPs[[13, 14]])
+
+ ax.set_xlim(0, 70)
+ ax.set_ylim(0.2, 0.5)
+    ax.set_xlabel('Multi-adds (million)', fontsize=15)
+ ax.set_ylabel('mAP', fontsize=15)
+ ax.tick_params(axis='x', labelsize=12)
+ ax.tick_params(axis='y', labelsize=12)
+
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+
+ plt.savefig(save_out_path)
+ print('Write out figure to {}'.format(save_out_path))
+
+
+def plot_long_fig(args):
+
+ # Paths
+ stats = pickle.load(open('paper_statistics/stats_for_long_fig.pkl', 'rb'))
+
+ save_out_path = 'results/long_fig.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Load meta
+ N = len(config.labels)
+ sorted_indexes = stats['sorted_indexes_for_plot']
+ sorted_labels = np.array(config.labels)[sorted_indexes]
+ audio_clips_per_class = stats['official_balanced_training_samples'] + stats['official_unbalanced_training_samples']
+ audio_clips_per_class = audio_clips_per_class[sorted_indexes]
+
+ # Prepare axes for plot
+ (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
+
+ # plot the number of training samples
+ ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+
+ # Load mAP of different systems
+ """Average instance system of [1] with an mAP of 0.317.
+ [1] Kong, Qiuqiang, Changsong Yu, Yong Xu, Turab Iqbal, Wenwu Wang, and
+ Mark D. Plumbley. "Weakly labelled audioset tagging with attention neural
+ networks." IEEE/ACM Transactions on Audio, Speech, and Language Processing
+ 27, no. 11 (2019): 1791-1802."""
+ maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
+ maps_avg_instances = maps_avg_instances[sorted_indexes]
+
+ # PANNs Cnn14
+ maps_panns_cnn14 = stats['panns_cnn14']['eval']['average_precision']
+ maps_panns_cnn14 = maps_panns_cnn14[sorted_indexes]
+
+ # PANNs MobileNetV1
+ maps_panns_mobilenetv1 = stats['panns_mobilenetv1']['eval']['average_precision']
+ maps_panns_mobilenetv1 = maps_panns_mobilenetv1[sorted_indexes]
+
+ # PANNs Wavegram-Logmel-Cnn14
+ maps_panns_wavegram_logmel_cnn14 = stats['panns_wavegram_logmel_cnn14']['eval']['average_precision']
+ maps_panns_wavegram_logmel_cnn14 = maps_panns_wavegram_logmel_cnn14[sorted_indexes]
+
+ # Plot mAPs
+ _scatter_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='g')
+ _scatter_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='r')
+ _scatter_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b')
+ _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k')
+
+ linewidth = 0.7
+ line0te = _plot_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b,
+ c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN')
+ line1te = _plot_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, c='r',
+ linewidth=linewidth, label='AP with CNN14')
+ line2te = _plot_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b',
+ linewidth=linewidth, label='AP with MobileNetV1')
+ line3te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k',
+ linewidth=linewidth, label='AP with averaging instances (baseline)')
+
+ # Plot label quality
+ label_quality = stats['label_quality']
+ sorted_label_quality = np.array(label_quality)[sorted_indexes]
+ for k in range(len(sorted_label_quality)):
+ if sorted_label_quality[k] and sorted_label_quality[k] == 1:
+ sorted_label_quality[k] = 0.99
+
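+    # Classes with a known label quality are drawn as '+' markers; classes whose
+    # quality is unknown (None) are drawn as '_' markers at a fixed height of 0.5.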
+ ax1b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax2b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax3b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
+ line_label_quality = ax4b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality')
+ ax1b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax2b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax3b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax4b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+
+ plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1)
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+ plt.savefig(save_out_path)
+ print('Save fig to {}'.format(save_out_path))
+
+
+def prepare_plot_long_4_rows(sorted_lbs):
+ N = len(sorted_lbs)
+
+ f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1, sharey=False, facecolor='w', figsize=(10, 10.5))
+
+ fontsize = 5
+
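+    # The 527 class labels are spread over four stacked panels: the first three
+    # show K = 132 classes each and the last shows the remaining 131.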
+ K = 132
+ ax1a.set_xlim(0, K)
+ ax2a.set_xlim(K, 2 * K)
+ ax3a.set_xlim(2 * K, 3 * K)
+ ax4a.set_xlim(3 * K, N)
+
+ truncated_sorted_lbs = []
+ for lb in sorted_lbs:
+ lb = lb[0 : 25]
+ words = lb.split(' ')
+ if len(words[-1]) < 3:
+ lb = ' '.join(words[0:-1])
+ truncated_sorted_lbs.append(lb)
+
+ ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+
+ ax1a.set_yscale('log')
+ ax2a.set_yscale('log')
+ ax3a.set_yscale('log')
+ ax4a.set_yscale('log')
+
+ ax1b = ax1a.twinx()
+ ax2b = ax2a.twinx()
+ ax3b = ax3a.twinx()
+ ax4b = ax4a.twinx()
+ ax1b.set_ylim(0., 1.)
+ ax2b.set_ylim(0., 1.)
+ ax3b.set_ylim(0., 1.)
+ ax4b.set_ylim(0., 1.)
+ ax1b.set_ylabel('Average precision')
+ ax2b.set_ylabel('Average precision')
+ ax3b.set_ylabel('Average precision')
+ ax4b.set_ylabel('Average precision')
+
+ ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+
+ ax1a.xaxis.set_ticks(np.arange(K))
+ ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize)
+ ax1a.xaxis.tick_bottom()
+ ax1a.set_ylabel("Number of audio clips")
+
+ ax2a.xaxis.set_ticks(np.arange(K, 2*K))
+ ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize)
+ ax2a.xaxis.tick_bottom()
+ ax2a.set_ylabel("Number of audio clips")
+
+ ax3a.xaxis.set_ticks(np.arange(2*K, 3*K))
+ ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize)
+ ax3a.xaxis.tick_bottom()
+ ax3a.set_ylabel("Number of audio clips")
+
+ ax4a.xaxis.set_ticks(np.arange(3*K, N))
+ ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize)
+ ax4a.xaxis.tick_bottom()
+ ax4a.set_ylabel("Number of audio clips")
+
+ ax1a.spines['right'].set_visible(False)
+ ax1b.spines['right'].set_visible(False)
+ ax2a.spines['left'].set_visible(False)
+ ax2b.spines['left'].set_visible(False)
+ ax2a.spines['right'].set_visible(False)
+ ax2b.spines['right'].set_visible(False)
+ ax3a.spines['left'].set_visible(False)
+ ax3b.spines['left'].set_visible(False)
+ ax3a.spines['right'].set_visible(False)
+ ax3b.spines['right'].set_visible(False)
+ ax4a.spines['left'].set_visible(False)
+ ax4b.spines['left'].set_visible(False)
+
+    plt.subplots_adjust(hspace=0.8)
+
+ return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b
+
+
+def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
+ N = len(x)
+ ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+
+
+def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""):
+ N = len(x)
+ ax.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label)
+ return line
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_classwise_iteration_map = subparsers.add_parser('plot_classwise_iteration_map')
+ parser_six_figures = subparsers.add_parser('plot_six_figures')
+ parser_complexity_map = subparsers.add_parser('plot_complexity_map')
+ parser_long_fig = subparsers.add_parser('plot_long_fig')
+
+ args = parser.parse_args()
+
+ if args.mode == 'plot_classwise_iteration_map':
+ plot_classwise_iteration_map(args)
+
+ elif args.mode == 'plot_six_figures':
+ plot_six_figures(args)
+
+ elif args.mode == 'plot_complexity_map':
+ plot_complexity_map(args)
+
+ elif args.mode == 'plot_long_fig':
+ plot_long_fig(args)
+
+ else:
+ raise Exception('Incorrect argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/plot_statistics.py b/audio_detection/audio_infer/utils/plot_statistics.py
new file mode 100644
index 0000000000000000000000000000000000000000..bebb28af3e3468e8422c6901e1aba9600270ef89
--- /dev/null
+++ b/audio_detection/audio_infer/utils/plot_statistics.py
@@ -0,0 +1,2034 @@
+import os
+import sys
+import numpy as np
+import argparse
+import h5py
+import time
+import _pickle as cPickle
+import matplotlib.pyplot as plt
+import csv
+from sklearn import metrics
+
+from utilities import (create_folder, get_filename, d_prime)
+import config
+
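+# The module-level _load_metrics* helpers below read statistics from hard-coded
+# workspace paths, whereas the nested _load_metrics helpers defined inside
+# plot() and plot_for_paper() read from the --workspace passed on the command
+# line.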
+
+def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+
+def _load_metrics0_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ return statistics_dict['test'][300]['average_precision']
+
+
+def _load_metrics0_classwise2(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ k = 270
+ mAP = np.mean(statistics_dict['test'][k]['average_precision'])
+ mAUC = np.mean(statistics_dict['test'][k]['auc'])
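+    # d_prime() is imported from utilities; it is assumed to implement the usual
+    # definition d' = sqrt(2) * Phi^{-1}(AUC), i.e. roughly:
+    #     from scipy import stats
+    #     def d_prime(auc):
+    #         return np.sqrt(2.0) * stats.norm.ppf(auc)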
+ dprime = d_prime(mAUC)
+ return mAP, mAUC, dprime
+
+
+def _load_metrics_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace = '/mnt/cephfs_new_wj/speechsv/kongqiuqiang/workspaces/cvssp/pub_audioset_tagging_cnn'
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ k = 300
+ mAP = np.mean(statistics_dict['test'][k]['average_precision'])
+ mAUC = np.mean(statistics_dict['test'][k]['auc'])
+ dprime = d_prime(mAUC)
+ return mAP, mAUC, dprime
+
+
+def plot(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+ select = args.select
+
+ classes_num = config.classes_num
+ max_plot_iteration = 1000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ class_labels_indices_path = os.path.join(dataset_dir, 'metadata',
+ 'class_labels_indices.csv')
+
+ save_out_path = 'results/{}.pdf'.format(select)
+ create_folder(os.path.dirname(save_out_path))
+
+ # Read labels
+ labels = config.labels
+
+ # Plot
+ fig, ax = plt.subplots(1, 1, figsize=(15, 8))
+ lines = []
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ lines = []
+
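+    # Each value of 'select' below produces one comparison figure: every branch
+    # loads (bal_map, test_map) mAP curves for a set of models and overlays them,
+    # with faint lines for the balanced subset and solid lines for the evaluation
+    # set.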
+ if select == '1_cnn13':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_no_dropout', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_no_dropout', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_no_specaug', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_no_mixup', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_mixup_in_wave', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_mixup_in_wave', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_pooling':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_gwrp', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_gmpgapgwrp', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_att', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_gmpgapatt', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_resnet':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='ResNet18', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='resnet34', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='resnet50', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_densenet':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'DenseNet121', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='densenet121', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'DenseNet201', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='densenet201', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_cnn9':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn5', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn9', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_hop':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_hop500', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_hop640', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_hop1000', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_emb':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_emb32', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_emb128', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_emb512', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_mobilenet':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='mobilenetv1', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='mobilenetv2', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_waveform':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_LeeNet', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_LeeNet18', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_DaiNet', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_ResNet50', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_waveform_cnn2d':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_decision_level':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_DecisionLevelMax', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_DecisionLevelAvg', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_DecisionLevelAtt', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_transformer':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_Transformer1', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer3', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_Transformer3', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer6', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_Transformer6', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_aug':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_bal_train_aug':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_sr':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_16k', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_8k', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_time_domain':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_time_domain', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_partial_full':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.8', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.5', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_window':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 2048,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_win2048', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_melbins':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_mel32', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_mel128', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_alternate':
+ max_plot_iteration = 2000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'alternate', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_alternate', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '2_all':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='MobileNetV1', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='ResNet34', color='grey', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='orange', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '2_emb':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_emb32', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_emb128', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_emb512', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '2_aug':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,none,none', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ ax.set_ylim(0, 1.)
+ ax.set_xlim(0, len(iterations))
+ ax.xaxis.set_ticks(np.arange(0, len(iterations), 25))
+ ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000))
+ ax.yaxis.set_ticks(np.arange(0, 1.01, 0.05))
+ ax.yaxis.set_ticklabels(np.around(np.arange(0, 1.01, 0.05), decimals=2))
+ ax.grid(color='b', linestyle='solid', linewidth=0.3)
+ plt.legend(handles=lines, loc=2)
+ # box = ax.get_position()
+ # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0))
+
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def plot_for_paper(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+ select = args.select
+
+ classes_num = config.classes_num
+ max_plot_iteration = 1000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ class_labels_indices_path = os.path.join(dataset_dir, 'metadata',
+ 'class_labels_indices.csv')
+
+ save_out_path = 'results/paper_{}.pdf'.format(select)
+ create_folder(os.path.dirname(save_out_path))
+
+ # Read labels
+ labels = config.labels
+
+ # Plot
+ fig, ax = plt.subplots(1, 1, figsize=(6, 4))
+ lines = []
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
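+        # Load the mAP curves (on the balanced training subset and on the eval
+        # set) logged during training for one model / training configuration.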
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ lines = []
+ linewidth = 1.
+
+ max_plot_iteration = 540000
+
+ if select == '2_all':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_emb':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='Cnn13_512', color='g', alpha=test_alpha)
+ # lines.append(line)
+
+ elif select == '2_bal':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_sr':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_partial':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+        line, = ax.plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_melbins':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax.set_ylim(0, 0.8)
+ ax.set_xlim(0, len(iterations))
+ ax.set_xlabel('Iterations')
+ ax.set_ylabel('mAP')
+ ax.xaxis.set_ticks(np.arange(0, len(iterations), 50))
+ # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000))
+ ax.xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
+ ax.yaxis.set_ticks(np.arange(0, 0.81, 0.05))
+ ax.yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
+ # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2))
+ ax.yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ ax.xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ plt.legend(handles=lines, loc=2)
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+ # box = ax.get_position()
+ # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0))
+
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def plot_for_paper2(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+
+ classes_num = config.classes_num
+ max_plot_iteration = 1000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ class_labels_indices_path = os.path.join(dataset_dir, 'metadata',
+ 'class_labels_indices.csv')
+
+ save_out_path = 'results/paper2.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Read labels
+ labels = config.labels
+
+ # Plot
+ fig, ax = plt.subplots(2, 3, figsize=(14, 7))
+ lines = []
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
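+        # NOTE: unlike _load_metrics above, this variant reads statistics from a
+        # hard-coded legacy workspace rather than from the --workspace argument.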
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ lines = []
+ linewidth = 1.
+
+ max_plot_iteration = 540000
+
+ if True:
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax[0, 0].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax[0, 0].plot(test_map, label='ResNet38', color='k', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 0].legend(handles=lines, loc=2)
+ ax[0, 0].set_title('(a) Comparison of architectures')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 1].legend(handles=lines, loc=2, fontsize=8)
+
+ ax[0, 1].set_title('(b) Comparison of training data and augmentation')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 2].legend(handles=lines, loc=2)
+ ax[0, 2].set_title('(c) Comparison of embedding size')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+        line, = ax[1, 0].plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 0].legend(handles=lines, loc=2)
+ ax[1, 0].set_title('(d) Comparison of amount of training data')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 1].legend(handles=lines, loc=2)
+ ax[1, 1].set_title('(e) Comparison of sampling rate')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 2].legend(handles=lines, loc=2)
+ ax[1, 2].set_title('(f) Comparison of mel bins number')
+
+ for i in range(2):
+ for j in range(3):
+ ax[i, j].set_ylim(0, 0.8)
+ ax[i, j].set_xlim(0, len(iterations))
+ ax[i, j].set_xlabel('Iterations')
+ ax[i, j].set_ylabel('mAP')
+ ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50))
+ # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000))
+ ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
+ ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05))
+ ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
+ # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2))
+ ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+    plt.tight_layout(pad=0, h_pad=1, w_pad=0)
+ # box = ax.get_position()
+ # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0))
+
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def table_values(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+ select = args.select
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
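+        # Evaluation statistics are logged every 2000 iterations, so the index of
+        # the requested checkpoint is iteration // 2000.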
+ idx = iteration // 2000
+ mAP = np.mean(statistics_dict['test'][idx]['average_precision'])
+ mAUC = np.mean(statistics_dict['test'][idx]['auc'])
+ dprime = d_prime(mAUC)
+
+ print('mAP: {:.3f}'.format(mAP))
+ print('mAUC: {:.3f}'.format(mAUC))
+ print('dprime: {:.3f}'.format(dprime))
+
+
+ if select == 'cnn13':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn5':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn9':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_decisionlevelmax':
+ iteration = 400000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_decisionlevelavg':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_decisionlevelatt':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_emb32':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_emb128':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_emb512':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_hop500':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_hop640':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_hop1000':
+ iteration = 540000
+ _load_metrics('main', 32000, 1024,
+ 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'mobilenetv1':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'mobilenetv2':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet18':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet34':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet50':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'dainet':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'leenet':
+ iteration = 540000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'leenet18':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet34_1d':
+ iteration = 500000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet50_1d':
+ iteration = 500000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'waveform_cnn2d':
+ iteration = 660000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'waveform_spandwav':
+ iteration = 700000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+
+def crop_label(label):
+ max_len = 16
+ if len(label) <= max_len:
+ return label
+ else:
+ words = label.split(' ')
+ cropped_label = ''
+ for w in words:
+ if len(cropped_label + ' ' + w) > max_len:
+ break
+ else:
+ cropped_label += ' {}'.format(w)
+        return cropped_label.lstrip()
+
+def add_comma(integer):
+    # Format an integer with a thousands separator, e.g. 12345 -> '12,345'.
+    return '{:,}'.format(int(integer))
+
+
+def plot_class_iteration(args):
+
+ # Arguments & parameters
+ workspace = args.workspace
+ select = args.select
+
+ save_out_path = 'results_map/class_iteration_map.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+ return statistics_dict
+
+ iteration = 600000
+ statistics_dict = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']])
+ mAP_mat = mAP_mat[0 : 300, :]
+ sorted_indexes = np.argsort(config.full_samples_per_class)[::-1]
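+    # Keep the first 300 evaluation points (~600k iterations); classes are
+    # ordered by number of training samples, descending.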
+
+
+ fig, axs = plt.subplots(1, 3, figsize=(20, 5))
+ ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)]
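+    # The three panels show the 10 most frequent, 10 mid-frequency and 10 least
+    # frequent classes, respectively.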
+ axs[0].set_ylabel('AP')
+
+ for col in range(0, 3):
+ axs[col].set_ylim(0, 1.)
+ axs[col].set_xlim(0, 301)
+ axs[col].set_xlabel('Iterations')
+ axs[col].set_ylabel('AP')
+ axs[col].xaxis.set_ticks(np.arange(0, 301, 100))
+ axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k'])
+ lines = []
+ for _ix in ranges[col]:
+ _label = crop_label(config.labels[sorted_indexes[_ix]]) + \
+ ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]]))
+ line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label)
+ lines.append(line)
+ box = axs[col].get_position()
+ axs[col].set_position([box.x0, box.y0, box.width * 1., box.height])
+ axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.))
+ axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+ plt.tight_layout(pad=4, w_pad=1, h_pad=1)
+ plt.savefig(save_out_path)
+ print(save_out_path)
+
+
+def _load_old_metrics(workspace, filename, iteration, data_type):
+
+ assert data_type in ['train', 'test']
+
+ stat_name = "stat_{}_iters.p".format(iteration)
+
+ # Load stats
+ stat_path = os.path.join(workspace, "stats", filename, data_type, stat_name)
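+    # Older statistics were pickled under Python 2; fall back to latin1 decoding
+    # when loading them under Python 3.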
+ try:
+ stats = cPickle.load(open(stat_path, 'rb'))
+ except:
+ stats = cPickle.load(open(stat_path, 'rb'), encoding='latin1')
+
+ precisions = [stat['precisions'] for stat in stats]
+ recalls = [stat['recalls'] for stat in stats]
+ maps = np.array([stat['AP'] for stat in stats])
+ aucs = np.array([stat['auc'] for stat in stats])
+
+ return {'average_precision': maps, 'AUC': aucs}
+
+def _sort(ys):
+ sorted_idxes = np.argsort(ys)
+ sorted_idxes = sorted_idxes[::-1]
+ sorted_ys = ys[sorted_idxes]
+ sorted_lbs = [config.labels[e] for e in sorted_idxes]
+ return sorted_ys, sorted_idxes, sorted_lbs
+
+def load_data(hdf5_path):
+ with h5py.File(hdf5_path, 'r') as hf:
+ x = hf['x'][:]
+ y = hf['y'][:]
+ video_id_list = list(hf['video_id_list'][:])
+ return x, y, video_id_list
+
+def get_avg_stats(workspace, bgn_iter, fin_iter, interval_iter, filename, data_type):
+
+ assert data_type in ['train', 'test']
+ bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5"
+ eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5"
+ unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5"
+
+ t1 = time.time()
+ if data_type == 'test':
+ (te_x, te_y, te_id_list) = load_data(eval_hdf5)
+ elif data_type == 'train':
+ (te_x, te_y, te_id_list) = load_data(bal_train_hdf5)
+ y = te_y
+
+ prob_dir = os.path.join(workspace, "probs", filename, data_type)
+ names = os.listdir(prob_dir)
+
+ probs = []
+ iters = range(bgn_iter, fin_iter, interval_iter)
+ for iter in iters:
+ pickle_path = os.path.join(prob_dir, "prob_%d_iters.p" % iter)
+ try:
+ prob = cPickle.load(open(pickle_path, 'rb'))
+ except:
+ prob = cPickle.load(open(pickle_path, 'rb'), encoding='latin1')
+ probs.append(prob)
+
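+    # Ensemble by averaging the predicted probabilities over the selected
+    # checkpoints, then compute class-wise metrics on the averaged predictions.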
+ avg_prob = np.mean(np.array(probs), axis=0)
+
+ n_out = y.shape[1]
+ stats = []
+ for k in range(n_out): # around 7 seconds
+ (precisions, recalls, thresholds) = metrics.precision_recall_curve(y[:, k], avg_prob[:, k])
+ avg_precision = metrics.average_precision_score(y[:, k], avg_prob[:, k], average=None)
+ (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], avg_prob[:, k])
+ auc = metrics.roc_auc_score(y[:, k], avg_prob[:, k], average=None)
+ # eer = pp_data.eer(avg_prob[:, k], y[:, k])
+
+ skip = 1000
+ dict = {'precisions': precisions[0::skip], 'recalls': recalls[0::skip], 'AP': avg_precision,
+ 'fpr': fpr[0::skip], 'fnr': 1. - tpr[0::skip], 'auc': auc}
+
+ stats.append(dict)
+
+ mAPs = np.array([e['AP'] for e in stats])
+ aucs = np.array([e['auc'] for e in stats])
+
+ print("Get avg time: {}".format(time.time() - t1))
+
+ return {'average_precision': mAPs, 'auc': aucs}
+
+
+def _samples_num_per_class():
+ bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5"
+ eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5"
+ unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5"
+
+ (x, y, id_list) = load_data(eval_hdf5)
+ eval_num = np.sum(y, axis=0)
+
+ (x, y, id_list) = load_data(bal_train_hdf5)
+ bal_num = np.sum(y, axis=0)
+
+ (x, y, id_list) = load_data(unbal_train_hdf5)
+ unbal_num = np.sum(y, axis=0)
+
+ return bal_num, unbal_num, eval_num
+
+
+def get_label_quality():
+
+ rate_csv = '/vol/vssp/msos/qk/workspaces/pub_audioset_tagging_cnn_transfer/metadata/qa_true_counts.csv'
+
+ with open(rate_csv, 'r') as f:
+ reader = csv.reader(f, delimiter=',')
+ lis = list(reader)
+
+ rates = []
+
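+    # Each row is assumed to hold (label, num_rated_clips, num_true_ratings);
+    # label quality is the fraction of ratings marked true, or None if unrated.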
+ for n in range(1, len(lis)):
+ li = lis[n]
+ if float(li[1]) == 0:
+ rate = None
+ else:
+ rate = float(li[2]) / float(li[1])
+ rates.append(rate)
+
+ return rates
+
+
+def summary_stats(args):
+ # Arguments & parameters
+ workspace = args.workspace
+
+ out_stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl')
+ create_folder(os.path.dirname(out_stat_path))
+
+ # Old workspace
+ old_workspace = '/vol/vssp/msos/qk/workspaces/audioset_classification'
+
+ # bal_train_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'train')
+ # eval_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'test')
+
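+    # Baseline: average the predicted probabilities of 9 checkpoints (iterations
+    # 10k to 50k, every 5k) from the previous instance-averaging system.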
+ bal_train_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, interval_iter=5000, filename='tmp127_re', data_type='train')
+ eval_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, interval_iter=5000, filename='tmp127_re', data_type='test')
+
+ maps0te = eval_metrics['average_precision']
+ (maps0te, sorted_idxes, sorted_lbs) = _sort(maps0te)
+
+ bal_num, unbal_num, eval_num = _samples_num_per_class()
+
+ output_dict = {
+ 'labels': config.labels,
+ 'label_quality': get_label_quality(),
+ 'sorted_indexes_for_plot': sorted_idxes,
+ 'official_balanced_trainig_samples': bal_num,
+ 'official_unbalanced_training_samples': unbal_num,
+ 'official_eval_samples': eval_num,
+ 'downloaded_full_training_samples': config.full_samples_per_class,
+ 'averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations':
+ {'bal_train': bal_train_metrics, 'eval': eval_metrics}
+ }
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration):
+ _workspace = '/vol/vssp/msos/qk/bytedance/workspaces_important/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(_workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ _idx = iteration // 2000
+ _dict = {'bal_train': {'average_precision': statistics_dict['bal'][_idx]['average_precision'],
+ 'auc': statistics_dict['bal'][_idx]['auc']},
+ 'eval': {'average_precision': statistics_dict['test'][_idx]['average_precision'],
+ 'auc': statistics_dict['test'][_idx]['auc']}}
+ return _dict
+
+ iteration = 600000
+ output_dict['cnn13_system_iteration60k'] = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ iteration = 560000
+ output_dict['mobilenetv1_system_iteration56k'] = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ cPickle.dump(output_dict, open(out_stat_path, 'wb'))
+ print('Write stats for paper to {}'.format(out_stat_path))
+
+
+def prepare_plot_long_4_rows(sorted_lbs):
+ N = len(sorted_lbs)
+
+ f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1,sharey=False, facecolor='w', figsize=(10, 12))
+
+ fontsize = 5
+
+ K = 132
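+    # Split the sound classes across the four rows, roughly 132 classes per row.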
+ ax1a.set_xlim(0, K)
+ ax2a.set_xlim(K, 2 * K)
+ ax3a.set_xlim(2 * K, 3 * K)
+ ax4a.set_xlim(3 * K, N)
+
+ truncated_sorted_lbs = []
+ for lb in sorted_lbs:
+ lb = lb[0 : 25]
+ words = lb.split(' ')
+ if len(words[-1]) < 3:
+ lb = ' '.join(words[0:-1])
+ truncated_sorted_lbs.append(lb)
+
+ ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+
+ ax1a.set_yscale('log')
+ ax2a.set_yscale('log')
+ ax3a.set_yscale('log')
+ ax4a.set_yscale('log')
+
+ ax1b = ax1a.twinx()
+ ax2b = ax2a.twinx()
+ ax3b = ax3a.twinx()
+ ax4b = ax4a.twinx()
+ ax1b.set_ylim(0., 1.)
+ ax2b.set_ylim(0., 1.)
+ ax3b.set_ylim(0., 1.)
+ ax4b.set_ylim(0., 1.)
+ ax1b.set_ylabel('Average precision')
+ ax2b.set_ylabel('Average precision')
+ ax3b.set_ylabel('Average precision')
+ ax4b.set_ylabel('Average precision')
+
+ ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+
+ ax1a.xaxis.set_ticks(np.arange(K))
+ ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize)
+ ax1a.xaxis.tick_bottom()
+ ax1a.set_ylabel("Number of audio clips")
+
+ ax2a.xaxis.set_ticks(np.arange(K, 2*K))
+ ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize)
+ ax2a.xaxis.tick_bottom()
+ # ax2a.tick_params(left='off', which='both')
+ ax2a.set_ylabel("Number of audio clips")
+
+ ax3a.xaxis.set_ticks(np.arange(2*K, 3*K))
+ ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize)
+ ax3a.xaxis.tick_bottom()
+ ax3a.set_ylabel("Number of audio clips")
+
+ ax4a.xaxis.set_ticks(np.arange(3*K, N))
+ ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize)
+ ax4a.xaxis.tick_bottom()
+ # ax4a.tick_params(left='off', which='both')
+ ax4a.set_ylabel("Number of audio clips")
+
+ ax1a.spines['right'].set_visible(False)
+ ax1b.spines['right'].set_visible(False)
+ ax2a.spines['left'].set_visible(False)
+ ax2b.spines['left'].set_visible(False)
+ ax2a.spines['right'].set_visible(False)
+ ax2b.spines['right'].set_visible(False)
+ ax3a.spines['left'].set_visible(False)
+ ax3b.spines['left'].set_visible(False)
+ ax3a.spines['right'].set_visible(False)
+ ax3b.spines['right'].set_visible(False)
+ ax4a.spines['left'].set_visible(False)
+ ax4b.spines['left'].set_visible(False)
+
+ plt.subplots_adjust(hspace = 0.8)
+
+ return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b
+
+def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
+ N = len(x)
+ ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+
+def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""):
+ N = len(x)
+ ax.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label)
+ return line
+
+def plot_long_fig(args):
+ # Arguments & parameters
+ workspace = args.workspace
+
+ # Paths
+ stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl')
+ save_out_path = 'results/long_fig.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Stats
+ stats = cPickle.load(open(stat_path, 'rb'))
+
+ N = len(config.labels)
+ sorted_indexes = stats['sorted_indexes_for_plot']
+ sorted_labels = np.array(config.labels)[sorted_indexes]
+ audio_clips_per_class = stats['official_balanced_trainig_samples'] + stats['official_unbalanced_training_samples']
+ audio_clips_per_class = audio_clips_per_class[sorted_indexes]
+
+ (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
+
+    # Plot the per-class number of audio clips on all four row axes
+ ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+
+ maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
+ maps_avg_instances = maps_avg_instances[sorted_indexes]
+
+ maps_cnn13 = stats['cnn13_system_iteration60k']['eval']['average_precision']
+ maps_cnn13 = maps_cnn13[sorted_indexes]
+
+ maps_mobilenetv1 = stats['mobilenetv1_system_iteration56k']['eval']['average_precision']
+ maps_mobilenetv1 = maps_mobilenetv1[sorted_indexes]
+
+ maps_logmel_wavegram_cnn = _load_metrics0_classwise('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ maps_logmel_wavegram_cnn = maps_logmel_wavegram_cnn[sorted_indexes]
+
+ _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k')
+ _scatter_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, s=5, c='r')
+ _scatter_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b')
+ _scatter_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, s=5, c='g')
+
+ linewidth = 0.7
+ line0te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k', linewidth=linewidth, label='AP with averaging instances (baseline)')
+ line1te = _plot_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, c='r', linewidth=linewidth, label='AP with CNN14')
+ line2te = _plot_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b', linewidth=linewidth, label='AP with MobileNetV1')
+ line3te = _plot_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN')
+
+ label_quality = stats['label_quality']
+ sorted_rate = np.array(label_quality)[sorted_indexes]
+ for k in range(len(sorted_rate)):
+ if sorted_rate[k] and sorted_rate[k] == 1:
+ sorted_rate[k] = 0.99
+
+ ax1b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax2b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax3b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+')
+ line_label_quality = ax4b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality')
+ ax1b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax2b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax3b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax4b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+
+ plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1)
+
+ plt.savefig(save_out_path)
+ print('Save fig to {}'.format(save_out_path))
+
+def plot_flops(args):
+
+ # Arguments & parameters
+ workspace = args.workspace
+
+ # Paths
+ save_out_path = 'results_map/flops.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
+
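+    # Hard-coded multi-adds and mAP values for each model, used for the scatter
+    # plot and its annotations below.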
+ model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
+ 'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
+ 'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
+ flops = np.array([21.986, 21.986, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
+ 30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
+ mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
+ 0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
+
+    ax.scatter(flops, mAPs)
+
+ shift = [[1, 0.002], [1, -0.006], [-1, -0.014], [-2, 0.006], [-7, 0.006],
+ [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
+ [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
+
+ for i, model_type in enumerate(model_types):
+ ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
+
+ ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
+ ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
+ ax.plot(flops[[6, 7]], mAPs[[6, 7]])
+ ax.plot(flops[[9, 10]], mAPs[[9, 10]])
+ ax.plot(flops[[11, 12]], mAPs[[11, 12]])
+ ax.plot(flops[[13, 14]], mAPs[[13, 14]])
+
+ ax.set_xlim(0, 70)
+ ax.set_ylim(0.2, 0.5)
+ ax.set_xlabel('Multi-adds (million)')
+ ax.set_ylabel('mAP')
+
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+
+ plt.savefig(save_out_path)
+ print('Write out figure to {}'.format(save_out_path))
+
+
+def spearman(args):
+
+ # Arguments & parameters
+ workspace = args.workspace
+
+ # Paths
+ stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl')
+
+ # Stats
+ stats = cPickle.load(open(stat_path, 'rb'))
+
+ label_quality = np.array([qu if qu else 0.5 for qu in stats['label_quality']])
+ training_samples = np.array(stats['official_balanced_trainig_samples']) + \
+ np.array(stats['official_unbalanced_training_samples'])
+ mAP = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
+
+ import scipy
+ samples_spearman = scipy.stats.spearmanr(training_samples, mAP)[0]
+ quality_spearman = scipy.stats.spearmanr(label_quality, mAP)[0]
+
+ print('Training samples spearman: {:.3f}'.format(samples_spearman))
+ print('Quality spearman: {:.3f}'.format(quality_spearman))
+
+
+def print_results(args):
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ #
+ (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+
+ # partial
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ # Sample rate
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+
+ # Mel bins
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+
+
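+# Example invocations (script name and paths are illustrative):
+#   python3 plot_statistics.py plot_for_paper --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --select=2_all
+#   python3 plot_statistics.py summary_stats --workspace=$WORKSPACE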
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_plot = subparsers.add_parser('plot')
+ parser_plot.add_argument('--dataset_dir', type=str, required=True)
+ parser_plot.add_argument('--workspace', type=str, required=True)
+ parser_plot.add_argument('--select', type=str, required=True)
+
+ parser_plot = subparsers.add_parser('plot_for_paper')
+ parser_plot.add_argument('--dataset_dir', type=str, required=True)
+ parser_plot.add_argument('--workspace', type=str, required=True)
+ parser_plot.add_argument('--select', type=str, required=True)
+
+ parser_plot = subparsers.add_parser('plot_for_paper2')
+ parser_plot.add_argument('--dataset_dir', type=str, required=True)
+    parser_plot.add_argument('--workspace', type=str, required=True)
+
+    parser_values = subparsers.add_parser('table_values')
+    parser_values.add_argument('--dataset_dir', type=str, required=True)
+    parser_values.add_argument('--workspace', type=str, required=True)
+    parser_values.add_argument('--select', type=str, required=True)
+
+ parser_values = subparsers.add_parser('plot_class_iteration')
+ parser_values.add_argument('--workspace', type=str, required=True)
+ parser_values.add_argument('--select', type=str, required=True)
+
+ parser_summary_stats = subparsers.add_parser('summary_stats')
+ parser_summary_stats.add_argument('--workspace', type=str, required=True)
+
+ parser_plot_long = subparsers.add_parser('plot_long_fig')
+ parser_plot_long.add_argument('--workspace', type=str, required=True)
+
+ parser_plot_flops = subparsers.add_parser('plot_flops')
+ parser_plot_flops.add_argument('--workspace', type=str, required=True)
+
+ parser_spearman = subparsers.add_parser('spearman')
+ parser_spearman.add_argument('--workspace', type=str, required=True)
+
+ parser_print = subparsers.add_parser('print')
+ parser_print.add_argument('--workspace', type=str, required=True)
+
+ args = parser.parse_args()
+
+ if args.mode == 'plot':
+ plot(args)
+
+ elif args.mode == 'plot_for_paper':
+ plot_for_paper(args)
+
+ elif args.mode == 'plot_for_paper2':
+ plot_for_paper2(args)
+
+ elif args.mode == 'table_values':
+ table_values(args)
+
+ elif args.mode == 'plot_class_iteration':
+ plot_class_iteration(args)
+
+ elif args.mode == 'summary_stats':
+ summary_stats(args)
+
+ elif args.mode == 'plot_long_fig':
+ plot_long_fig(args)
+
+ elif args.mode == 'plot_flops':
+ plot_flops(args)
+
+ elif args.mode == 'spearman':
+ spearman(args)
+
+ elif args.mode == 'print':
+ print_results(args)
+
+ else:
+        raise Exception('Incorrect argument: {}'.format(args.mode))
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/utilities.py b/audio_detection/audio_infer/utils/utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d1604579b88e7e1e79f6350376f89d9c1c85f44
--- /dev/null
+++ b/audio_detection/audio_infer/utils/utilities.py
@@ -0,0 +1,172 @@
+import os
+import logging
+import h5py
+import soundfile
+import librosa
+import numpy as np
+import pandas as pd
+from scipy import stats
+import datetime
+import pickle
+
+
+def create_folder(fd):
+ if not os.path.exists(fd):
+ os.makedirs(fd)
+
+
+def get_filename(path):
+ path = os.path.realpath(path)
+ na_ext = path.split('/')[-1]
+ na = os.path.splitext(na_ext)[0]
+ return na
+
+
+def get_sub_filepaths(folder):
+ paths = []
+ for root, dirs, files in os.walk(folder):
+ for name in files:
+ path = os.path.join(root, name)
+ paths.append(path)
+ return paths
+
+
+def create_logging(log_dir, filemode):
+ create_folder(log_dir)
+ i1 = 0
+
+ while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
+ i1 += 1
+
+ log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
+ logging.basicConfig(
+ level=logging.DEBUG,
+ format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
+ datefmt='%a, %d %b %Y %H:%M:%S',
+ filename=log_path,
+ filemode=filemode)
+
+ # Print to console
+ console = logging.StreamHandler()
+ console.setLevel(logging.INFO)
+ formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
+ console.setFormatter(formatter)
+ logging.getLogger('').addHandler(console)
+
+ return logging
+
+
+def read_metadata(csv_path, classes_num, id_to_ix):
+ """Read metadata of AudioSet from a csv file.
+
+ Args:
+ csv_path: str
+
+ Returns:
+ meta_dict: {'audio_name': (audios_num,), 'target': (audios_num, classes_num)}
+ """
+
+ with open(csv_path, 'r') as fr:
+ lines = fr.readlines()
+ lines = lines[3:] # Remove heads
+
+ audios_num = len(lines)
+    targets = np.zeros((audios_num, classes_num), dtype=bool)  # np.bool was removed from NumPy
+ audio_names = []
+
+ for n, line in enumerate(lines):
+ items = line.split(', ')
+ """items: ['--4gqARaEJE', '0.000', '10.000', '"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"\n']"""
+
+        audio_name = 'Y{}.wav'.format(items[0])  # downloaded audio files are prefixed with an extra 'Y'
+ label_ids = items[3].split('"')[1].split(',')
+
+ audio_names.append(audio_name)
+
+ # Target
+ for id in label_ids:
+ ix = id_to_ix[id]
+ targets[n, ix] = 1
+
+ meta_dict = {'audio_name': np.array(audio_names), 'target': targets}
+ return meta_dict
+
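+# Illustrative call (hypothetical paths; id_to_ix maps AudioSet label ids to indices):
+#   id_to_ix = {label_id: ix for ix, label_id in enumerate(all_label_ids)}
+#   meta = read_metadata('balanced_train_segments.csv', classes_num=527, id_to_ix=id_to_ix)
+#   meta['target'].shape   # (audios_num, 527) boolean multi-hot matrix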
+
+def float32_to_int16(x):
+ assert np.max(np.abs(x)) <= 1.2
+ x = np.clip(x, -1, 1)
+ return (x * 32767.).astype(np.int16)
+
+def int16_to_float32(x):
+ return (x / 32767.).astype(np.float32)
+
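+# Illustrative round trip (assumes the waveform is already normalised to [-1, 1]):
+#   int16_to_float32(float32_to_int16(x)) recovers x up to ~1/32767 quantisation error.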
+
+def pad_or_truncate(x, audio_length):
+ """Pad all audio to specific length."""
+ if len(x) <= audio_length:
+ return np.concatenate((x, np.zeros(audio_length - len(x))), axis=0)
+ else:
+ return x[0 : audio_length]
+
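+# Illustrative behaviour: pad_or_truncate always returns exactly `audio_length` samples, e.g.
+#   pad_or_truncate(np.ones(3), 5) -> array([1., 1., 1., 0., 0.])
+#   pad_or_truncate(np.ones(8), 5) -> array([1., 1., 1., 1., 1.])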
+
+def d_prime(auc):
+ d_prime = stats.norm().ppf(auc) * np.sqrt(2.0)
+ return d_prime
+
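+# Quick sanity check: d' = sqrt(2) * Phi^{-1}(AUC), so d_prime(0.5) == 0.0 and
+# d_prime(0.9) is roughly 1.81.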
+
+class Mixup(object):
+ def __init__(self, mixup_alpha, random_seed=1234):
+ """Mixup coefficient generator.
+ """
+ self.mixup_alpha = mixup_alpha
+ self.random_state = np.random.RandomState(random_seed)
+
+ def get_lambda(self, batch_size):
+ """Get mixup random coefficients.
+ Args:
+ batch_size: int
+ Returns:
+ mixup_lambdas: (batch_size,)
+ """
+ mixup_lambdas = []
+ for n in range(0, batch_size, 2):
+ lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
+ mixup_lambdas.append(lam)
+ mixup_lambdas.append(1. - lam)
+
+ return np.array(mixup_lambdas)
+
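+# Minimal usage sketch (illustrative only; assumes a NumPy batch whose consecutive
+# rows form the pairs to be mixed):
+#   mixup = Mixup(mixup_alpha=1.0)
+#   lam = mixup.get_lambda(batch_size=32)   # shape (32,), each consecutive pair sums to 1
+#   mixed = lam[0::2, None] * x[0::2] + lam[1::2, None] * x[1::2]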
+
+class StatisticsContainer(object):
+ def __init__(self, statistics_path):
+ """Contain statistics of different training iterations.
+ """
+ self.statistics_path = statistics_path
+
+ self.backup_statistics_path = '{}_{}.pkl'.format(
+ os.path.splitext(self.statistics_path)[0],
+ datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+
+ self.statistics_dict = {'bal': [], 'test': []}
+
+ def append(self, iteration, statistics, data_type):
+ statistics['iteration'] = iteration
+ self.statistics_dict[data_type].append(statistics)
+
+ def dump(self):
+ pickle.dump(self.statistics_dict, open(self.statistics_path, 'wb'))
+ pickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb'))
+ logging.info(' Dump statistics to {}'.format(self.statistics_path))
+ logging.info(' Dump statistics to {}'.format(self.backup_statistics_path))
+
+ def load_state_dict(self, resume_iteration):
+ self.statistics_dict = pickle.load(open(self.statistics_path, 'rb'))
+
+ resume_statistics_dict = {'bal': [], 'test': []}
+
+ for key in self.statistics_dict.keys():
+ for statistics in self.statistics_dict[key]:
+ if statistics['iteration'] <= resume_iteration:
+ resume_statistics_dict[key].append(statistics)
+
+ self.statistics_dict = resume_statistics_dict
\ No newline at end of file
diff --git a/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc b/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0982846c06b554669d8f290a24eb2fdb172893a
Binary files /dev/null and b/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc differ
diff --git a/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc b/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c90bde06c05575070a8743337c5b2bc4e139be3b
Binary files /dev/null and b/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc differ
diff --git a/audio_detection/target_sound_detection/src/models.py b/audio_detection/target_sound_detection/src/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..3016b9274aeb86091d30d980803c7106f15ddd54
--- /dev/null
+++ b/audio_detection/target_sound_detection/src/models.py
@@ -0,0 +1,1288 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2021/3/9 16:33
+# @Author : dongchao yang
+# @File : train.py
+from itertools import zip_longest
+import numpy as np
+from scipy import ndimage
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import time
+from torchlibrosa.augmentation import SpecAugmentation
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+import math
+from sklearn.cluster import KMeans
+import os
+import time
+from functools import partial
+# import timm
+# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+import warnings
+from functools import partial
+# from timm.models.registry import register_model
+# from timm.models.vision_transformer import _cfg
+# from mmdet.utils import get_root_logger
+# from mmcv.runner import load_checkpoint
+# from mmcv.runner import _load_checkpoint, load_state_dict
+# import mmcv.runner
+import copy
+from collections import OrderedDict
+import io
+import re
+DEBUG=0
+event_labels = ['Alarm', 'Alarm_clock', 'Animal', 'Applause', 'Arrow', 'Artillery_fire',
+ 'Babbling', 'Baby_laughter', 'Bark', 'Basketball_bounce', 'Battle_cry',
+ 'Bell', 'Bird', 'Bleat', 'Bouncing', 'Breathing', 'Buzz', 'Camera',
+ 'Cap_gun', 'Car', 'Car_alarm', 'Cat', 'Caw', 'Cheering', 'Child_singing',
+ 'Choir', 'Chop', 'Chopping_(food)', 'Clapping', 'Clickety-clack', 'Clicking',
+ 'Clip-clop', 'Cluck', 'Coin_(dropping)', 'Computer_keyboard', 'Conversation',
+ 'Coo', 'Cough', 'Cowbell', 'Creak', 'Cricket', 'Croak', 'Crow', 'Crowd', 'DTMF',
+ 'Dog', 'Door', 'Drill', 'Drip', 'Engine', 'Engine_starting', 'Explosion', 'Fart',
+ 'Female_singing', 'Filing_(rasp)', 'Finger_snapping', 'Fire', 'Fire_alarm', 'Firecracker',
+ 'Fireworks', 'Frog', 'Gasp', 'Gears', 'Giggle', 'Glass', 'Glass_shatter', 'Gobble', 'Groan',
+ 'Growling', 'Hammer', 'Hands', 'Hiccup', 'Honk', 'Hoot', 'Howl', 'Human_sounds', 'Human_voice',
+ 'Insect', 'Laughter', 'Liquid', 'Machine_gun', 'Male_singing', 'Mechanisms', 'Meow', 'Moo',
+ 'Motorcycle', 'Mouse', 'Music', 'Oink', 'Owl', 'Pant', 'Pant_(dog)', 'Patter', 'Pig', 'Plop',
+ 'Pour', 'Power_tool', 'Purr', 'Quack', 'Radio', 'Rain_on_surface', 'Rapping', 'Rattle',
+ 'Reversing_beeps', 'Ringtone', 'Roar', 'Run', 'Rustle', 'Scissors', 'Scrape', 'Scratch',
+ 'Screaming', 'Sewing_machine', 'Shout', 'Shuffle', 'Shuffling_cards', 'Singing',
+ 'Single-lens_reflex_camera', 'Siren', 'Skateboard', 'Sniff', 'Snoring', 'Speech',
+ 'Speech_synthesizer', 'Spray', 'Squeak', 'Squeal', 'Steam', 'Stir', 'Surface_contact',
+ 'Tap', 'Tap_dance', 'Telephone_bell_ringing', 'Television', 'Tick', 'Tick-tock', 'Tools',
+ 'Train', 'Train_horn', 'Train_wheels_squealing', 'Truck', 'Turkey', 'Typewriter', 'Typing',
+ 'Vehicle', 'Video_game_sound', 'Water', 'Whimper_(dog)', 'Whip', 'Whispering', 'Whistle',
+ 'Whistling', 'Whoop', 'Wind', 'Writing', 'Yip', 'and_pans', 'bird_song', 'bleep', 'clink',
+ 'cock-a-doodle-doo', 'crinkling', 'dove', 'dribble', 'eructation', 'faucet', 'flapping_wings',
+ 'footsteps', 'gunfire', 'heartbeat', 'infant_cry', 'kid_speaking', 'man_speaking', 'mastication',
+ 'mice', 'river', 'rooster', 'silverware', 'skidding', 'smack', 'sobbing', 'speedboat', 'splatter',
+ 'surf', 'thud', 'thwack', 'toot', 'truck_horn', 'tweet', 'vroom', 'waterfowl', 'woman_speaking']
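+# Note (added for clarity): load_checkpoint below relies on mmcv's _load_checkpoint
+# and load_state_dict helpers, whose imports are commented out near the top of this
+# file; install mmcv and re-enable those imports before calling it.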
+def load_checkpoint(model,
+ filename,
+ map_location=None,
+ strict=False,
+ logger=None,
+ revise_keys=[(r'^module\.', '')]):
+ """Load checkpoint from a file or URI.
+ Args:
+ model (Module): Module to load checkpoint.
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str): Same as :func:`torch.load`.
+ strict (bool): Whether to allow different params for the model and
+ checkpoint.
+ logger (:mod:`logging.Logger` or None): The logger for error message.
+ revise_keys (list): A list of customized keywords to modify the
+ state_dict in checkpoint. Each item is a (pattern, replacement)
+ pair of the regular expression operations. Default: strip
+ the prefix 'module.' by [(r'^module\\.', '')].
+ Returns:
+ dict or OrderedDict: The loaded checkpoint.
+ """
+
+ checkpoint = _load_checkpoint(filename, map_location, logger)
+ '''
+ new_proj = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=2).unsqueeze(2).repeat(1,1,3,1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=3).unsqueeze(3).repeat(1,1,1,3))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ '''
+ new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ # OrderedDict is a subclass of dict
+ if not isinstance(checkpoint, dict):
+ raise RuntimeError(
+ f'No state_dict found in checkpoint file {filename}')
+ # get state_dict from checkpoint
+ if 'state_dict' in checkpoint:
+ state_dict = checkpoint['state_dict']
+ else:
+ state_dict = checkpoint
+
+ # strip prefix of state_dict
+ metadata = getattr(state_dict, '_metadata', OrderedDict())
+ for p, r in revise_keys:
+ state_dict = OrderedDict(
+ {re.sub(p, r, k): v
+ for k, v in state_dict.items()})
+ state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()})
+ # Keep metadata in state_dict
+ state_dict._metadata = metadata
+
+ # load state_dict
+ load_state_dict(model, state_dict, strict, logger)
+ return checkpoint
+
+def init_weights(m):
+ if isinstance(m, (nn.Conv2d, nn.Conv1d)):
+ nn.init.kaiming_normal_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ if isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+def init_layer(layer):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.xavier_uniform_(layer.weight)
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+
+def init_bn(bn):
+ """Initialize a Batchnorm layer. """
+ bn.bias.data.fill_(0.)
+ bn.weight.data.fill_(1.)
+
+class MaxPool(nn.Module):
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ return torch.max(decision, dim=self.pooldim)[0]
+
+
+class LinearSoftPool(nn.Module):
+ """LinearSoftPool
+ Linear softmax, takes logits and returns a probability, near to the actual maximum value.
+ Taken from the paper:
+ A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
+ https://arxiv.org/abs/1810.09050
+ """
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, time_decision):
+ return (time_decision**2).sum(self.pooldim) / (time_decision.sum(
+ self.pooldim)+1e-7)
+
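+# Illustrative note (added): for frame-wise probabilities p_t, LinearSoftPool returns
+# sum(p_t^2) / sum(p_t), a self-weighted mean that lies between the mean and the max,
+# e.g. p = [0.1, 0.9] pools to 0.82.
+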
+class ConvBlock(nn.Module):
+ def __init__(self, in_channels, out_channels):
+
+ super(ConvBlock, self).__init__()
+
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.conv2 = nn.Conv2d(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.bn1 = nn.BatchNorm2d(out_channels)
+ self.bn2 = nn.BatchNorm2d(out_channels)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.conv1)
+ init_layer(self.conv2)
+ init_bn(self.bn1)
+ init_bn(self.bn2)
+
+
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+
+ x = input
+ x = F.relu_(self.bn1(self.conv1(x)))
+ x = F.relu_(self.bn2(self.conv2(x)))
+ if pool_type == 'max':
+ x = F.max_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg':
+ x = F.avg_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg+max':
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
+ x = x1 + x2
+ else:
+ raise Exception('Incorrect argument!')
+
+ return x
+
+class ConvBlock_GLU(nn.Module):
+ def __init__(self, in_channels, out_channels,kernel_size=(3,3)):
+ super(ConvBlock_GLU, self).__init__()
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size, stride=(1, 1),
+ padding=(1, 1), bias=False)
+ self.bn1 = nn.BatchNorm2d(out_channels)
+ self.sigmoid = nn.Sigmoid()
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.conv1)
+ init_bn(self.bn1)
+
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+ x = input
+ x = self.bn1(self.conv1(x))
+ cnn1 = self.sigmoid(x[:, :x.shape[1]//2, :, :])
+ cnn2 = x[:,x.shape[1]//2:,:,:]
+ x = cnn1*cnn2
+ if pool_type == 'max':
+ x = F.max_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg':
+ x = F.avg_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg+max':
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
+ x = x1 + x2
+ elif pool_type == 'None':
+ pass
+ elif pool_type == 'LP':
+ pass
+ #nn.LPPool2d(4, pool_size)
+ else:
+ raise Exception('Incorrect argument!')
+ return x
+
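+# Note (added for clarity): ConvBlock_GLU is a gated linear unit -- the first half of
+# the conv output channels is passed through a sigmoid and gates the second half, so
+# out_channels must be even and the block's effective output width is out_channels // 2.
+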
+class Mul_scale_GLU(nn.Module):
+ def __init__(self):
+ super(Mul_scale_GLU,self).__init__()
+ self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) # 1*1
+ self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) # 3*3
+ self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) # 5*5
+ self.conv_block2 = ConvBlock_GLU(in_channels=96, out_channels=128*2)
+ # self.conv_block3 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock_GLU(in_channels=128, out_channels=128*2)
+ self.conv_block4 = ConvBlock_GLU(in_channels=128, out_channels=256*2)
+ self.conv_block5 = ConvBlock_GLU(in_channels=256, out_channels=256*2)
+ self.conv_block6 = ConvBlock_GLU(in_channels=256, out_channels=512*2)
+ self.conv_block7 = ConvBlock_GLU(in_channels=512, out_channels=512*2)
+ self.padding = nn.ReplicationPad2d((0,1,0,1))
+
+ def forward(self, input, fi=None):
+ """
+ Input: (batch_size, data_length)"""
+ x1 = self.conv_block1_1(input, pool_size=(2, 2), pool_type='avg')
+ x1 = x1[:,:,:500,:32]
+ #print('x1 ',x1.shape)
+ x2 = self.conv_block1_2(input,pool_size=(2,2),pool_type='avg')
+ #print('x2 ',x2.shape)
+ x3 = self.conv_block1_3(input,pool_size=(2,2),pool_type='avg')
+ x3 = self.padding(x3)
+ #print('x3 ',x3.shape)
+ # assert 1==2
+ x = torch.cat([x1,x2],dim=1)
+ x = torch.cat([x,x3],dim=1)
+ #print('x ',x.shape)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='None')
+ x = self.conv_block3(x,pool_size=(2,2),pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training) #
+ #print('x2,3 ',x.shape)
+ x = self.conv_block4(x, pool_size=(2, 4), pool_type='None')
+ x = self.conv_block5(x,pool_size=(2,4),pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ #print('x4,5 ',x.shape)
+
+ x = self.conv_block6(x, pool_size=(1, 4), pool_type='None')
+ x = self.conv_block7(x, pool_size=(1, 4), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ # print('x6,7 ',x.shape)
+ # assert 1==2
+ return x
+
+class Cnn14(nn.Module):
+ def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, mel_bins=64, fmin=50,
+ fmax=14000, classes_num=527):
+
+ super(Cnn14, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+ self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+ self.fc1 = nn.Linear(2048, 128, bias=True)
+ self.fc_audioset = nn.Linear(128, classes_num, bias=True)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.fc1)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input_, mixup_lambda=None):
+ """
+        Input: (batch_size, time_steps, mel_bins)"""
+ input_ = input_.unsqueeze(1)
+ x = self.conv_block1(input_, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block5(x, pool_size=(1, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block6(x, pool_size=(1, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ # print(x.shape)
+ # x = torch.mean(x, dim=3)
+ x = x.transpose(1, 2).contiguous().flatten(-2)
+ x = self.fc1(x)
+ # print(x.shape)
+ # assert 1==2
+ # (x1,_) = torch.max(x, dim=2)
+ # x2 = torch.mean(x, dim=2)
+ # x = x1 + x2
+ # x = F.dropout(x, p=0.5, training=self.training)
+ # x = F.relu_(self.fc1(x))
+ # embedding = F.dropout(x, p=0.5, training=self.training)
+ return x
+
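+# Note (added for clarity): in this file Cnn14.forward returns a sequence of frame-level
+# 128-dim embeddings (fc_audioset is initialised but never applied), so the class serves
+# as the reference-audio encoder rather than a clip-level AudioSet classifier.
+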
+class Cnn10_fi(nn.Module):
+ def __init__(self):
+ super(Cnn10_fi, self).__init__()
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+
+ # self.fc1 = nn.Linear(512, 512, bias=True)
+ # self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ # self.init_weight()
+
+ def forward(self, input, fi=None):
+ """
+ Input: (batch_size, data_length)"""
+
+ x = self.conv_block1(input, pool_size=(2, 2), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=(2, 4), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=(1, 4), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ return x
+
+class Cnn10_mul_scale(nn.Module):
+ def __init__(self,scale=8):
+ super(Cnn10_mul_scale, self).__init__()
+ self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1))
+ self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3))
+ self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5))
+ self.conv_block2 = ConvBlock(in_channels=96, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.scale = scale
+ self.padding = nn.ReplicationPad2d((0,1,0,1))
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+ """
+ Input: (batch_size, data_length)"""
+ if self.scale == 8:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (2,4)
+ pool_size4 = (1,4)
+ elif self.scale == 4:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ elif self.scale == 2:
+ pool_size1 = (2,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ else:
+ pool_size1 = (1,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ # print('input ',input.shape)
+ x1 = self.conv_block1_1(input, pool_size=pool_size1, pool_type='avg')
+ x1 = x1[:,:,:500,:32]
+ #print('x1 ',x1.shape)
+ x2 = self.conv_block1_2(input, pool_size=pool_size1, pool_type='avg')
+ #print('x2 ',x2.shape)
+ x3 = self.conv_block1_3(input, pool_size=pool_size1, pool_type='avg')
+ x3 = self.padding(x3)
+ #print('x3 ',x3.shape)
+ # assert 1==2
+ m_i = min(x3.shape[2],min(x1.shape[2],x2.shape[2]))
+ #print('m_i ', m_i)
+ x = torch.cat([x1[:,:,:m_i,:],x2[:,:, :m_i,:],x3[:,:, :m_i,:]],dim=1)
+ # x = torch.cat([x,x3],dim=1)
+
+ # x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ return x
+
+
+class Cnn10(nn.Module):
+ def __init__(self,scale=8):
+ super(Cnn10, self).__init__()
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.scale = scale
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+ """
+ Input: (batch_size, data_length)"""
+ if self.scale == 8:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (2,4)
+ pool_size4 = (1,4)
+ elif self.scale == 4:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ elif self.scale == 2:
+ pool_size1 = (2,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ else:
+ pool_size1 = (1,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ return x
+
+class MeanPool(nn.Module):
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ return torch.mean(decision, dim=self.pooldim)
+
+class ResPool(nn.Module):
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+ self.linPool = LinearSoftPool(pooldim=1)
+
+class AutoExpPool(nn.Module):
+ def __init__(self, outputdim=10, pooldim=1):
+ super().__init__()
+ self.outputdim = outputdim
+        self.alpha = nn.Parameter(torch.full((outputdim,), 1.0))  # float fill value so the parameter is trainable
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ scaled = self.alpha * decision # \alpha * P(Y|x) in the paper
+ return (logits * torch.exp(scaled)).sum(
+ self.pooldim) / torch.exp(scaled).sum(self.pooldim)
+
+
+class SoftPool(nn.Module):
+ def __init__(self, T=1, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+ self.T = T
+
+ def forward(self, logits, decision):
+ w = torch.softmax(decision / self.T, dim=self.pooldim)
+ return torch.sum(decision * w, dim=self.pooldim)
+
+
+class AutoPool(nn.Module):
+    """AutoPool: softmax pooling over time with a learnable per-class scaling (alpha)."""
+ def __init__(self, outputdim=10, pooldim=1):
+ super().__init__()
+ self.outputdim = outputdim
+ self.alpha = nn.Parameter(torch.ones(outputdim))
+ self.dim = pooldim
+
+ def forward(self, logits, decision):
+ scaled = self.alpha * decision # \alpha * P(Y|x) in the paper
+ weight = torch.softmax(scaled, dim=self.dim)
+ return torch.sum(decision * weight, dim=self.dim) # B x C
+
+
+class ExtAttentionPool(nn.Module):
+ def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
+ super().__init__()
+ self.inputdim = inputdim
+ self.outputdim = outputdim
+ self.pooldim = pooldim
+ self.attention = nn.Linear(inputdim, outputdim)
+ nn.init.zeros_(self.attention.weight)
+ nn.init.zeros_(self.attention.bias)
+ self.activ = nn.Softmax(dim=self.pooldim)
+
+ def forward(self, logits, decision):
+ # Logits of shape (B, T, D), decision of shape (B, T, C)
+ w_x = self.activ(self.attention(logits) / self.outputdim)
+ h = (logits.permute(0, 2, 1).contiguous().unsqueeze(-2) *
+ w_x.unsqueeze(-1)).flatten(-2).contiguous()
+ return torch.sum(h, self.pooldim)
+
+
+class AttentionPool(nn.Module):
+    """AttentionPool: pools frame decisions with attention weights learned from the features."""
+ def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
+ super().__init__()
+ self.inputdim = inputdim
+ self.outputdim = outputdim
+ self.pooldim = pooldim
+ self.transform = nn.Linear(inputdim, outputdim)
+ self.activ = nn.Softmax(dim=self.pooldim)
+ self.eps = 1e-7
+
+ def forward(self, logits, decision):
+ # Input is (B, T, D)
+ # B, T , D
+ w = self.activ(torch.clamp(self.transform(logits), -15, 15))
+ detect = (decision * w).sum(
+ self.pooldim) / (w.sum(self.pooldim) + self.eps)
+ # B, T, D
+ return detect
+
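+# Note (added for clarity): clamping the pre-softmax attention scores to [-15, 15]
+# above keeps the softmax weights numerically stable before the weighted average.
+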
+class Block2D(nn.Module):
+ def __init__(self, cin, cout, kernel_size=3, padding=1):
+ super().__init__()
+ self.block = nn.Sequential(
+ nn.BatchNorm2d(cin),
+ nn.Conv2d(cin,
+ cout,
+ kernel_size=kernel_size,
+ padding=padding,
+ bias=False),
+ nn.LeakyReLU(inplace=True, negative_slope=0.1))
+
+ def forward(self, x):
+ return self.block(x)
+
+class AudioCNN(nn.Module):
+ def __init__(self, classes_num):
+ super(AudioCNN, self).__init__()
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.fc1 = nn.Linear(512,128,bias=True)
+ self.fc = nn.Linear(128, classes_num, bias=True)
+ self.init_weights()
+
+ def init_weights(self):
+ init_layer(self.fc)
+
+ def forward(self, input):
+ '''
+ Input: (batch_size, times_steps, freq_bins)'''
+ # [128, 801, 168] --> [128,1,801,168]
+ x = input[:, None, :, :]
+ '''(batch_size, 1, times_steps, freq_bins)'''
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') # 128,64,400,84
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') # 128,128,200,42
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') # 128,256,100,21
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') # 128,512,50,10
+ '''(batch_size, feature_maps, time_steps, freq_bins)'''
+ x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) # 128,512,50
+ (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 128,512
+ x = self.fc1(x) # 128,128
+ output = self.fc(x) # 128,10
+ return x,output
+
+ def extract(self,input):
+ '''Input: (batch_size, times_steps, freq_bins)'''
+ x = input[:, None, :, :]
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+ '''(batch_size, feature_maps, time_steps, freq_bins)'''
+ x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes)
+ (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps)
+ x = self.fc1(x) # 128,128
+ return x
+
+def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
+ """parse_poolingfunction
+    A helper function to parse any temporal pooling function.
+ Pooling is done on dimension 1
+ :param poolingfunction_name:
+ :param **kwargs:
+ """
+ poolingfunction_name = poolingfunction_name.lower()
+ if poolingfunction_name == 'mean':
+ return MeanPool(pooldim=1)
+ elif poolingfunction_name == 'max':
+ return MaxPool(pooldim=1)
+ elif poolingfunction_name == 'linear':
+ return LinearSoftPool(pooldim=1)
+ elif poolingfunction_name == 'expalpha':
+ return AutoExpPool(outputdim=kwargs['outputdim'], pooldim=1)
+
+ elif poolingfunction_name == 'soft':
+ return SoftPool(pooldim=1)
+ elif poolingfunction_name == 'auto':
+ return AutoPool(outputdim=kwargs['outputdim'])
+    elif poolingfunction_name == 'attention':
+        return AttentionPool(inputdim=kwargs['inputdim'],
+                             outputdim=kwargs['outputdim'])
+    else:
+        raise ValueError('Unknown pooling function: {}'.format(poolingfunction_name))
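+
+# Illustrative usage (hypothetical dimensions): parse_poolingfunction('attention',
+# inputdim=256, outputdim=10) builds an AttentionPool whose forward pools
+# (logits: B x T x 256, decision: B x T x 10) over the time axis (dim 1).
+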
+class conv1d(nn.Module):
+ def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID', dilation=1):
+ super(conv1d, self).__init__()
+ if padding == 'VALID':
+ dconv_pad = 0
+ elif padding == 'SAME':
+ dconv_pad = dilation * ((kernel_size - 1) // 2)
+ else:
+ raise ValueError("Padding Mode Error!")
+ self.conv = nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride, padding=dconv_pad)
+ self.act = nn.ReLU()
+ self.init_layer(self.conv)
+
+ def init_layer(self, layer, nonlinearity='relu'):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity)
+ nn.init.constant_(layer.bias, 0.1)
+
+ def forward(self, x):
+ out = self.act(self.conv(x))
+ return out
+
+class Atten_1(nn.Module):
+ def __init__(self, input_dim, context=2, dropout_rate=0.2):
+ super(Atten_1, self).__init__()
+ self._matrix_k = nn.Linear(input_dim, input_dim // 4)
+ self._matrix_q = nn.Linear(input_dim, input_dim // 4)
+ self.relu = nn.ReLU()
+ self.context = context
+ self._dropout_layer = nn.Dropout(dropout_rate)
+ self.init_layer(self._matrix_k)
+ self.init_layer(self._matrix_q)
+
+ def init_layer(self, layer, nonlinearity='leaky_relu'):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity)
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+ def forward(self, input_x):
+ k_x = input_x
+ k_x = self.relu(self._matrix_k(k_x))
+ k_x = self._dropout_layer(k_x)
+ # print('k_x ',k_x.shape)
+ q_x = input_x[:, self.context, :]
+ # print('q_x ',q_x.shape)
+ q_x = q_x[:, None, :]
+ # print('q_x1 ',q_x.shape)
+ q_x = self.relu(self._matrix_q(q_x))
+ q_x = self._dropout_layer(q_x)
+ # print('q_x2 ',q_x.shape)
+ x_ = torch.matmul(k_x, q_x.transpose(-2, -1) / math.sqrt(k_x.size(-1)))
+ # print('x_ ',x_.shape)
+ x_ = x_.squeeze(2)
+ alpha = F.softmax(x_, dim=-1)
+ att_ = alpha
+ # print('alpha ',alpha)
+ alpha = alpha.unsqueeze(2).repeat(1,1,input_x.shape[2])
+ # print('alpha ',alpha)
+ # alpha = alpha.view(alpha.size(0), alpha.size(1), alpha.size(2), 1)
+ out = alpha * input_x
+ # print('out ', out.shape)
+ # out = out.mean(2)
+ out = out.mean(1)
+ # print('out ',out.shape)
+ # assert 1==2
+ #y = alpha * input_x
+ #return y, att_
+ out = input_x[:, self.context, :] + out
+ return out
+
+class Fusion(nn.Module):
+ def __init__(self, inputdim, inputdim2, n_fac):
+ super().__init__()
+ self.fuse_layer1 = conv1d(inputdim, inputdim2*n_fac,1)
+ self.fuse_layer2 = conv1d(inputdim2, inputdim2*n_fac,1)
+        self.avg_pool = nn.AvgPool1d(n_fac, stride=n_fac)  # average-pool along the last (feature) dimension
+
+ def forward(self,embedding,mix_embed):
+ embedding = embedding.permute(0,2,1)
+        fuse1_out = self.fuse_layer1(embedding)  # e.g. [2, 501, 2560] (512 * 5): 1-D conv that expands the speaker embedding
+        fuse1_out = fuse1_out.permute(0,2,1)
+
+        mix_embed = mix_embed.permute(0,2,1)
+        fuse2_out = self.fuse_layer2(mix_embed)  # e.g. [2, 501, 2560] (512 * 5): 1-D conv that expands the mixture features
+        fuse2_out = fuse2_out.permute(0,2,1)
+        as_embs = torch.mul(fuse1_out, fuse2_out)  # element-wise product, [2, 501, 2560]
+        # (10, 501, 512)
+        as_embs = self.avg_pool(as_embs)  # [2, 501, 512], i.e. 2560 // 5
+ return as_embs
+
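+# Minimal shape sketch (hypothetical sizes): with Fusion(128, 512, n_fac=2), an
+# embedding of shape (B, T, 128) and mixture features of shape (B, T, 512) are each
+# projected to (B, T, 1024) by 1x1 convolutions, multiplied element-wise, and
+# average-pooled back down to (B, T, 512).
+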
+class CDur_fusion(nn.Module):
+ def __init__(self, inputdim, outputdim, **kwargs):
+ super().__init__()
+ self.features = nn.Sequential(
+ Block2D(1, 32),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(32, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(128, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (1, 4)),
+ nn.Dropout(0.3),
+ )
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+ self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True)
+        self.fusion = Fusion(128, 128, 2)  # assumed dims: original code passed Fusion(128, 2), which does not match Fusion(inputdim, inputdim2, n_fac)
+ self.fc = nn.Linear(256,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = self.fusion(embedding,x)
+ #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur(nn.Module):
+ def __init__(self, inputdim, outputdim,time_resolution, **kwargs):
+ super().__init__()
+ self.features = nn.Sequential(
+ Block2D(1, 32),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(32, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(128, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ nn.Dropout(0.3),
+ )
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+ self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True)
+ self.fc = nn.Linear(512,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_big(nn.Module):
+ def __init__(self, inputdim, outputdim, **kwargs):
+ super().__init__()
+ self.features = nn.Sequential(
+ Block2D(1, 64),
+ Block2D(64, 64),
+ nn.LPPool2d(4, (2, 2)),
+ Block2D(64, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 2)),
+ Block2D(128, 256),
+ Block2D(256, 256),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(256, 512),
+ Block2D(512, 512),
+ nn.LPPool2d(4, (1, 4)),
+ nn.Dropout(0.3),)
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_GLU(nn.Module):
+ def __init__(self, inputdim, outputdim, **kwargs):
+ super().__init__()
+ self.features = Mul_scale_GLU()
+ # with torch.no_grad():
+ # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ self.gru = nn.GRU(640, 512,1, bidirectional=True, batch_first=True) # previous is 640
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_CNN14(nn.Module):
+ def __init__(self, inputdim, outputdim,time_resolution,**kwargs):
+ super().__init__()
+ if time_resolution==125:
+ self.features = Cnn10(8)
+ elif time_resolution == 250:
+ #print('time_resolution ',time_resolution)
+ self.features = Cnn10(4)
+ elif time_resolution == 500:
+ self.features = Cnn10(2)
+ else:
+ self.features = Cnn10(0)
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ # self.features = Cnn10()
+ self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None):
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_CNN_mul_scale(nn.Module):
+ def __init__(self, inputdim, outputdim,time_resolution,**kwargs):
+ super().__init__()
+ if time_resolution==125:
+ self.features = Cnn10_mul_scale(8)
+ elif time_resolution == 250:
+ #print('time_resolution ',time_resolution)
+ self.features = Cnn10_mul_scale(4)
+ elif time_resolution == 500:
+ self.features = Cnn10_mul_scale(2)
+ else:
+ self.features = Cnn10_mul_scale(0)
+ # with torch.no_grad():
+ # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ # self.features = Cnn10()
+ self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None):
+ # print('x ',x.shape)
+ # assert 1==2
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_CNN_mul_scale_fusion(nn.Module):
+ def __init__(self, inputdim, outputdim, time_resolution,**kwargs):
+ super().__init__()
+ if time_resolution==125:
+ self.features = Cnn10_mul_scale(8)
+ elif time_resolution == 250:
+ #print('time_resolution ',time_resolution)
+ self.features = Cnn10_mul_scale(4)
+ elif time_resolution == 500:
+ self.features = Cnn10_mul_scale(2)
+ else:
+ self.features = Cnn10_mul_scale(0)
+ # with torch.no_grad():
+ # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ # self.features = Cnn10()
+ self.gru = nn.GRU(512, 512, bidirectional=True, batch_first=True)
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.fusion = Fusion(128,512,2)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None):
+ # print('x ',x.shape)
+ # assert 1==2
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = self.fusion(embedding, x)
+ #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+
+class RaDur_fusion(nn.Module):
+ def __init__(self, model_config, inputdim, outputdim, time_resolution, **kwargs):
+ super().__init__()
+ self.encoder = Cnn14()
+ self.detection = CDur_CNN_mul_scale_fusion(inputdim, outputdim, time_resolution)
+ self.softmax = nn.Softmax(dim=2)
+ #self.temperature = 5
+ # if model_config['pre_train']:
+ # self.encoder.load_state_dict(torch.load(model_config['encoder_path'])['model'])
+ # self.detection.load_state_dict(torch.load(model_config['CDur_path']))
+
+ self.q = nn.Linear(128,128)
+ self.k = nn.Linear(128,128)
+ self.q_ee = nn.Linear(128, 128)
+ self.k_ee = nn.Linear(128, 128)
+ self.temperature = 11.3 # sqrt(128)
+ self.att_pool = model_config['att_pool']
+ self.enhancement = model_config['enhancement']
+ self.tao = model_config['tao']
+ self.top = model_config['top']
+ self.bn = nn.BatchNorm1d(128)
+ self.EE_fusion = Fusion(128, 128, 4)
+
+ def get_w(self,q,k):
+ q = self.q(q)
+ k = self.k(k)
+ q = q.unsqueeze(1)
+ attn = torch.bmm(q, k.transpose(1, 2))
+ attn = attn/self.temperature
+ attn = self.softmax(attn)
+ return attn
+
+ def get_w_ee(self,q,k):
+ q = self.q_ee(q)
+ k = self.k_ee(k)
+ q = q.unsqueeze(1)
+ attn = torch.bmm(q, k.transpose(1, 2))
+ attn = attn/self.temperature
+ attn = self.softmax(attn)
+ return attn
+
+ def attention_pooling(self, embeddings, mean_embedding):
+ att_pool_w = self.get_w(mean_embedding,embeddings)
+ embedding = torch.bmm(att_pool_w, embeddings).squeeze(1)
+ # print(embedding.shape)
+ # print(att_pool_w.shape)
+ # print(att_pool_w[0])
+ # assert 1==2
+ return embedding
+
+ def select_topk_embeddings(self, scores, embeddings, k):
+        _, idx_DESC = scores.sort(descending=True, dim=1)  # sort frames by score
+ top_k = _[:,:k]
+ # print('top_k ', top_k)
+ # top_k = top_k.mean(1)
+        idx_topk = idx_DESC[:, :k]  # indices of the top-k frames
+ # print('index ', idx_topk)
+ idx_topk = idx_topk.unsqueeze(2).expand([-1, -1, embeddings.shape[2]])
+ selected_embeddings = torch.gather(embeddings, 1, idx_topk)
+ return selected_embeddings,top_k
+
+ def sum_with_attention(self, embedding, top_k, selected_embeddings):
+ # print('embedding ',embedding)
+ # print('selected_embeddings ',selected_embeddings.shape)
+ att_1 = self.get_w_ee(embedding, selected_embeddings)
+ att_1 = att_1.squeeze(1)
+ #print('att_1 ',att_1.shape)
+ larger = top_k > self.tao
+ # print('larger ',larger)
+ top_k = top_k*larger
+ # print('top_k ',top_k.shape)
+ # print('top_k ',top_k)
+ att_1 = att_1*top_k
+ #print('att_1 ',att_1.shape)
+ # assert 1==2
+ att_2 = att_1.unsqueeze(2).repeat(1,1,128)
+ Es = selected_embeddings*att_2
+ return Es
+
+ def orcal_EE(self, x, embedding, label):
+ batch, time, dim = x.shape
+
+ mixture_embedding = self.encoder(x) # 8, 125, 128
+ mixture_embedding = mixture_embedding.transpose(1,2)
+ mixture_embedding = self.bn(mixture_embedding)
+ mixture_embedding = mixture_embedding.transpose(1,2)
+
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.detection.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding_pre = embedding.unsqueeze(1)
+ embedding_pre = embedding_pre.repeat(1, x.shape[1], 1)
+ f = self.detection.fusion(embedding_pre, x) # the first stage results
+ #f = torch.cat((x, embedding_pre), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.detection.gru.flatten_parameters()
+ f, _ = self.detection.gru(f) # x torch.Size([16, 125, 256])
+ f = self.detection.fc(f)
+ decision_time = torch.softmax(self.detection.outputlayer(f),dim=2) # x torch.Size([16, 125, 2])
+
+ selected_embeddings, top_k = self.select_topk_embeddings(decision_time[:,:,0], mixture_embedding, self.top)
+
+ selected_embeddings = self.sum_with_attention(embedding, top_k, selected_embeddings) # add the weight
+
+ mix_embedding = selected_embeddings.mean(1).unsqueeze(1) #
+ mix_embedding = mix_embedding.repeat(1, x.shape[1], 1)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+        mix_embedding = self.EE_fusion(mix_embedding, embedding)  # fuse the two embeddings with the learned Fusion module
+        # mix_embedding2 = selected_embeddings2.mean(1)
+        # mix_embedding = embedding + mix_embedding  # simple additive fusion (disabled)
+ # new detection results
+ # embedding_now = mix_embedding.unsqueeze(1)
+ # embedding_now = embedding_now.repeat(1, x.shape[1], 1)
+ f_now = self.detection.fusion(mix_embedding, x)
+ #f_now = torch.cat((x, embedding_now), dim=2) #
+ f_now, _ = self.detection.gru(f_now) # x torch.Size([16, 125, 256])
+ f_now = self.detection.fc(f_now)
+ decision_time_now = torch.softmax(self.detection.outputlayer(f_now), dim=2) # x torch.Size([16, 125, 2])
+
+        top_k = top_k.mean(1)  # average score; higher-scoring clips get more weight
+ larger = top_k > self.tao
+ top_k = top_k * larger
+ top_k = top_k/2.0
+ # print('top_k ',top_k)
+ # assert 1==2
+ # print('tok_k[ ',top_k.shape)
+ # print('decision_time ',decision_time.shape)
+ # print('decision_time_now ',decision_time_now.shape)
+ neg_w = top_k.unsqueeze(1).unsqueeze(2)
+ neg_w = neg_w.repeat(1, decision_time_now.shape[1], decision_time_now.shape[2])
+ # print('neg_w ',neg_w.shape)
+ #print('neg_w ',neg_w[:,0:10,0])
+ pos_w = 1-neg_w
+ #print('pos_w ',pos_w[:,0:10,0])
+ decision_time_final = decision_time*pos_w + neg_w*decision_time_now
+ #print('decision_time_final ',decision_time_final[0,0:10,0])
+ # print(decision_time_final[0,:,:])
+ #assert 1==2
+ return decision_time_final
+
+ def forward(self, x, ref, label=None):
+ batch, time, dim = x.shape
+ logit = torch.zeros(1).cuda()
+ embeddings = self.encoder(ref)
+ mean_embedding = embeddings.mean(1)
+        if self.att_pool:
+ mean_embedding = self.bn(mean_embedding)
+ embeddings = embeddings.transpose(1,2)
+ embeddings = self.bn(embeddings)
+ embeddings = embeddings.transpose(1,2)
+ embedding = self.attention_pooling(embeddings, mean_embedding)
+ else:
+ embedding = mean_embedding
+        if self.enhancement:
+ decision_time = self.orcal_EE(x, embedding, label)
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+                align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0], decision_up, logit
+
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.detection.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ x = self.detection.fusion(embedding, x)
+ # embedding = embedding.unsqueeze(1)
+ # embedding = embedding.repeat(1, x.shape[1], 1)
+ # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.detection.gru.flatten_parameters()
+ x, _ = self.detection.gru(x) # x torch.Size([16, 125, 256])
+ x = self.detection.fc(x)
+ decision_time = torch.softmax(self.detection.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2),
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0], decision_up, logit
\ No newline at end of file
diff --git a/audio_detection/target_sound_detection/src/utils.py b/audio_detection/target_sound_detection/src/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf1deeaef4e51fcc7cc42f4f3e2d9a34296371f9
--- /dev/null
+++ b/audio_detection/target_sound_detection/src/utils.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2021/3/9 16:33
+# @Author : dongchao yang
+# @File : train.py
+
+import collections.abc
+import sys
+from loguru import logger
+from pprint import pformat
+
+import numpy as np
+import pandas as pd
+import scipy
+import six
+import sklearn.preprocessing as pre
+import torch
+import tqdm
+import yaml
+
+from scipy.interpolate import interp1d
+
+def parse_config_or_kwargs(config_file, **kwargs):
+ """parse_config_or_kwargs
+ :param config_file: Config file that has parameters, yaml format
+ :param **kwargs: Other alternative parameters or overwrites for config
+ """
+ with open(config_file) as con_read:
+ yaml_config = yaml.load(con_read, Loader=yaml.FullLoader)
+ arguments = dict(yaml_config, **kwargs)
+ return arguments
+
+
+def find_contiguous_regions(activity_array):  # an equivalent O(n) loop also works if the vectorised version below is unclear
+ """Find contiguous regions from bool valued numpy.array.
+ Copy of https://dcase-repo.github.io/dcase_util/_modules/dcase_util/data/decisions.html#DecisionEncoder
+ Reason is:
+ 1. This does not belong to a class necessarily
+ 2. Import DecisionEncoder requires sndfile over some other imports..which causes some problems on clusters
+ """
+ change_indices = np.logical_xor(activity_array[1:], activity_array[:-1]).nonzero()[0]
+ change_indices += 1
+ if activity_array[0]:
+ # If the first element of activity_array is True add 0 at the beginning
+ change_indices = np.r_[0, change_indices]
+
+ if activity_array[-1]:
+ # If the last element of activity_array is True, add the length of the array
+ change_indices = np.r_[change_indices, activity_array.size]
+ # print(change_indices.reshape((-1, 2)))
+ # Reshape the result into two columns
+ return change_indices.reshape((-1, 2))
+
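+# Example (illustrative):
+#   act = np.array([0, 1, 1, 0, 0, 1], dtype=bool)
+#   find_contiguous_regions(act) -> array([[1, 3], [5, 6]])
+# i.e. half-open [onset, offset) index pairs of the active regions.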
+
+def split_train_cv(
+ data_frame: pd.DataFrame,
+ frac: float = 0.9,
+ y=None, # Only for stratified, computes necessary split
+ **kwargs):
+ """split_train_cv
+
+ :param data_frame:
+ :type data_frame: pd.DataFrame
+ :param frac:
+ :type frac: float
+ """
+ if kwargs.get('mode',
+ None) == 'urbansed': # Filenames are DATA_-1 DATA_-2 etc
+ data_frame.loc[:, 'id'] = data_frame.groupby(
+ data_frame['filename'].str.split('_').apply(
+ lambda x: '_'.join(x[:-1]))).ngroup()
+ sampler = np.random.permutation(data_frame['id'].nunique())
+ num_train = int(frac * len(sampler))
+ train_indexes = sampler[:num_train]
+ cv_indexes = sampler[num_train:]
+ train_data = data_frame[data_frame['id'].isin(train_indexes)]
+ cv_data = data_frame[data_frame['id'].isin(cv_indexes)]
+ del train_data['id']
+ del cv_data['id']
+ elif kwargs.get('mode', None) == 'stratified':
+ # Use stratified sampling
+ from skmultilearn.model_selection import iterative_train_test_split
+ index_train, _, index_cv, _ = iterative_train_test_split(
+ data_frame.index.values.reshape(-1, 1), y, test_size=1. - frac)
+ train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
+ cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())] # cv --> cross validation
+ else:
+ # Simply split train_test
+ train_data = data_frame.sample(frac=frac, random_state=10)
+ cv_data = data_frame[~data_frame.index.isin(train_data.index)]
+ return train_data, cv_data
+
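+# Usage sketch (illustrative, label_df is a placeholder DataFrame with a 'filename' column):
+#   train_df, cv_df = split_train_cv(label_df, frac=0.9)                   # random 90/10 split
+#   train_df, cv_df = split_train_cv(label_df, frac=0.9, mode='urbansed')  # split by base file id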
+
+
+def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'): # print yaml file
+ """pprint_dict
+ :param outputfun: function to use, defaults to sys.stdout
+ :param in_dict: dict to print
+ """
+ if formatter == 'yaml':
+ format_fun = yaml.dump
+ elif formatter == 'pretty':
+ format_fun = pformat
+ for line in format_fun(in_dict).split('\n'):
+ outputfun(line)
+
+
+def getfile_outlogger(outputfile):
+ log_format = "[{time:YYYY-MM-DD HH:mm:ss}] {message}"
+ logger.configure(handlers=[{"sink": sys.stderr, "format": log_format}])
+ if outputfile:
+ logger.add(outputfile, enqueue=True, format=log_format)
+ return logger
+
+# build the label encoder from the given labels
+def train_labelencoder(labels: pd.Series, sparse=True):
+ """encode_labels
+
+ Encodes labels
+
+ :param labels: pd.Series representing the raw labels e.g., Speech, Water
+ :param encoder (optional): Encoder already fitted
+ returns encoded labels (many hot) and the encoder
+ """
+ assert isinstance(labels, pd.Series), "Labels need to be series"
+ if isinstance(labels[0], six.string_types):
+ # In case of using non processed strings, e.g., Vacuum, Speech
+ label_array = labels.str.split(',').values.tolist() # split label according to ','
+ elif isinstance(labels[0], np.ndarray):
+ # Encoder does not like to see numpy array
+ label_array = [lab.tolist() for lab in labels]
+ elif isinstance(labels[0], collections.abc.Iterable):
+ label_array = labels
+ encoder = pre.MultiLabelBinarizer(sparse_output=sparse)
+ encoder.fit(label_array)
+ return encoder
+
+
+def encode_labels(labels: pd.Series, encoder=None, sparse=True):
+ """encode_labels
+
+ Encodes labels
+
+ :param labels: pd.Series representing the raw labels e.g., Speech, Water
+ :param encoder (optional): Encoder already fitted
+ returns encoded labels (many hot) and the encoder
+ """
+ assert isinstance(labels, pd.Series), "Labels need to be series"
+ instance = labels.iloc[0]
+ if isinstance(instance, six.string_types):
+ # In case of using non processed strings, e.g., Vacuum, Speech
+ label_array = labels.str.split(',').values.tolist()
+ elif isinstance(instance, np.ndarray):
+ # Encoder does not like to see numpy array
+ label_array = [lab.tolist() for lab in labels]
+ elif isinstance(instance, collections.abc.Iterable):
+ label_array = labels
+ # label_array is now a list of label lists (string labels)
+ if not encoder:
+ encoder = pre.MultiLabelBinarizer(sparse_output=sparse) # if no encoder is given, fit a new one first
+ encoder.fit(label_array)
+ labels_encoded = encoder.transform(label_array) # transform string labels to multi-hot vectors
+ return labels_encoded, encoder
+
+ # return pd.arrays.SparseArray(
+ # [row.toarray().ravel() for row in labels_encoded]), encoder
+
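+# Example (illustrative):
+#   labels = pd.Series(["Speech,Dog", "Water"])
+#   y, enc = encode_labels(labels, sparse=False)
+#   # y -> [[1, 1, 0], [0, 0, 1]] with enc.classes_ == ['Dog', 'Speech', 'Water']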
+
+def decode_with_timestamps(events, labels: np.array):
+ """decode_with_timestamps
+ Decodes the predicted label array (2d) into a list of
+ [(Labelname, onset, offset), ...]
+
+ :param events: event name(s) the predictions correspond to
+ :param labels: n-dim binary prediction array
+ :type labels: np.array
+ """
+ # print('events ',events)
+ # print('labels ',labels.shape)
+ #assert 1==2
+ if labels.ndim == 2:
+ #print('...')
+ return [_decode_with_timestamps(events[i],labels[i]) for i in range(labels.shape[0])]
+ else:
+ return _decode_with_timestamps(events,labels)
+
+
+def median_filter(x, window_size, threshold=0.5):
+ """median_filter
+ :param x: input prediction array of shape (B, T, C) or (B, T).
+ Input is a sequence of probabilities 0 <= x <= 1
+ :param window_size: An integer to use
+ :param threshold: Binary thresholding threshold
+ """
+ x = binarize(x, threshold=threshold) # convert to 0 or 1
+ if x.ndim == 3:
+ size = (1, window_size, 1)
+ elif x.ndim == 2 and x.shape[0] == 1:
+ # Assume input is class-specific median filtering
+ # E.g, Batch x Time [1, 501]
+ size = (1, window_size)
+ elif x.ndim == 2 and x.shape[0] > 1:
+ # Assume input is standard median pooling, class-independent
+ # E.g., Time x Class [501, 10]
+ size = (window_size, 1)
+ return scipy.ndimage.median_filter(x, size=size)
+
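+# Usage sketch (illustrative): for frame-level probabilities of shape (batch, time, class),
+#   smoothed = median_filter(probs, window_size=5, threshold=0.5)
+# first binarizes the probabilities and then median-filters each class track along time.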
+
+def _decode_with_timestamps(events,labels):
+ result_labels = []
+ # print('.......')
+ # print('labels ',labels.shape)
+ # print(labels)
+ change_indices = find_contiguous_regions(labels)
+ # print(change_indices)
+ # assert 1==2
+ for row in change_indices:
+ result_labels.append((events,row[0], row[1]))
+ return result_labels
+
+def inverse_transform_labels(encoder, pred):
+ if pred.ndim == 3:
+ return [encoder.inverse_transform(x) for x in pred]
+ else:
+ return encoder.inverse_transform(pred)
+
+
+def binarize(pred, threshold=0.5):
+ # Batch_wise
+ if pred.ndim == 3:
+ return np.array(
+ [pre.binarize(sub, threshold=threshold) for sub in pred])
+ else:
+ return pre.binarize(pred, threshold=threshold)
+
+
+def double_threshold(x, high_thres, low_thres, n_connect=1):
+ """double_threshold
+ Helper function to calculate double threshold for n-dim arrays
+
+ :param x: input array
+ :param high_thres: high threshold value
+ :param low_thres: Low threshold value
+ :param n_connect: Distance of <= n clusters will be merged
+ """
+ assert x.ndim <= 3, "Whoops something went wrong with the input ({}), check if its <= 3 dims".format(
+ x.shape)
+ if x.ndim == 3:
+ apply_dim = 1
+ elif x.ndim < 3:
+ apply_dim = 0
+ # x is assumed to be 3d: (batch, time, dim)
+ # Assumed to be 2d : (time, dim)
+ # Assumed to be 1d : (time)
+ # time axis is therefore at 1 for 3d and 0 for 2d (
+ return np.apply_along_axis(lambda x: _double_threshold(
+ x, high_thres, low_thres, n_connect=n_connect),
+ axis=apply_dim,
+ arr=x)
+
+
+def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True): # in essence, double thresholding refines event boundaries around confident frames
+ """_double_threshold
+ Computes a double threshold over the input array
+
+ :param x: input array, needs to be 1d
+ :param high_thres: High threshold over the array
+ :param low_thres: Low threshold over the array
+ :param n_connect: Postprocessing, maximal distance between clusters to connect
+ :param return_arr: If return_arr = True (the default) an array of the same size as x filled with ones and zeros is returned, otherwise the filtered index pairs are returned.
+ """
+ assert x.ndim == 1, "Input needs to be 1d"
+ high_locations = np.where(x > high_thres)[0] # indices where the value exceeds high_thres
+ locations = x > low_thres # boolean mask of values above low_thres
+ encoded_pairs = find_contiguous_regions(locations)
+ # print('encoded_pairs ',encoded_pairs)
+ filtered_list = list(
+ filter(
+ lambda pair:
+ ((pair[0] <= high_locations) & (high_locations <= pair[1])).any(),
+ encoded_pairs)) # keep only the pairs that contain at least one high-threshold frame
+ #print('filtered_list ',filtered_list)
+ filtered_list = connect_(filtered_list, n_connect) # merge pairs whose gap is <= n_connect
+ if return_arr:
+ zero_one_arr = np.zeros_like(x, dtype=int)
+ for sl in filtered_list:
+ zero_one_arr[sl[0]:sl[1]] = 1
+ return zero_one_arr
+ return filtered_list
+
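+# Worked example (illustrative): with x = [0.1, 0.2, 0.3, 0.8, 0.4, 0.1, 0.3, 0.9, 0.4],
+# high_thres=0.7, low_thres=0.2 and n_connect=1, the low-threshold regions [2, 5) and
+# [6, 9) each contain a high-threshold frame and are merged (gap of 1 <= n_connect),
+# so _double_threshold returns array([0, 0, 1, 1, 1, 1, 1, 1, 1]).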
+
+def connect_clusters(x, n=1):
+ if x.ndim == 1:
+ return connect_clusters_(x, n)
+ if x.ndim >= 2:
+ return np.apply_along_axis(lambda a: connect_clusters_(a, n=n), -2, x)
+
+
+def connect_clusters_(x, n=1):
+ """connect_clusters_
+ Connects clustered predictions (0,1) in x with range n
+
+ :param x: Input array. zero-one format
+ :param n: Number of frames to skip until connection can be made
+ """
+ assert x.ndim == 1, "input needs to be 1d"
+ reg = find_contiguous_regions(x)
+ start_end = connect_(reg, n=n)
+ zero_one_arr = np.zeros_like(x, dtype=int)
+ for sl in start_end:
+ zero_one_arr[sl[0]:sl[1]] = 1
+ return zero_one_arr
+
+
+def connect_(pairs, n=1):
+ """connect_
+ Connects two adjacent clusters if their distance is <= n
+
+ :param pairs: Clusters of iterateables e.g., [(1,5),(7,10)]
+ :param n: distance between two clusters
+ """
+ if len(pairs) == 0:
+ return []
+ start_, end_ = pairs[0]
+ new_pairs = []
+ for i, (next_item, cur_item) in enumerate(zip(pairs[1:], pairs[0:])):
+ end_ = next_item[1]
+ if next_item[0] - cur_item[1] <= n:
+ pass
+ else:
+ new_pairs.append((start_, cur_item[1]))
+ start_ = next_item[0]
+ new_pairs.append((start_, end_))
+ return new_pairs
+
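+# Example (illustrative): connect_([(1, 5), (7, 10)], n=1) keeps the clusters apart
+# (gap of 2 frames), while connect_([(1, 5), (7, 10)], n=2) merges them into [(1, 10)].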
+
+def predictions_to_time(df, ratio):
+ df.onset = df.onset * ratio
+ df.offset = df.offset * ratio
+ return df
+
+def upgrade_resolution(arr, scale):
+ # print('arr ', arr.shape)
+ x = np.arange(0, arr.shape[0])
+ f = interp1d(x, arr, kind='linear', axis=0, fill_value='extrapolate')
+ scale_x = np.arange(0, arr.shape[0], 1 / scale)
+ up_scale = f(scale_x)
+ return up_scale
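+# Example (illustrative): upgrade_resolution(np.array([0.0, 1.0, 0.0]), scale=2)
+# -> array([0. , 0.5, 1. , 0.5, 0. , -0.5]), i.e. linear interpolation (and
+# extrapolation for the last point) onto a grid that is `scale` times denser.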
+# a = [0.1,0.2,0.3,0.8,0.4,0.1,0.3,0.9,0.4]
+# a = np.array(a)
+# b = a>0.2
+# _double_threshold(a,0.7,0.2)
\ No newline at end of file
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth
new file mode 100644
index 0000000000000000000000000000000000000000..30ee4a84d0ad9ada87a5ec32dc40ec789e559e82
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e4525ad12621117c3a0fcfe974fd55e51583cd219106bf510438f4bec4edc18
+size 140604911
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth
new file mode 100644
index 0000000000000000000000000000000000000000..23719b4c8deee6c6bcac7d7704f6ced56fa289e1
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1331dab1e4c3ac2bc5850156f2000a95fe333bdf06d08ce9b490550726548ab0
+size 2479
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3bae9021caa4dd01659303bc05d2227436e7a64d
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9b44e30c4800462c177806bbd7009953d70d531c873e3791ca9aa85375d524d
+size 343538489
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth
new file mode 100644
index 0000000000000000000000000000000000000000..80e1bacdfbba7071092e562b4ddfb1d8fbee6e83
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de482358747778181e4dc530ec61ae94f53ae0b202ac92e99491fe4ceb3cbb1c
+size 255398
diff --git a/audio_to_text/__init__.py b/audio_to_text/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_to_text/__pycache__/__init__.cpython-38.pyc b/audio_to_text/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd50a50ca70fb2a85f608f8dddd11a6abb7b807d
Binary files /dev/null and b/audio_to_text/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc b/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbe230a79bcf0de51c959381e83483d5a9f322b8
Binary files /dev/null and b/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc differ
diff --git a/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dcdbfafa6487b60aeb8e60f7ad80da2cd1150308
--- /dev/null
+++ b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml
@@ -0,0 +1,23 @@
+model:
+ encoder:
+ type: Cnn14RnnEncoder
+ args:
+ sample_rate: 32000
+ pretrained: ./audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
+ freeze_cnn: True
+ freeze_cnn_bn: True
+ bidirectional: True
+ dropout: 0.5
+ hidden_size: 256
+ num_layers: 3
+ decoder:
+ type: TransformerDecoder
+ args:
+ attn_emb_dim: 512
+ dropout: 0.2
+ emb_dim: 256
+ fc_emb_dim: 512
+ nlayers: 2
+ type: TransformerModel
+ args: {}
+
diff --git a/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
new file mode 100644
index 0000000000000000000000000000000000000000..916026e45ca268db286047dacb1161a6a91a9613
--- /dev/null
+++ b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d22099e1025baae0f32ce09ec02c3d5fea001e295512fbf8754b5c66db21b0ec
+size 43027289
diff --git a/audio_to_text/captioning/__init__.py b/audio_to_text/captioning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6c2da4d396315c560620b86eb2737a07e067ee9
Binary files /dev/null and b/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__init__.py b/audio_to_text/captioning/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7259d671aaa8a7278b5aaa12069dc25caaad3cd8
--- /dev/null
+++ b/audio_to_text/captioning/models/__init__.py
@@ -0,0 +1,3 @@
+from .base_model import *
+from .transformer_model import *
+
diff --git a/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c08c874fac4d909a82f27f959e743a4aba5436a8
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c38109625aa375a8953c1adb9e8493ba1c592dcb
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8610e38e506ce60292444561bc0a7652bf2d718f
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9848a5dd6fd832d108179372880bed510ebc7da
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a00468fcad9293ac03d90700e32320a3fa9e474
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c9b1b8f984d37e0daed0fc541737be2f24a5e94
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06caef1f8b20de29821f255f2bf3263b5aa65211
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d383220b793d5a36727995d722ff8bbb7affbab
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..597cea7bac6d491e52c98c1f4e9f5f0ee9659e24
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab5c1da309ac502cbce9dffb00956d5c668b63b
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/base_model.py b/audio_to_text/captioning/models/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd014e9b9e68fc80f44179ccbbe066791ecdd7c0
--- /dev/null
+++ b/audio_to_text/captioning/models/base_model.py
@@ -0,0 +1,500 @@
+# -*- coding: utf-8 -*-
+
+from typing import Dict
+
+import torch
+import torch.nn as nn
+
+from .utils import mean_with_lens, repeat_tensor
+
+
+class CaptionModel(nn.Module):
+ """
+ Encoder-decoder captioning model.
+ """
+
+ pad_idx = 0
+ start_idx = 1
+ end_idx = 2
+ max_length = 20
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ super().__init__()
+ self.encoder = encoder
+ self.decoder = decoder
+ self.vocab_size = decoder.vocab_size
+ self.train_forward_keys = ["cap", "cap_len", "ss_ratio"]
+ self.inference_forward_keys = ["sample_method", "max_length", "temp"]
+ freeze_encoder = kwargs.get("freeze_encoder", False)
+ if freeze_encoder:
+ for param in self.encoder.parameters():
+ param.requires_grad = False
+ self.check_decoder_compatibility()
+
+ def check_decoder_compatibility(self):
+ compatible_decoders = [x.__class__.__name__ for x in self.compatible_decoders]
+ assert isinstance(self.decoder, self.compatible_decoders), \
+ f"{self.decoder.__class__.__name__} is incompatible with " \
+ f"{self.__class__.__name__}, please use decoder in {compatible_decoders} "
+
+ @classmethod
+ def set_index(cls, start_idx, end_idx):
+ cls.start_idx = start_idx
+ cls.end_idx = end_idx
+
+ def forward(self, input_dict: Dict):
+ """
+ input_dict: {
+ (required)
+ mode: train/inference,
+ spec,
+ spec_len,
+ fc,
+ attn,
+ attn_len,
+ [sample_method: greedy],
+ [temp: 1.0] (in case of no teacher forcing)
+
+ (optional, mode=train)
+ cap,
+ cap_len,
+ ss_ratio,
+
+ (optional, mode=inference)
+ sample_method: greedy/beam,
+ max_length,
+ temp,
+ beam_size (optional, sample_method=beam),
+ n_best (optional, sample_method=beam),
+ }
+ """
+ # encoder_input_keys = ["spec", "spec_len", "fc", "attn", "attn_len"]
+ # encoder_input = { key: input_dict[key] for key in encoder_input_keys }
+ encoder_output_dict = self.encoder(input_dict)
+ if input_dict["mode"] == "train":
+ forward_dict = {
+ "mode": "train", "sample_method": "greedy", "temp": 1.0
+ }
+ for key in self.train_forward_keys:
+ forward_dict[key] = input_dict[key]
+ forward_dict.update(encoder_output_dict)
+ output = self.train_forward(forward_dict)
+ elif input_dict["mode"] == "inference":
+ forward_dict = {"mode": "inference"}
+ default_args = { "sample_method": "greedy", "max_length": self.max_length, "temp": 1.0 }
+ for key in self.inference_forward_keys:
+ if key in input_dict:
+ forward_dict[key] = input_dict[key]
+ else:
+ forward_dict[key] = default_args[key]
+
+ if forward_dict["sample_method"] == "beam":
+ forward_dict["beam_size"] = input_dict.get("beam_size", 3)
+ forward_dict["n_best"] = input_dict.get("n_best", False)
+ forward_dict["n_best_size"] = input_dict.get("n_best_size", forward_dict["beam_size"])
+ elif forward_dict["sample_method"] == "dbs":
+ forward_dict["beam_size"] = input_dict.get("beam_size", 6)
+ forward_dict["group_size"] = input_dict.get("group_size", 3)
+ forward_dict["diversity_lambda"] = input_dict.get("diversity_lambda", 0.5)
+ forward_dict["group_nbest"] = input_dict.get("group_nbest", True)
+
+ forward_dict.update(encoder_output_dict)
+ output = self.inference_forward(forward_dict)
+ else:
+ raise Exception("mode should be either 'train' or 'inference'")
+
+ return output
+
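+ # Usage sketch (illustrative; keys follow the docstring above, feature tensors
+ # are placeholders):
+ #   output = model({
+ #       "mode": "inference",
+ #       "spec": spec, "spec_len": spec_len,
+ #       "fc": fc_feat, "attn": attn_feat, "attn_len": attn_len,
+ #       "sample_method": "beam", "beam_size": 3,
+ #   })
+ #   output["seq"]  # [N, max_length] decoded word indices
+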
+ def prepare_output(self, input_dict):
+ output = {}
+ batch_size = input_dict["fc_emb"].size(0)
+ if input_dict["mode"] == "train":
+ max_length = input_dict["cap"].size(1) - 1
+ elif input_dict["mode"] == "inference":
+ max_length = input_dict["max_length"]
+ else:
+ raise Exception("mode should be either 'train' or 'inference'")
+ device = input_dict["fc_emb"].device
+ output["seq"] = torch.full((batch_size, max_length), self.end_idx,
+ dtype=torch.long)
+ output["logit"] = torch.empty(batch_size, max_length,
+ self.vocab_size).to(device)
+ output["sampled_logprob"] = torch.zeros(batch_size, max_length)
+ output["embed"] = torch.empty(batch_size, max_length,
+ self.decoder.d_model).to(device)
+ return output
+
+ def train_forward(self, input_dict):
+ if input_dict["ss_ratio"] != 1: # scheduled sampling training
+ input_dict["mode"] = "train"
+ return self.stepwise_forward(input_dict)
+ output = self.seq_forward(input_dict)
+ self.train_process(output, input_dict)
+ return output
+
+ def seq_forward(self, input_dict):
+ raise NotImplementedError
+
+ def train_process(self, output, input_dict):
+ pass
+
+ def inference_forward(self, input_dict):
+ if input_dict["sample_method"] == "beam":
+ return self.beam_search(input_dict)
+ elif input_dict["sample_method"] == "dbs":
+ return self.diverse_beam_search(input_dict)
+ return self.stepwise_forward(input_dict)
+
+ def stepwise_forward(self, input_dict):
+ """Step-by-step decoding"""
+ output = self.prepare_output(input_dict)
+ max_length = output["seq"].size(1)
+ # start sampling
+ for t in range(max_length):
+ input_dict["t"] = t
+ self.decode_step(input_dict, output)
+ if input_dict["mode"] == "inference": # decide whether to stop when sampling
+ unfinished_t = output["seq"][:, t] != self.end_idx
+ if t == 0:
+ unfinished = unfinished_t
+ else:
+ unfinished *= unfinished_t
+ output["seq"][:, t][~unfinished] = self.end_idx
+ if unfinished.sum() == 0:
+ break
+ self.stepwise_process(output)
+ return output
+
+ def decode_step(self, input_dict, output):
+ """Decoding operation of timestep t"""
+ decoder_input = self.prepare_decoder_input(input_dict, output)
+ # feed to the decoder to get logit
+ output_t = self.decoder(decoder_input)
+ logit_t = output_t["logit"]
+ # assert logit_t.ndim == 3
+ if logit_t.size(1) == 1:
+ logit_t = logit_t.squeeze(1)
+ embed_t = output_t["embed"].squeeze(1)
+ elif logit_t.size(1) > 1:
+ logit_t = logit_t[:, -1, :]
+ embed_t = output_t["embed"][:, -1, :]
+ else:
+ raise Exception("no logit output")
+ # sample the next input word and get the corresponding logit
+ sampled = self.sample_next_word(logit_t,
+ method=input_dict["sample_method"],
+ temp=input_dict["temp"])
+
+ output_t.update(sampled)
+ output_t["t"] = input_dict["t"]
+ output_t["logit"] = logit_t
+ output_t["embed"] = embed_t
+ self.stepwise_process_step(output, output_t)
+
+ def prepare_decoder_input(self, input_dict, output):
+ """Prepare the inp ut dict for the decoder"""
+ raise NotImplementedError
+
+ def stepwise_process_step(self, output, output_t):
+ """Postprocessing (save output values) after each timestep t"""
+ t = output_t["t"]
+ output["logit"][:, t, :] = output_t["logit"]
+ output["seq"][:, t] = output_t["word"]
+ output["sampled_logprob"][:, t] = output_t["probs"]
+ output["embed"][:, t, :] = output_t["embed"]
+
+ def stepwise_process(self, output):
+ """Postprocessing after the whole step-by-step autoregressive decoding"""
+ pass
+
+ def sample_next_word(self, logit, method, temp):
+ """Sample the next word, given probs output by the decoder"""
+ logprob = torch.log_softmax(logit, dim=1)
+ if method == "greedy":
+ sampled_logprob, word = torch.max(logprob.detach(), 1)
+ elif method == "gumbel":
+ def sample_gumbel(shape, eps=1e-20):
+ U = torch.rand(shape).to(logprob.device)
+ return -torch.log(-torch.log(U + eps) + eps)
+ def gumbel_softmax_sample(logit, temperature):
+ y = logit + sample_gumbel(logit.size())
+ return torch.log_softmax(y / temperature, dim=-1)
+ _logprob = gumbel_softmax_sample(logprob, temp)
+ _, word = torch.max(_logprob.data, 1)
+ sampled_logprob = logprob.gather(1, word.unsqueeze(-1))
+ else:
+ logprob = logprob / temp
+ if method.startswith("top"):
+ top_num = float(method[3:])
+ if 0 < top_num < 1: # top-p sampling
+ probs = torch.softmax(logit, dim=1)
+ sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=1)
+ _cumsum = sorted_probs.cumsum(1)
+ mask = _cumsum < top_num
+ mask = torch.cat([torch.ones_like(mask[:,:1]), mask[:,:-1]], 1)
+ sorted_probs = sorted_probs * mask.to(sorted_probs)
+ sorted_probs = sorted_probs / sorted_probs.sum(1, keepdim=True)
+ logprob.scatter_(1, sorted_indices, sorted_probs.log())
+ else: # top-k sampling
+ k = int(top_num)
+ tmp = torch.empty_like(logprob).fill_(float('-inf'))
+ topk, indices = torch.topk(logprob, k, dim=1)
+ tmp = tmp.scatter(1, indices, topk)
+ logprob = tmp
+ word = torch.distributions.Categorical(logits=logprob.detach()).sample()
+ sampled_logprob = logprob.gather(1, word.unsqueeze(-1)).squeeze(1)
+ word = word.detach().long()
+ # sampled_logprob: [N,], word: [N,]
+ return {"word": word, "probs": sampled_logprob}
+
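+ # Note (illustrative): sample_method="top0.9" triggers nucleus (top-p) sampling
+ # with p=0.9, while sample_method="top5" keeps only the 5 most likely words
+ # (top-k) before sampling from the renormalised distribution; "greedy" and
+ # "gumbel" are handled explicitly above.
+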
+ def beam_search(self, input_dict):
+ output = self.prepare_output(input_dict)
+ max_length = input_dict["max_length"]
+ beam_size = input_dict["beam_size"]
+ if input_dict["n_best"]:
+ n_best_size = input_dict["n_best_size"]
+ batch_size, max_length = output["seq"].size()
+ output["seq"] = torch.full((batch_size, n_best_size, max_length),
+ self.end_idx, dtype=torch.long)
+
+ temp = input_dict["temp"]
+ # instance by instance beam search
+ for i in range(output["seq"].size(0)):
+ output_i = self.prepare_beamsearch_output(input_dict)
+ input_dict["sample_idx"] = i
+ for t in range(max_length):
+ input_dict["t"] = t
+ output_t = self.beamsearch_step(input_dict, output_i)
+ #######################################
+ # merge with previous beam and select the current max prob beam
+ #######################################
+ logit_t = output_t["logit"]
+ if logit_t.size(1) == 1:
+ logit_t = logit_t.squeeze(1)
+ elif logit_t.size(1) > 1:
+ logit_t = logit_t[:, -1, :]
+ else:
+ raise Exception("no logit output")
+ logprob_t = torch.log_softmax(logit_t, dim=1)
+ logprob_t = torch.log_softmax(logprob_t / temp, dim=1)
+ logprob_t = output_i["topk_logprob"].unsqueeze(1) + logprob_t
+ if t == 0: # for the first step, all k seq will have the same probs
+ topk_logprob, topk_words = logprob_t[0].topk(
+ beam_size, 0, True, True)
+ else: # unroll and find top logprob, and their unrolled indices
+ topk_logprob, topk_words = logprob_t.view(-1).topk(
+ beam_size, 0, True, True)
+ topk_words = topk_words.cpu()
+ output_i["topk_logprob"] = topk_logprob
+ # output_i["prev_words_beam"] = topk_words // self.vocab_size # [beam_size,]
+ output_i["prev_words_beam"] = torch.div(topk_words, self.vocab_size,
+ rounding_mode='trunc')
+ output_i["next_word"] = topk_words % self.vocab_size # [beam_size,]
+ if t == 0:
+ output_i["seq"] = output_i["next_word"].unsqueeze(1)
+ else:
+ output_i["seq"] = torch.cat([
+ output_i["seq"][output_i["prev_words_beam"]],
+ output_i["next_word"].unsqueeze(1)], dim=1)
+
+ # add finished beams to results
+ is_end = output_i["next_word"] == self.end_idx
+ if t == max_length - 1:
+ is_end.fill_(1)
+
+ for beam_idx in range(beam_size):
+ if is_end[beam_idx]:
+ final_beam = {
+ "seq": output_i["seq"][beam_idx].clone(),
+ "score": output_i["topk_logprob"][beam_idx].item()
+ }
+ final_beam["score"] = final_beam["score"] / (t + 1)
+ output_i["done_beams"].append(final_beam)
+ output_i["topk_logprob"][is_end] -= 1000
+
+ self.beamsearch_process_step(output_i, output_t)
+
+ self.beamsearch_process(output, output_i, input_dict)
+ return output
+
+ def prepare_beamsearch_output(self, input_dict):
+ beam_size = input_dict["beam_size"]
+ device = input_dict["fc_emb"].device
+ output = {
+ "topk_logprob": torch.zeros(beam_size).to(device),
+ "seq": None,
+ "prev_words_beam": None,
+ "next_word": None,
+ "done_beams": [],
+ }
+ return output
+
+ def beamsearch_step(self, input_dict, output_i):
+ decoder_input = self.prepare_beamsearch_decoder_input(input_dict, output_i)
+ output_t = self.decoder(decoder_input)
+ output_t["t"] = input_dict["t"]
+ return output_t
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ raise NotImplementedError
+
+ def beamsearch_process_step(self, output_i, output_t):
+ pass
+
+ def beamsearch_process(self, output, output_i, input_dict):
+ i = input_dict["sample_idx"]
+ done_beams = sorted(output_i["done_beams"], key=lambda x: -x["score"])
+ if input_dict["n_best"]:
+ done_beams = done_beams[:input_dict["n_best_size"]]
+ for out_idx, done_beam in enumerate(done_beams):
+ seq = done_beam["seq"]
+ output["seq"][i][out_idx, :len(seq)] = seq
+ else:
+ seq = done_beams[0]["seq"]
+ output["seq"][i][:len(seq)] = seq
+
+ def diverse_beam_search(self, input_dict):
+
+ def add_diversity(seq_table, logprob, t, divm, diversity_lambda, bdash):
+ local_time = t - divm
+ unaug_logprob = logprob.clone()
+
+ if divm > 0:
+ change = torch.zeros(logprob.size(-1))
+ for prev_choice in range(divm):
+ prev_decisions = seq_table[prev_choice][..., local_time]
+ for prev_labels in range(bdash):
+ change.scatter_add_(0, prev_decisions[prev_labels], change.new_ones(1))
+
+ change = change.to(logprob.device)
+ logprob = logprob - repeat_tensor(change, bdash) * diversity_lambda
+
+ return logprob, unaug_logprob
+
+ output = self.prepare_output(input_dict)
+ group_size = input_dict["group_size"]
+ batch_size = output["seq"].size(0)
+ beam_size = input_dict["beam_size"]
+ bdash = beam_size // group_size
+ input_dict["bdash"] = bdash
+ diversity_lambda = input_dict["diversity_lambda"]
+ device = input_dict["fc_emb"].device
+ max_length = input_dict["max_length"]
+ temp = input_dict["temp"]
+ group_nbest = input_dict["group_nbest"]
+ batch_size, max_length = output["seq"].size()
+ if group_nbest:
+ output["seq"] = torch.full((batch_size, beam_size, max_length),
+ self.end_idx, dtype=torch.long)
+ else:
+ output["seq"] = torch.full((batch_size, group_size, max_length),
+ self.end_idx, dtype=torch.long)
+
+
+ for i in range(batch_size):
+ input_dict["sample_idx"] = i
+ seq_table = [torch.LongTensor(bdash, 0) for _ in range(group_size)] # group_size x [bdash, 0]
+ logprob_table = [torch.zeros(bdash).to(device) for _ in range(group_size)]
+ done_beams_table = [[] for _ in range(group_size)]
+
+ output_i = {
+ "prev_words_beam": [None for _ in range(group_size)],
+ "next_word": [None for _ in range(group_size)],
+ "state": [None for _ in range(group_size)]
+ }
+
+ for t in range(max_length + group_size - 1):
+ input_dict["t"] = t
+ for divm in range(group_size):
+ input_dict["divm"] = divm
+ if t >= divm and t <= max_length + divm - 1:
+ local_time = t - divm
+ decoder_input = self.prepare_dbs_decoder_input(input_dict, output_i)
+ output_t = self.decoder(decoder_input)
+ output_t["divm"] = divm
+ logit_t = output_t["logit"]
+ if logit_t.size(1) == 1:
+ logit_t = logit_t.squeeze(1)
+ elif logit_t.size(1) > 1:
+ logit_t = logit_t[:, -1, :]
+ else:
+ raise Exception("no logit output")
+ logprob_t = torch.log_softmax(logit_t, dim=1)
+ logprob_t = torch.log_softmax(logprob_t / temp, dim=1)
+ logprob_t, unaug_logprob_t = add_diversity(seq_table, logprob_t, t, divm, diversity_lambda, bdash)
+ logprob_t = logprob_table[divm].unsqueeze(-1) + logprob_t
+ if local_time == 0: # for the first step, all k seq will have the same probs
+ topk_logprob, topk_words = logprob_t[0].topk(
+ bdash, 0, True, True)
+ else: # unroll and find top logprob, and their unrolled indices
+ topk_logprob, topk_words = logprob_t.view(-1).topk(
+ bdash, 0, True, True)
+ topk_words = topk_words.cpu()
+ logprob_table[divm] = topk_logprob
+ output_i["prev_words_beam"][divm] = topk_words // self.vocab_size # [bdash,]
+ output_i["next_word"][divm] = topk_words % self.vocab_size # [bdash,]
+ if local_time > 0:
+ seq_table[divm] = seq_table[divm][output_i["prev_words_beam"][divm]]
+ seq_table[divm] = torch.cat([
+ seq_table[divm],
+ output_i["next_word"][divm].unsqueeze(-1)], -1)
+
+ is_end = seq_table[divm][:, t-divm] == self.end_idx
+ assert seq_table[divm].shape[-1] == t - divm + 1
+ if t == max_length + divm - 1:
+ is_end.fill_(1)
+ for beam_idx in range(bdash):
+ if is_end[beam_idx]:
+ final_beam = {
+ "seq": seq_table[divm][beam_idx].clone(),
+ "score": logprob_table[divm][beam_idx].item()
+ }
+ final_beam["score"] = final_beam["score"] / (t - divm + 1)
+ done_beams_table[divm].append(final_beam)
+ logprob_table[divm][is_end] -= 1000
+ self.dbs_process_step(output_i, output_t)
+ done_beams_table = [sorted(done_beams_table[divm], key=lambda x: -x["score"])[:bdash] for divm in range(group_size)]
+ if group_nbest:
+ done_beams = sum(done_beams_table, [])
+ else:
+ done_beams = [group_beam[0] for group_beam in done_beams_table]
+ for _, done_beam in enumerate(done_beams):
+ output["seq"][i, _, :len(done_beam["seq"])] = done_beam["seq"]
+
+ return output
+
+ def prepare_dbs_decoder_input(self, input_dict, output_i):
+ raise NotImplementedError
+
+ def dbs_process_step(self, output_i, output_t):
+ pass
+
+
+class CaptionSequenceModel(nn.Module):
+
+ def __init__(self, model, seq_output_size):
+ super().__init__()
+ self.model = model
+ if model.decoder.d_model != seq_output_size:
+ self.output_transform = nn.Linear(model.decoder.d_model, seq_output_size)
+ else:
+ self.output_transform = lambda x: x
+
+ def forward(self, input_dict):
+ output = self.model(input_dict)
+
+ if input_dict["mode"] == "train":
+ lens = input_dict["cap_len"] - 1
+ # seq_outputs: [N, d_model]
+ elif input_dict["mode"] == "inference":
+ if "sample_method" in input_dict and input_dict["sample_method"] == "beam":
+ return output
+ seq = output["seq"]
+ lens = torch.where(seq == self.model.end_idx, torch.zeros_like(seq), torch.ones_like(seq)).sum(dim=1)
+ else:
+ raise Exception("mode should be either 'train' or 'inference'")
+ seq_output = mean_with_lens(output["embed"], lens)
+ seq_output = self.output_transform(seq_output)
+ output["seq_output"] = seq_output
+ return output
+
diff --git a/audio_to_text/captioning/models/decoder.py b/audio_to_text/captioning/models/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..869eac11349f2321993e84be148aaa651892607f
--- /dev/null
+++ b/audio_to_text/captioning/models/decoder.py
@@ -0,0 +1,746 @@
+# -*- coding: utf-8 -*-
+
+import math
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from .utils import generate_length_mask, init, PositionalEncoding
+
+
+class BaseDecoder(nn.Module):
+ """
+ Take word/audio embeddings and output the next word probs
+ Base decoder, cannot be called directly
+ All decoders should inherit from this class
+ """
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim,
+ attn_emb_dim, dropout=0.2):
+ super().__init__()
+ self.emb_dim = emb_dim
+ self.vocab_size = vocab_size
+ self.fc_emb_dim = fc_emb_dim
+ self.attn_emb_dim = attn_emb_dim
+ self.word_embedding = nn.Embedding(vocab_size, emb_dim)
+ self.in_dropout = nn.Dropout(dropout)
+
+ def forward(self, x):
+ raise NotImplementedError
+
+ def load_word_embedding(self, weight, freeze=True):
+ embedding = np.load(weight)
+ assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
+ assert embedding.shape[1] == self.emb_dim, "embed size mismatch"
+
+ # embeddings = torch.as_tensor(embeddings).float()
+ # self.word_embeddings.weight = nn.Parameter(embeddings)
+ # for para in self.word_embeddings.parameters():
+ # para.requires_grad = tune
+ self.word_embedding = nn.Embedding.from_pretrained(embedding,
+ freeze=freeze)
+
+
+class RnnDecoder(BaseDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout,)
+ self.d_model = d_model
+ self.num_layers = kwargs.get('num_layers', 1)
+ self.bidirectional = kwargs.get('bidirectional', False)
+ self.rnn_type = kwargs.get('rnn_type', "GRU")
+ self.classifier = nn.Linear(
+ self.d_model * (self.bidirectional + 1), vocab_size)
+
+ def forward(self, x):
+ raise NotImplementedError
+
+ def init_hidden(self, bs, device):
+ num_dire = self.bidirectional + 1
+ n_layer = self.num_layers
+ hid_dim = self.d_model
+ if self.rnn_type == "LSTM":
+ return (torch.zeros(num_dire * n_layer, bs, hid_dim).to(device),
+ torch.zeros(num_dire * n_layer, bs, hid_dim).to(device))
+ else:
+ return torch.zeros(num_dire * n_layer, bs, hid_dim).to(device)
+
+
+class RnnFcDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, d_model, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, d_model, **kwargs)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 2,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None)
+ fc_emb = input_dict["fc_emb"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ p_fc_emb = self.fc_proj(fc_emb)
+ # embed: [N, T, embed_size]
+ embed = torch.cat((embed, p_fc_emb), dim=-1)
+
+ out, state = self.model(embed, state)
+ # out: [N, T, hs], states: [num_layers * num_dire, N, hs]
+ logits = self.classifier(out)
+ output = {
+ "state": state,
+ "embeds": out,
+ "logits": logits
+ }
+
+ return output
+
+
+class Seq2SeqAttention(nn.Module):
+
+ def __init__(self, hs_enc, hs_dec, attn_size):
+ """
+ Args:
+ hs_enc: encoder hidden size
+ hs_dec: decoder hidden size
+ attn_size: attention vector size
+ """
+ super(Seq2SeqAttention, self).__init__()
+ self.h2attn = nn.Linear(hs_enc + hs_dec, attn_size)
+ self.v = nn.Parameter(torch.randn(attn_size))
+ self.apply(init)
+
+ def forward(self, h_dec, h_enc, src_lens):
+ """
+ Args:
+ h_dec: decoder hidden (query), [N, hs_dec]
+ h_enc: encoder memory (key/value), [N, src_max_len, hs_enc]
+ src_lens: source (encoder memory) lengths, [N, ]
+ """
+ N = h_enc.size(0)
+ src_max_len = h_enc.size(1)
+ h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec]
+
+ attn_input = torch.cat((h_dec, h_enc), dim=-1)
+ attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size]
+
+ v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size]
+ score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len]
+
+ idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len)
+ mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device)
+
+ score = score.masked_fill(mask == 0, -1e10)
+ weights = torch.softmax(score, dim=-1) # [N, src_max_len]
+ ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc]
+
+ return ctx, weights
+
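+# Usage sketch (illustrative, sizes are hypothetical):
+#   attn = Seq2SeqAttention(hs_enc=512, hs_dec=256, attn_size=256)
+#   ctx, weights = attn(h_dec, h_enc, src_lens)
+#   # h_dec: [N, 256], h_enc: [N, T, 512] -> ctx: [N, 512], weights: [N, T]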
+
+class AttentionProj(nn.Module):
+
+ def __init__(self, hs_enc, hs_dec, embed_dim, attn_size):
+ super().__init__()
+ self.q_proj = nn.Linear(hs_dec, embed_dim)
+ self.kv_proj = nn.Linear(hs_enc, embed_dim)
+ self.h2attn = nn.Linear(embed_dim * 2, attn_size)
+ self.v = nn.Parameter(torch.randn(attn_size))
+ self.apply(init)
+
+ def init(self, m):
+ if isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+ def forward(self, h_dec, h_enc, src_lens):
+ """
+ Args:
+ h_dec: decoder hidden (query), [N, hs_dec]
+ h_enc: encoder memory (key/value), [N, src_max_len, hs_enc]
+ src_lens: source (encoder memory) lengths, [N, ]
+ """
+ h_enc = self.kv_proj(h_enc) # [N, src_max_len, embed_dim]
+ h_dec = self.q_proj(h_dec) # [N, embed_dim]
+ N = h_enc.size(0)
+ src_max_len = h_enc.size(1)
+ h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec]
+
+ attn_input = torch.cat((h_dec, h_enc), dim=-1)
+ attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size]
+
+ v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size]
+ score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len]
+
+ idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len)
+ mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device)
+
+ score = score.masked_fill(mask == 0, -1e10)
+ weights = torch.softmax(score, dim=-1) # [N, src_max_len]
+ ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc]
+
+ return ctx, weights
+
+
+class BahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_fc_emb = self.fc_proj(fc_emb)
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), p_fc_emb.unsqueeze(1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class BahAttnDecoder2(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ add fc, attn, word together to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
+ self.attn_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.apply(partial(init, method="xavier"))
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+ p_attn_emb = self.attn_proj(attn_emb)
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, p_attn_emb, attn_emb_len)
+
+ p_fc_emb = self.fc_proj(fc_emb)
+ rnn_input = embed + c.unsqueeze(1) + p_fc_emb.unsqueeze(1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class ConditionalBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.condition_embedding = nn.Embedding(2, emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ condition = input_dict["condition"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ condition = torch.as_tensor([[1 - c, c] for c in condition]).to(fc_emb.device)
+ condition_emb = torch.matmul(condition, self.condition_embedding.weight)
+ # condition_embs: [N, emb_dim]
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), condition_emb.unsqueeze(1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class StructBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, struct_vocab_size,
+ attn_emb_dim, dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.struct_embedding = nn.Embedding(struct_vocab_size, emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ structure = input_dict["structure"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ struct_emb = self.struct_embedding(structure)
+ # struct_embs: [N, emb_dim]
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), struct_emb.unsqueeze(1)), dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class StyleBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ style = input_dict["style"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), style.unsqueeze(1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class BahAttnDecoder3(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim + attn_emb_dim,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = lambda x: x
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+
+ if word.size(-1) == self.fc_emb_dim: # fc_emb
+ embed = word.unsqueeze(1)
+ elif word.size(-1) == 1: # word
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+ else:
+ raise Exception(f"problem with word input size {word.size()}")
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1)), dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class SpecificityBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim + attn_emb_dim + 1,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = lambda x: x
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ condition = input_dict["condition"] # [N,]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat(
+ (embed, p_ctx.unsqueeze(1), condition.reshape(-1, 1, 1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class TransformerDecoder(BaseDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout=dropout,)
+ self.d_model = emb_dim
+ self.nhead = kwargs.get("nhead", self.d_model // 64)
+ self.nlayers = kwargs.get("nlayers", 2)
+ self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4)
+
+ self.pos_encoder = PositionalEncoding(self.d_model, dropout)
+ layer = nn.TransformerDecoderLayer(d_model=self.d_model,
+ nhead=self.nhead,
+ dim_feedforward=self.dim_feedforward,
+ dropout=dropout)
+ self.model = nn.TransformerDecoder(layer, self.nlayers)
+ self.classifier = nn.Linear(self.d_model, vocab_size)
+ self.attn_proj = nn.Sequential(
+ nn.Linear(self.attn_emb_dim, self.d_model),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.LayerNorm(self.d_model)
+ )
+ # self.attn_proj = lambda x: x
+ self.init_params()
+
+ def init_params(self):
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ def generate_square_subsequent_mask(self, max_length):
+ mask = (torch.triu(torch.ones(max_length, max_length)) == 1).transpose(0, 1)
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
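+        # e.g. for max_length = 3 the mask is
+        #   [[0., -inf, -inf],
+        #    [0.,   0., -inf],
+        #    [0.,   0.,   0.]]
+        # so position t may only attend to positions <= t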
+ return mask
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ cap_padding_mask = input_dict["cap_padding_mask"]
+
+ p_attn_emb = self.attn_proj(attn_emb)
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
+ word = word.to(attn_emb.device)
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
+ embed = self.pos_encoder(embed)
+
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
+ tgt_key_padding_mask=cap_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ output = output.transpose(0, 1)
+ output = {
+ "embed": output,
+ "logit": self.classifier(output),
+ }
+ return output
+
+
+
+
+class EventTransformerDecoder(TransformerDecoder):
+
+ def forward(self, input_dict):
+ word = input_dict["word"] # index of word embeddings
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ cap_padding_mask = input_dict["cap_padding_mask"]
+ event_emb = input_dict["event"] # [N, emb_dim]
+
+ p_attn_emb = self.attn_proj(attn_emb)
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
+ word = word.to(attn_emb.device)
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
+
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
+ embed += event_emb
+ embed = self.pos_encoder(embed)
+
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
+ tgt_key_padding_mask=cap_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ output = output.transpose(0, 1)
+ output = {
+ "embed": output,
+ "logit": self.classifier(output),
+ }
+ return output
+
+
+class KeywordProbTransformerDecoder(TransformerDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, keyword_classes_num, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, **kwargs)
+ self.keyword_proj = nn.Linear(keyword_classes_num, self.d_model)
+ self.word_keyword_norm = nn.LayerNorm(self.d_model)
+
+ def forward(self, input_dict):
+ word = input_dict["word"] # index of word embeddings
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ cap_padding_mask = input_dict["cap_padding_mask"]
+ keyword = input_dict["keyword"] # [N, keyword_classes_num]
+
+ p_attn_emb = self.attn_proj(attn_emb)
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
+ word = word.to(attn_emb.device)
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
+
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
+ embed += self.keyword_proj(keyword)
+ embed = self.word_keyword_norm(embed)
+
+ embed = self.pos_encoder(embed)
+
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
+ tgt_key_padding_mask=cap_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ output = output.transpose(0, 1)
+ output = {
+ "embed": output,
+ "logit": self.classifier(output),
+ }
+ return output
diff --git a/audio_to_text/captioning/models/encoder.py b/audio_to_text/captioning/models/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d6d8e87e0ed07abc04f6e79b0fa08cd102398a0
--- /dev/null
+++ b/audio_to_text/captioning/models/encoder.py
@@ -0,0 +1,686 @@
+# -*- coding: utf-8 -*-
+
+import math
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchaudio import transforms
+from torchlibrosa.augmentation import SpecAugmentation
+
+from .utils import mean_with_lens, max_with_lens, \
+ init, pack_wrapper, generate_length_mask, PositionalEncoding
+
+
+def init_layer(layer):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.xavier_uniform_(layer.weight)
+
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+
+def init_bn(bn):
+ """Initialize a Batchnorm layer. """
+ bn.bias.data.fill_(0.)
+ bn.weight.data.fill_(1.)
+
+
+class BaseEncoder(nn.Module):
+
+ """
+ Encode the given audio into embedding
+ Base encoder class, cannot be called directly
+ All encoders should inherit from this class
+ """
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
+ super(BaseEncoder, self).__init__()
+ self.spec_dim = spec_dim
+ self.fc_feat_dim = fc_feat_dim
+ self.attn_feat_dim = attn_feat_dim
+
+
+ def forward(self, x):
+ #########################
+ # an encoder first encodes audio feature into embedding, obtaining
+ # `encoded`: {
+ # fc_embs: [N, fc_emb_dim],
+ # attn_embs: [N, attn_max_len, attn_emb_dim],
+ # attn_emb_lens: [N,]
+ # }
+ #########################
+ raise NotImplementedError
+
+
+class Block2D(nn.Module):
+
+ def __init__(self, cin, cout, kernel_size=3, padding=1):
+ super().__init__()
+ self.block = nn.Sequential(
+ nn.BatchNorm2d(cin),
+ nn.Conv2d(cin,
+ cout,
+ kernel_size=kernel_size,
+ padding=padding,
+ bias=False),
+ nn.LeakyReLU(inplace=True, negative_slope=0.1))
+
+ def forward(self, x):
+ return self.block(x)
+
+
+class LinearSoftPool(nn.Module):
+ """LinearSoftPool
+ Linear softmax, takes logits and returns a probability, near to the actual maximum value.
+ Taken from the paper:
+ A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
+ https://arxiv.org/abs/1810.09050
+ """
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, time_decision):
+ return (time_decision**2).sum(self.pooldim) / time_decision.sum(
+ self.pooldim)
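+# A minimal usage sketch (illustrative only): given frame-level probabilities
+# `frame_prob` of shape [batch, time, classes],
+#     pool = LinearSoftPool(pooldim=1)
+#     clip_prob = pool(None, frame_prob)  # `logits` is unused; output is [batch, classes]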
+
+
+class MeanPool(nn.Module):
+
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ return torch.mean(decision, dim=self.pooldim)
+
+
+class AttentionPool(nn.Module):
+ """docstring for AttentionPool"""
+ def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
+ super().__init__()
+ self.inputdim = inputdim
+ self.outputdim = outputdim
+ self.pooldim = pooldim
+ self.transform = nn.Linear(inputdim, outputdim)
+ self.activ = nn.Softmax(dim=self.pooldim)
+ self.eps = 1e-7
+
+ def forward(self, logits, decision):
+ # Input is (B, T, D)
+ # B, T, D
+ w = self.activ(torch.clamp(self.transform(logits), -15, 15))
+ detect = (decision * w).sum(
+ self.pooldim) / (w.sum(self.pooldim) + self.eps)
+ # B, T, D
+ return detect
+
+
+class MMPool(nn.Module):
+
+ def __init__(self, dims):
+ super().__init__()
+ self.avgpool = nn.AvgPool2d(dims)
+ self.maxpool = nn.MaxPool2d(dims)
+
+ def forward(self, x):
+ return self.avgpool(x) + self.maxpool(x)
+
+
+def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
+ """parse_poolingfunction
+ A heler function to parse any temporal pooling
+ Pooling is done on dimension 1
+ :param poolingfunction_name:
+ :param **kwargs:
+ """
+ poolingfunction_name = poolingfunction_name.lower()
+ if poolingfunction_name == 'mean':
+ return MeanPool(pooldim=1)
+ elif poolingfunction_name == 'linear':
+ return LinearSoftPool(pooldim=1)
+    elif poolingfunction_name == 'attention':
+        return AttentionPool(inputdim=kwargs['inputdim'],
+                             outputdim=kwargs['outputdim'])
+    else:
+        raise ValueError(
+            f"pooling function {poolingfunction_name} not supported")
+
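+# e.g. parse_poolingfunction("attention", inputdim=256, outputdim=10) builds an
+# AttentionPool, while "mean" and "linear" ignore the extra keyword arguments
+# (illustrative values)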
+
+def embedding_pooling(x, lens, pooling="mean"):
+ if pooling == "max":
+ fc_embs = max_with_lens(x, lens)
+ elif pooling == "mean":
+ fc_embs = mean_with_lens(x, lens)
+ elif pooling == "mean+max":
+ x_mean = mean_with_lens(x, lens)
+ x_max = max_with_lens(x, lens)
+ fc_embs = x_mean + x_max
+ elif pooling == "last":
+ indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
+ # indices: [N, 1, hidden]
+ fc_embs = torch.gather(x, 1, indices).squeeze(1)
+ else:
+ raise Exception(f"pooling method {pooling} not support")
+ return fc_embs
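+# Illustrative example (assumed shapes): for x of shape [N, T, E] and valid
+# lengths lens of shape [N,],
+#     fc = embedding_pooling(torch.randn(4, 100, 256),
+#                            torch.tensor([100, 80, 60, 20]), "mean+max")
+# returns a clip-level embedding of shape [4, 256].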
+
+
+class Cdur5Encoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.pooling = pooling
+ self.features = nn.Sequential(
+ Block2D(1, 32),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(32, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(128, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (1, 4)),
+ nn.Dropout(0.3),
+ )
+ with torch.no_grad():
+ rnn_input_dim = self.features(
+ torch.randn(1, 1, 500, spec_dim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+ self.gru = nn.GRU(rnn_input_dim,
+ 128,
+ bidirectional=True,
+ batch_first=True)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["spec"]
+ lens = input_dict["spec_len"]
+ if "upsample" not in input_dict:
+ input_dict["upsample"] = False
+ lens = torch.as_tensor(copy.deepcopy(lens))
+ N, T, _ = x.shape
+ x = x.unsqueeze(1)
+ x = self.features(x)
+ x = x.transpose(1, 2).contiguous().flatten(-2)
+ x, _ = self.gru(x)
+ if input_dict["upsample"]:
+ x = nn.functional.interpolate(
+ x.transpose(1, 2),
+ T,
+ mode='linear',
+ align_corners=False).transpose(1, 2)
+ else:
+ lens //= 4
+ attn_emb = x
+ fc_emb = embedding_pooling(x, lens, self.pooling)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+def conv_conv_block(in_channel, out_channel):
+ return nn.Sequential(
+ nn.Conv2d(in_channel,
+ out_channel,
+ kernel_size=3,
+ bias=False,
+ padding=1),
+ nn.BatchNorm2d(out_channel),
+ nn.ReLU(True),
+ nn.Conv2d(out_channel,
+ out_channel,
+ kernel_size=3,
+ bias=False,
+ padding=1),
+ nn.BatchNorm2d(out_channel),
+ nn.ReLU(True)
+ )
+
+
+class Cdur8Encoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.pooling = pooling
+ self.features = nn.Sequential(
+ conv_conv_block(1, 64),
+ MMPool((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(64, 128),
+ MMPool((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(128, 256),
+ MMPool((1, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(256, 512),
+ MMPool((1, 2)),
+ nn.Dropout(0.2, True),
+ nn.AdaptiveAvgPool2d((None, 1)),
+ )
+ self.init_bn = nn.BatchNorm2d(spec_dim)
+ self.embedding = nn.Linear(512, 512)
+ self.gru = nn.GRU(512, 256, bidirectional=True, batch_first=True)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["spec"]
+ lens = input_dict["spec_len"]
+ lens = torch.as_tensor(copy.deepcopy(lens))
+ x = x.unsqueeze(1) # B x 1 x T x D
+ x = x.transpose(1, 3)
+ x = self.init_bn(x)
+ x = x.transpose(1, 3)
+ x = self.features(x)
+ x = x.transpose(1, 2).contiguous().flatten(-2)
+ x = F.dropout(x, p=0.5, training=self.training)
+ x = F.relu_(self.embedding(x))
+ x, _ = self.gru(x)
+ attn_emb = x
+ lens //= 4
+ fc_emb = embedding_pooling(x, lens, self.pooling)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+class Cnn10Encoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.features = nn.Sequential(
+ conv_conv_block(1, 64),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(64, 128),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(128, 256),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(256, 512),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ nn.AdaptiveAvgPool2d((None, 1)),
+ )
+ self.init_bn = nn.BatchNorm2d(spec_dim)
+ self.embedding = nn.Linear(512, 512)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["spec"]
+ lens = input_dict["spec_len"]
+ lens = torch.as_tensor(copy.deepcopy(lens))
+ x = x.unsqueeze(1) # [N, 1, T, D]
+ x = x.transpose(1, 3)
+ x = self.init_bn(x)
+ x = x.transpose(1, 3)
+ x = self.features(x) # [N, 512, T/16, 1]
+ x = x.transpose(1, 2).contiguous().flatten(-2) # [N, T/16, 512]
+ attn_emb = x
+ lens //= 16
+ fc_emb = embedding_pooling(x, lens, "mean+max")
+ fc_emb = F.dropout(fc_emb, p=0.5, training=self.training)
+ fc_emb = self.embedding(fc_emb)
+ fc_emb = F.relu_(fc_emb)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+class ConvBlock(nn.Module):
+ def __init__(self, in_channels, out_channels):
+
+ super(ConvBlock, self).__init__()
+
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.conv2 = nn.Conv2d(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.bn1 = nn.BatchNorm2d(out_channels)
+ self.bn2 = nn.BatchNorm2d(out_channels)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.conv1)
+ init_layer(self.conv2)
+ init_bn(self.bn1)
+ init_bn(self.bn2)
+
+
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+
+ x = input
+ x = F.relu_(self.bn1(self.conv1(x)))
+ x = F.relu_(self.bn2(self.conv2(x)))
+ if pool_type == 'max':
+ x = F.max_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg':
+ x = F.avg_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg+max':
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
+ x = x1 + x2
+        else:
+            raise Exception(f"Incorrect pool_type argument: {pool_type}")
+
+ return x
+
+
+class Cnn14Encoder(nn.Module):
+ def __init__(self, sample_rate=32000):
+ super().__init__()
+ sr_to_fmax = {
+ 32000: 14000,
+ 16000: 8000
+ }
+ # Logmel spectrogram extractor
+ self.melspec_extractor = transforms.MelSpectrogram(
+ sample_rate=sample_rate,
+ n_fft=32 * sample_rate // 1000,
+ win_length=32 * sample_rate // 1000,
+ hop_length=10 * sample_rate // 1000,
+ f_min=50,
+ f_max=sr_to_fmax[sample_rate],
+ n_mels=64,
+ norm="slaney",
+ mel_scale="slaney"
+ )
+ self.hop_length = 10 * sample_rate // 1000
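+        # 32 ms window and 10 ms hop, e.g. n_fft = win_length = 1024 and
+        # hop_length = 320 at the default 32 kHz sample rate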
+ self.db_transform = transforms.AmplitudeToDB()
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64,
+ time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+ self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+ self.downsample_ratio = 32
+
+ self.fc1 = nn.Linear(2048, 2048, bias=True)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_bn(self.bn0)
+ init_layer(self.fc1)
+
+ def load_pretrained(self, pretrained):
+ checkpoint = torch.load(pretrained, map_location="cpu")
+
+ if "model" in checkpoint:
+ state_keys = checkpoint["model"].keys()
+ backbone = False
+ for key in state_keys:
+ if key.startswith("backbone."):
+ backbone = True
+ break
+
+ if backbone: # COLA
+ state_dict = {}
+ for key, value in checkpoint["model"].items():
+ if key.startswith("backbone."):
+ model_key = key.replace("backbone.", "")
+ state_dict[model_key] = value
+ else: # PANNs
+ state_dict = checkpoint["model"]
+ elif "state_dict" in checkpoint: # CLAP
+ state_dict = checkpoint["state_dict"]
+ state_dict_keys = list(filter(
+ lambda x: "audio_encoder" in x, state_dict.keys()))
+ state_dict = {
+ key.replace('audio_encoder.', ''): state_dict[key]
+ for key in state_dict_keys
+ }
+ else:
+ raise Exception("Unkown checkpoint format")
+
+ model_dict = self.state_dict()
+ pretrained_dict = {
+ k: v for k, v in state_dict.items() if (k in model_dict) and (
+ model_dict[k].shape == v.shape)
+ }
+ model_dict.update(pretrained_dict)
+ self.load_state_dict(model_dict, strict=True)
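+    # Usage sketch (the path below is a placeholder): after `encoder = Cnn14Encoder()`,
+    # `encoder.load_pretrained("path/to/cnn14.pth")` copies only the weights whose
+    # names and shapes match this model and leaves everything else untouched.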
+
+ def forward(self, input_dict):
+ """
+ Input: (batch_size, n_samples)"""
+ waveform = input_dict["wav"]
+ wave_length = input_dict["wav_len"]
+ specaug = input_dict["specaug"]
+ x = self.melspec_extractor(waveform)
+ x = self.db_transform(x) # (batch_size, mel_bins, time_steps)
+ x = x.transpose(1, 2)
+ x = x.unsqueeze(1) # (batch_size, 1, time_steps, mel_bins)
+
+ # SpecAugment
+ if self.training and specaug:
+ x = self.spec_augmenter(x)
+
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = torch.mean(x, dim=3)
+ attn_emb = x.transpose(1, 2)
+
+ wave_length = torch.as_tensor(wave_length)
+ feat_length = torch.div(wave_length, self.hop_length,
+ rounding_mode="floor") + 1
+ feat_length = torch.div(feat_length, self.downsample_ratio,
+ rounding_mode="floor")
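+        # e.g. a 10 s clip at 32 kHz (320000 samples, hop_length 320) yields
+        # 320000 // 320 + 1 = 1001 frames and 1001 // 32 = 31 valid attention steps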
+ x_max = max_with_lens(attn_emb, feat_length)
+ x_mean = mean_with_lens(attn_emb, feat_length)
+ x = x_max + x_mean
+ x = F.dropout(x, p=0.5, training=self.training)
+ x = F.relu_(self.fc1(x))
+ fc_emb = F.dropout(x, p=0.5, training=self.training)
+
+ output_dict = {
+ 'fc_emb': fc_emb,
+ 'attn_emb': attn_emb,
+ 'attn_emb_len': feat_length
+ }
+
+ return output_dict
+
+
+class RnnEncoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim,
+ pooling="mean", **kwargs):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.pooling = pooling
+ self.hidden_size = kwargs.get('hidden_size', 512)
+ self.bidirectional = kwargs.get('bidirectional', False)
+ self.num_layers = kwargs.get('num_layers', 1)
+ self.dropout = kwargs.get('dropout', 0.2)
+ self.rnn_type = kwargs.get('rnn_type', "GRU")
+ self.in_bn = kwargs.get('in_bn', False)
+ self.embed_dim = self.hidden_size * (self.bidirectional + 1)
+ self.network = getattr(nn, self.rnn_type)(
+ attn_feat_dim,
+ self.hidden_size,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional,
+ dropout=self.dropout,
+ batch_first=True)
+ if self.in_bn:
+ self.bn = nn.BatchNorm1d(self.embed_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["attn"]
+ lens = input_dict["attn_len"]
+ lens = torch.as_tensor(lens)
+ # x: [N, T, E]
+ if self.in_bn:
+ x = pack_wrapper(self.bn, x, lens)
+ out = pack_wrapper(self.network, x, lens)
+ # out: [N, T, hidden]
+ attn_emb = out
+ fc_emb = embedding_pooling(out, lens, self.pooling)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+class Cnn14RnnEncoder(nn.Module):
+ def __init__(self, sample_rate=32000, pretrained=None,
+ freeze_cnn=False, freeze_cnn_bn=False,
+ pooling="mean", **kwargs):
+ super().__init__()
+ self.cnn = Cnn14Encoder(sample_rate)
+ self.rnn = RnnEncoder(64, 2048, 2048, pooling, **kwargs)
+ if pretrained is not None:
+ self.cnn.load_pretrained(pretrained)
+ if freeze_cnn:
+ assert pretrained is not None, "cnn is not pretrained but frozen"
+ for param in self.cnn.parameters():
+ param.requires_grad = False
+ self.freeze_cnn_bn = freeze_cnn_bn
+
+ def train(self, mode):
+ super().train(mode=mode)
+ if self.freeze_cnn_bn:
+ def bn_eval(module):
+ class_name = module.__class__.__name__
+ if class_name.find("BatchNorm") != -1:
+ module.eval()
+ self.cnn.apply(bn_eval)
+ return self
+
+ def forward(self, input_dict):
+ output_dict = self.cnn(input_dict)
+ output_dict["attn"] = output_dict["attn_emb"]
+ output_dict["attn_len"] = output_dict["attn_emb_len"]
+ del output_dict["attn_emb"], output_dict["attn_emb_len"]
+ output_dict = self.rnn(output_dict)
+ return output_dict
+
+
+class TransformerEncoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, d_model, **kwargs):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.d_model = d_model
+ dropout = kwargs.get("dropout", 0.2)
+ self.nhead = kwargs.get("nhead", self.d_model // 64)
+ self.nlayers = kwargs.get("nlayers", 2)
+ self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4)
+
+ self.attn_proj = nn.Sequential(
+ nn.Linear(attn_feat_dim, self.d_model),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.LayerNorm(self.d_model)
+ )
+ layer = nn.TransformerEncoderLayer(d_model=self.d_model,
+ nhead=self.nhead,
+ dim_feedforward=self.dim_feedforward,
+ dropout=dropout)
+ self.model = nn.TransformerEncoder(layer, self.nlayers)
+ self.cls_token = nn.Parameter(torch.zeros(d_model))
+ self.init_params()
+
+ def init_params(self):
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ def forward(self, input_dict):
+ attn_feat = input_dict["attn"]
+ attn_feat_len = input_dict["attn_len"]
+ attn_feat_len = torch.as_tensor(attn_feat_len)
+
+ attn_feat = self.attn_proj(attn_feat) # [bs, T, d_model]
+
+ cls_emb = self.cls_token.reshape(1, 1, self.d_model).repeat(
+ attn_feat.size(0), 1, 1)
+ attn_feat = torch.cat((cls_emb, attn_feat), dim=1)
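+        # a learnable [CLS]-style token is prepended to every sequence; its output
+        # state is later used as the clip-level embedding (fc_emb)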
+ attn_feat = attn_feat.transpose(0, 1)
+
+ attn_feat_len += 1
+ src_key_padding_mask = ~generate_length_mask(
+ attn_feat_len, attn_feat.size(0)).to(attn_feat.device)
+ output = self.model(attn_feat, src_key_padding_mask=src_key_padding_mask)
+
+ attn_emb = output.transpose(0, 1)
+ fc_emb = attn_emb[:, 0]
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": attn_feat_len
+ }
+
+
+class Cnn14TransformerEncoder(nn.Module):
+ def __init__(self, sample_rate=32000, pretrained=None,
+ freeze_cnn=False, freeze_cnn_bn=False,
+                 d_model=512, **kwargs):
+ super().__init__()
+ self.cnn = Cnn14Encoder(sample_rate)
+ self.trm = TransformerEncoder(64, 2048, 2048, d_model, **kwargs)
+ if pretrained is not None:
+ self.cnn.load_pretrained(pretrained)
+ if freeze_cnn:
+ assert pretrained is not None, "cnn is not pretrained but frozen"
+ for param in self.cnn.parameters():
+ param.requires_grad = False
+ self.freeze_cnn_bn = freeze_cnn_bn
+
+ def train(self, mode):
+ super().train(mode=mode)
+ if self.freeze_cnn_bn:
+ def bn_eval(module):
+ class_name = module.__class__.__name__
+ if class_name.find("BatchNorm") != -1:
+ module.eval()
+ self.cnn.apply(bn_eval)
+ return self
+
+ def forward(self, input_dict):
+ output_dict = self.cnn(input_dict)
+ output_dict["attn"] = output_dict["attn_emb"]
+ output_dict["attn_len"] = output_dict["attn_emb_len"]
+ del output_dict["attn_emb"], output_dict["attn_emb_len"]
+ output_dict = self.trm(output_dict)
+ return output_dict
+
+
+
+
+
diff --git a/audio_to_text/captioning/models/transformer_model.py b/audio_to_text/captioning/models/transformer_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..76c97f171955f04b10c16fd1f1a205ce7343a0ac
--- /dev/null
+++ b/audio_to_text/captioning/models/transformer_model.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+import random
+import torch
+import torch.nn as nn
+
+from .base_model import CaptionModel
+from .utils import repeat_tensor
+import audio_to_text.captioning.models.decoder
+import audio_to_text.captioning.models.encoder
+
+
+class TransformerModel(CaptionModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+ audio_to_text.captioning.models.decoder.TransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+
+ def seq_forward(self, input_dict):
+ cap = input_dict["cap"]
+ cap_padding_mask = (cap == self.pad_idx).to(cap.device)
+ cap_padding_mask = cap_padding_mask[:, :-1]
+ output = self.decoder(
+ {
+ "word": cap[:, :-1],
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_len": input_dict["attn_emb_len"],
+ "cap_padding_mask": cap_padding_mask
+ }
+ )
+ return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = {
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_len": input_dict["attn_emb_len"]
+ }
+ t = input_dict["t"]
+
+ ###############
+ # determine input word
+ ################
+ if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling
+ word = input_dict["cap"][:, :t+1]
+ else:
+ start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output["seq"][:, :t]), dim=-1)
+ # word: [N, T]
+ decoder_input["word"] = word
+
+ cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device)
+ decoder_input["cap_padding_mask"] = cap_padding_mask
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = {}
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ ###############
+ # prepare attn embeds
+ ################
+ if t == 0:
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size)
+ attn_emb_len = repeat_tensor(input_dict["attn_emb_len"][i], beam_size)
+ output_i["attn_emb"] = attn_emb
+ output_i["attn_emb_len"] = attn_emb_len
+ decoder_input["attn_emb"] = output_i["attn_emb"]
+ decoder_input["attn_emb_len"] = output_i["attn_emb_len"]
+ ###############
+ # determine input word
+ ################
+ start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output_i["seq"]), dim=-1)
+ decoder_input["word"] = word
+ cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device)
+ decoder_input["cap_padding_mask"] = cap_padding_mask
+
+ return decoder_input
+
+
+class M2TransformerModel(CaptionModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+                audio_to_text.captioning.models.decoder.M2TransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+ self.check_encoder_compatibility()
+
+ def check_encoder_compatibility(self):
+        assert isinstance(self.encoder, audio_to_text.captioning.models.encoder.M2TransformerEncoder), \
+            f"only M2TransformerEncoder is compatible with {self.__class__.__name__}"
+
+
+ def seq_forward(self, input_dict):
+ cap = input_dict["cap"]
+ output = self.decoder(
+ {
+ "word": cap[:, :-1],
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_mask": input_dict["attn_emb_mask"],
+ }
+ )
+ return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = {
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_mask": input_dict["attn_emb_mask"]
+ }
+ t = input_dict["t"]
+
+ ###############
+ # determine input word
+ ################
+ if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling
+ word = input_dict["cap"][:, :t+1]
+ else:
+ start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output["seq"][:, :t]), dim=-1)
+ # word: [N, T]
+ decoder_input["word"] = word
+
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = {}
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ ###############
+ # prepare attn embeds
+ ################
+ if t == 0:
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size)
+ attn_emb_mask = repeat_tensor(input_dict["attn_emb_mask"][i], beam_size)
+ output_i["attn_emb"] = attn_emb
+ output_i["attn_emb_mask"] = attn_emb_mask
+ decoder_input["attn_emb"] = output_i["attn_emb"]
+ decoder_input["attn_emb_mask"] = output_i["attn_emb_mask"]
+ ###############
+ # determine input word
+ ################
+ start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output_i["seq"]), dim=-1)
+ decoder_input["word"] = word
+
+ return decoder_input
+
+
+class EventEncoder(nn.Module):
+ """
+ Encode the Label information in AudioCaps and AudioSet
+ """
+ def __init__(self, emb_dim, vocab_size=527):
+ super(EventEncoder, self).__init__()
+ self.label_embedding = nn.Parameter(
+ torch.randn((vocab_size, emb_dim)), requires_grad=True)
+
+ def forward(self, word_idxs):
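+        # word_idxs: multi-hot (or probability) event labels of shape [N, vocab_size];
+        # normalize them into weights and take the weighted average of the label
+        # embeddings, giving one embedding of shape [N, emb_dim]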
+        weights = word_idxs / word_idxs.sum(dim=1, keepdim=True)
+        embeddings = weights @ self.label_embedding
+ return embeddings
+
+
+class EventCondTransformerModel(TransformerModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+                audio_to_text.captioning.models.decoder.EventTransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+ self.label_encoder = EventEncoder(decoder.emb_dim, 527)
+ self.train_forward_keys += ["events"]
+ self.inference_forward_keys += ["events"]
+
+ # def seq_forward(self, input_dict):
+ # cap = input_dict["cap"]
+ # cap_padding_mask = (cap == self.pad_idx).to(cap.device)
+ # cap_padding_mask = cap_padding_mask[:, :-1]
+ # output = self.decoder(
+ # {
+ # "word": cap[:, :-1],
+ # "attn_emb": input_dict["attn_emb"],
+ # "attn_emb_len": input_dict["attn_emb_len"],
+ # "cap_padding_mask": cap_padding_mask
+ # }
+ # )
+ # return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = super().prepare_decoder_input(input_dict, output)
+ decoder_input["events"] = self.label_encoder(input_dict["events"])
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i)
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ if t == 0:
+ output_i["events"] = repeat_tensor(self.label_encoder(input_dict["events"])[i], beam_size)
+ decoder_input["events"] = output_i["events"]
+ return decoder_input
+
+
+class KeywordCondTransformerModel(TransformerModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+                audio_to_text.captioning.models.decoder.KeywordProbTransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+ self.train_forward_keys += ["keyword"]
+ self.inference_forward_keys += ["keyword"]
+
+ def seq_forward(self, input_dict):
+ cap = input_dict["cap"]
+ cap_padding_mask = (cap == self.pad_idx).to(cap.device)
+ cap_padding_mask = cap_padding_mask[:, :-1]
+ keyword = input_dict["keyword"]
+ output = self.decoder(
+ {
+ "word": cap[:, :-1],
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_len": input_dict["attn_emb_len"],
+ "keyword": keyword,
+ "cap_padding_mask": cap_padding_mask
+ }
+ )
+ return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = super().prepare_decoder_input(input_dict, output)
+ decoder_input["keyword"] = input_dict["keyword"]
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i)
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ if t == 0:
+ output_i["keyword"] = repeat_tensor(input_dict["keyword"][i],
+ beam_size)
+ decoder_input["keyword"] = output_i["keyword"]
+ return decoder_input
+
diff --git a/audio_to_text/captioning/models/utils.py b/audio_to_text/captioning/models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3623cf43619a7a4ff5fa31f2b056378697b04d61
--- /dev/null
+++ b/audio_to_text/captioning/models/utils.py
@@ -0,0 +1,132 @@
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
+
+
+def sort_pack_padded_sequence(input, lengths):
+ sorted_lengths, indices = torch.sort(lengths, descending=True)
+ tmp = pack_padded_sequence(input[indices], sorted_lengths.cpu(), batch_first=True)
+ inv_ix = indices.clone()
+ inv_ix[indices] = torch.arange(0,len(indices)).type_as(inv_ix)
+ return tmp, inv_ix
+
+def pad_unsort_packed_sequence(input, inv_ix):
+ tmp, _ = pad_packed_sequence(input, batch_first=True)
+ tmp = tmp[inv_ix]
+ return tmp
+
+def pack_wrapper(module, attn_feats, attn_feat_lens):
+ packed, inv_ix = sort_pack_padded_sequence(attn_feats, attn_feat_lens)
+ if isinstance(module, torch.nn.RNNBase):
+ return pad_unsort_packed_sequence(module(packed)[0], inv_ix)
+ else:
+ return pad_unsort_packed_sequence(PackedSequence(module(packed[0]), packed[1]), inv_ix)
+
+def generate_length_mask(lens, max_length=None):
+ lens = torch.as_tensor(lens)
+ N = lens.size(0)
+ if max_length is None:
+ max_length = max(lens)
+ idxs = torch.arange(max_length).repeat(N).view(N, max_length)
+ idxs = idxs.to(lens.device)
+ mask = (idxs < lens.view(-1, 1))
+ return mask
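+# e.g. generate_length_mask(torch.tensor([3, 1]), 4) returns
+#   tensor([[ True,  True,  True, False],
+#           [ True, False, False, False]])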
+
+def mean_with_lens(features, lens):
+ """
+ features: [N, T, ...] (assume the second dimension represents length)
+ lens: [N,]
+ """
+ lens = torch.as_tensor(lens)
+ if max(lens) != features.size(1):
+ max_length = features.size(1)
+ mask = generate_length_mask(lens, max_length)
+ else:
+ mask = generate_length_mask(lens)
+ mask = mask.to(features.device) # [N, T]
+
+ while mask.ndim < features.ndim:
+ mask = mask.unsqueeze(-1)
+ feature_mean = features * mask
+ feature_mean = feature_mean.sum(1)
+ while lens.ndim < feature_mean.ndim:
+ lens = lens.unsqueeze(1)
+ feature_mean = feature_mean / lens.to(features.device)
+ # feature_mean = features * mask.unsqueeze(-1)
+ # feature_mean = feature_mean.sum(1) / lens.unsqueeze(1).to(features.device)
+ return feature_mean
+
+def max_with_lens(features, lens):
+ """
+ features: [N, T, ...] (assume the second dimension represents length)
+ lens: [N,]
+ """
+ lens = torch.as_tensor(lens)
+ mask = generate_length_mask(lens).to(features.device) # [N, T]
+
+ feature_max = features.clone()
+ feature_max[~mask] = float("-inf")
+ feature_max, _ = feature_max.max(1)
+ return feature_max
+
+def repeat_tensor(x, n):
+ return x.unsqueeze(0).repeat(n, *([1] * len(x.shape)))
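+# e.g. repeat_tensor(torch.zeros(5, 7), 3).shape == torch.Size([3, 5, 7])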
+
+def init(m, method="kaiming"):
+ if isinstance(m, (nn.Conv2d, nn.Conv1d)):
+ if method == "kaiming":
+ nn.init.kaiming_uniform_(m.weight)
+ elif method == "xavier":
+ nn.init.xavier_uniform_(m.weight)
+ else:
+ raise Exception(f"initialization method {method} not supported")
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
+ nn.init.constant_(m.weight, 1)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ if method == "kaiming":
+ nn.init.kaiming_uniform_(m.weight)
+ elif method == "xavier":
+ nn.init.xavier_uniform_(m.weight)
+ else:
+ raise Exception(f"initialization method {method} not supported")
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Embedding):
+ if method == "kaiming":
+ nn.init.kaiming_uniform_(m.weight)
+ elif method == "xavier":
+ nn.init.xavier_uniform_(m.weight)
+ else:
+ raise Exception(f"initialization method {method} not supported")
+
+
+
+
+class PositionalEncoding(nn.Module):
+
+ def __init__(self, d_model, dropout=0.1, max_len=100):
+ super(PositionalEncoding, self).__init__()
+ self.dropout = nn.Dropout(p=dropout)
+
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * \
+ (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0).transpose(0, 1)
+ # self.register_buffer("pe", pe)
+ self.register_parameter("pe", nn.Parameter(pe, requires_grad=False))
+
+ def forward(self, x):
+ # x: [T, N, E]
+ x = x + self.pe[:x.size(0), :]
+ return self.dropout(x)
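+# A minimal usage sketch (illustrative): PositionalEncoding(d_model=256) expects
+# inputs of shape [T, N, 256] with T <= max_len (100 by default), e.g.
+#     pos_enc = PositionalEncoding(256)
+#     y = pos_enc(torch.zeros(50, 8, 256))  # adds pe[:50] and applies dropout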
diff --git a/audio_to_text/captioning/utils/README.md b/audio_to_text/captioning/utils/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c6fd17d778a9f9dbe7bf632c92e40e36e67b91d2
--- /dev/null
+++ b/audio_to_text/captioning/utils/README.md
@@ -0,0 +1,19 @@
+# Utils
+
+Scripts in this directory are used as utility functions.
+
+## BERT Pretrained Embeddings
+
+You can use pretrained word embeddings from Google [BERT](https://github.com/google-research/bert#pre-trained-models) instead of training word embeddings from scratch. The scripts in `utils/bert` require a BERT server running in the background; we use the BERT server from [bert-as-service](https://github.com/hanxiao/bert-as-service).
+
+To use bert-as-service, you first need to install the repository. It is recommended to create a separate environment with TensorFlow 1.x to run the BERT server, since bert-as-service is incompatible with TensorFlow 2.x.
+
+After installing [bert-as-service](https://github.com/hanxiao/bert-as-service), download and start the BERT server by executing:
+
+```bash
+bash scripts/prepare_bert_server.sh zh
+```
+
+By default, a server based on the BERT-Base Chinese model runs in the background. You can switch to other models by changing the corresponding model name and path in `scripts/prepare_bert_server.sh`.
+
+To extract BERT word embeddings, you need to execute `utils/bert/create_word_embedding.py`.
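+
+A typical invocation (the arguments are placeholders for your own vocabulary file, output embedding path and BERT server host) might look like:
+
+```bash
+python utils/bert/create_word_embedding.py <vocab_file> <word_embedding.npy> <server_hostname>
+```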
diff --git a/audio_to_text/captioning/utils/__init__.py b/audio_to_text/captioning/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be638756ffa795f33059276b99c2f8c05661cbdf
Binary files /dev/null and b/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc b/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4270c25cf751d703e233146358c7345c39e55ceb
Binary files /dev/null and b/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/utils/bert/create_sent_embedding.py b/audio_to_text/captioning/utils/bert/create_sent_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..b517a32429ca74bae668291dcb03d34296027440
--- /dev/null
+++ b/audio_to_text/captioning/utils/bert/create_sent_embedding.py
@@ -0,0 +1,89 @@
+import pickle
+import fire
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+
+class EmbeddingExtractor(object):
+
+ def extract_sentbert(self, caption_file: str, output: str, dev: bool=True, zh: bool=False):
+ from sentence_transformers import SentenceTransformer
+ lang2model = {
+ "zh": "distiluse-base-multilingual-cased",
+ "en": "bert-base-nli-mean-tokens"
+ }
+ lang = "zh" if zh else "en"
+ model = SentenceTransformer(lang2model[lang])
+
+ self.extract(caption_file, model, output, dev)
+
+ def extract_originbert(self, caption_file: str, output: str, dev: bool=True, ip="localhost"):
+ from bert_serving.client import BertClient
+ client = BertClient(ip)
+
+ self.extract(caption_file, client, output, dev)
+
+ def extract(self, caption_file: str, model, output, dev: bool):
+ caption_df = pd.read_json(caption_file, dtype={"key": str})
+ embeddings = {}
+
+ if dev:
+ with tqdm(total=caption_df.shape[0], ascii=True) as pbar:
+ for idx, row in caption_df.iterrows():
+ caption = row["caption"]
+ key = row["key"]
+ cap_idx = row["caption_index"]
+ embedding = model.encode([caption])
+ embedding = np.array(embedding).reshape(-1)
+ embeddings[f"{key}_{cap_idx}"] = embedding
+ pbar.update()
+
+ else:
+ dump = {}
+
+ with tqdm(total=caption_df.shape[0], ascii=True) as pbar:
+ for idx, row in caption_df.iterrows():
+ key = row["key"]
+ caption = row["caption"]
+ value = np.array(model.encode([caption])).reshape(-1)
+
+ if key not in embeddings.keys():
+ embeddings[key] = [value]
+ else:
+ embeddings[key].append(value)
+
+ pbar.update()
+
+ for key in embeddings:
+ dump[key] = np.stack(embeddings[key])
+
+ embeddings = dump
+
+ with open(output, "wb") as f:
+ pickle.dump(embeddings, f)
+
+ def extract_sbert(self,
+ input_json: str,
+ output: str):
+ from sentence_transformers import SentenceTransformer
+ import json
+ import torch
+ from h5py import File
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+ model = model.to(device)
+ model.eval()
+
+ data = json.load(open(input_json))["audios"]
+ with torch.no_grad(), tqdm(total=len(data), ascii=True) as pbar, File(output, "w") as store:
+ for sample in data:
+ audio_id = sample["audio_id"]
+ for cap in sample["captions"]:
+ cap_id = cap["cap_id"]
+ store[f"{audio_id}_{cap_id}"] = model.encode(cap["caption"])
+ pbar.update()
+
+
+if __name__ == "__main__":
+ fire.Fire(EmbeddingExtractor)
diff --git a/audio_to_text/captioning/utils/bert/create_word_embedding.py b/audio_to_text/captioning/utils/bert/create_word_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..43c980e69057dc251ddbb7ae6a19684807cc6699
--- /dev/null
+++ b/audio_to_text/captioning/utils/bert/create_word_embedding.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+
+from bert_serving.client import BertClient
+import numpy as np
+from tqdm import tqdm
+import fire
+import torch
+
+sys.path.append(os.getcwd())
+from utils.build_vocab import Vocabulary
+
+def main(vocab_file: str, output: str, server_hostname: str):
+ client = BertClient(ip=server_hostname)
+ vocabulary = torch.load(vocab_file)
+ vocab_size = len(vocabulary)
+
+ fake_embedding = client.encode(["test"]).reshape(-1)
+ embed_size = fake_embedding.shape[0]
+
+ print("Encoding words into embeddings with size: ", embed_size)
+
+ embeddings = np.empty((vocab_size, embed_size))
+ for i in tqdm(range(len(embeddings)), ascii=True):
+ embeddings[i] = client.encode([vocabulary.idx2word[i]])
+ np.save(output, embeddings)
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
+
+
diff --git a/audio_to_text/captioning/utils/build_vocab.py b/audio_to_text/captioning/utils/build_vocab.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9fab23bc2c48203e541d356dc172e1fdee8f113
--- /dev/null
+++ b/audio_to_text/captioning/utils/build_vocab.py
@@ -0,0 +1,153 @@
+import json
+from tqdm import tqdm
+import logging
+import pickle
+from collections import Counter
+import re
+import fire
+
+
+class Vocabulary(object):
+ """Simple vocabulary wrapper."""
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = {}
+ self.idx = 0
+
+ def add_word(self, word):
+ if not word in self.word2idx:
+ self.word2idx[word] = self.idx
+ self.idx2word[self.idx] = word
+ self.idx += 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx["<unk>"]
+        return self.word2idx[word]
+
+ def __getitem__(self, word_id):
+ return self.idx2word[word_id]
+
+ def __len__(self):
+ return len(self.word2idx)
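+# Illustrative usage:
+#   vocab = Vocabulary()
+#   vocab.add_word("dog")
+#   vocab("dog")  # -> 0
+#   vocab[0]      # -> "dog"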
+
+
+def build_vocab(input_json: str,
+ threshold: int,
+ keep_punctuation: bool,
+ host_address: str,
+ character_level: bool = False,
+ zh: bool = True ):
+ """Build vocabulary from csv file with a given threshold to drop all counts < threshold
+
+ Args:
+ input_json(string): Preprossessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+ threshold (int): Threshold to drop all words with counts < threshold
+ keep_punctuation (bool): Includes or excludes punctuation.
+
+ Returns:
+ vocab (Vocab): Object with the processed vocabulary
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+ counter = Counter()
+ pretokenized = "tokens" in data[0]["captions"][0]
+
+ if zh:
+ from nltk.parse.corenlp import CoreNLPParser
+ from zhon.hanzi import punctuation
+ if not pretokenized:
+ parser = CoreNLPParser(host_address)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ if pretokenized:
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ else:
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ # Remove all punctuations
+ if not keep_punctuation:
+ caption = re.sub("[{}]".format(punctuation), "", caption)
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens = list(parser.tokenize(caption))
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ counter.update(tokens)
+ else:
+ if pretokenized:
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ counter.update(tokens)
+ else:
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+ captions = {}
+ for audio_idx in range(len(data)):
+ audio_id = data[audio_idx]["audio_id"]
+ captions[audio_id] = []
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ captions[audio_id].append({
+ "audio_id": audio_id,
+ "id": cap_idx,
+ "caption": caption
+ })
+ tokenizer = PTBTokenizer()
+ captions = tokenizer.tokenize(captions)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ audio_id = data[audio_idx]["audio_id"]
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = captions[audio_id][cap_idx]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+ counter.update(tokens.split(" "))
+
+ if not pretokenized:
+ json.dump({ "audios": data }, open(input_json, "w"), indent=4, ensure_ascii=not zh)
+ words = [word for word, cnt in counter.items() if cnt >= threshold]
+
+    # Create a vocab wrapper and add some special tokens
+    # (token names assume the usual <pad>/<start>/<end>/<unk> convention).
+    vocab = Vocabulary()
+    vocab.add_word("<pad>")
+    vocab.add_word("<start>")
+    vocab.add_word("<end>")
+    vocab.add_word("<unk>")
+
+ # Add the words to the vocabulary.
+ for word in words:
+ vocab.add_word(word)
+ return vocab
+
+
+def process(input_json: str,
+ output_file: str,
+ threshold: int = 1,
+ keep_punctuation: bool = False,
+ character_level: bool = False,
+ host_address: str = "http://localhost:9000",
+ zh: bool = False):
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ logging.info("Build Vocab")
+ vocabulary = build_vocab(
+ input_json=input_json, threshold=threshold, keep_punctuation=keep_punctuation,
+ host_address=host_address, character_level=character_level, zh=zh)
+ pickle.dump(vocabulary, open(output_file, "wb"))
+ logging.info("Total vocabulary size: {}".format(len(vocabulary)))
+ logging.info("Saved vocab to '{}'".format(output_file))
+
+
+if __name__ == '__main__':
+ fire.Fire(process)
diff --git a/audio_to_text/captioning/utils/build_vocab_ltp.py b/audio_to_text/captioning/utils/build_vocab_ltp.py
new file mode 100644
index 0000000000000000000000000000000000000000..aae0c718ae546882dcb573be42ace3408394468f
--- /dev/null
+++ b/audio_to_text/captioning/utils/build_vocab_ltp.py
@@ -0,0 +1,150 @@
+import json
+from tqdm import tqdm
+import logging
+import pickle
+from collections import Counter
+import re
+import fire
+
+class Vocabulary(object):
+ """Simple vocabulary wrapper."""
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = {}
+ self.idx = 0
+
+ def add_word(self, word):
+ if not word in self.word2idx:
+ self.word2idx[word] = self.idx
+ self.idx2word[self.idx] = word
+ self.idx += 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx["<unk>"]
+        return self.word2idx[word]
+
+ def __len__(self):
+ return len(self.word2idx)
+
+def build_vocab(input_json: str,
+ output_json: str,
+ threshold: int,
+ keep_punctuation: bool,
+ character_level: bool = False,
+ zh: bool = True ):
+ """Build vocabulary from csv file with a given threshold to drop all counts < threshold
+
+ Args:
+ input_json(string): Preprossessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+ threshold (int): Threshold to drop all words with counts < threshold
+ keep_punctuation (bool): Includes or excludes punctuation.
+
+ Returns:
+ vocab (Vocab): Object with the processed vocabulary
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+ counter = Counter()
+ pretokenized = "tokens" in data[0]["captions"][0]
+
+ if zh:
+ from ltp import LTP
+ from zhon.hanzi import punctuation
+ if not pretokenized:
+ parser = LTP("base")
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ if pretokenized:
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ else:
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens, _ = parser.seg([caption])
+ tokens = tokens[0]
+ # Remove all punctuations
+ if not keep_punctuation:
+ tokens = [token for token in tokens if token not in punctuation]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ counter.update(tokens)
+ else:
+ if pretokenized:
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ counter.update(tokens)
+ else:
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+ captions = {}
+ for audio_idx in range(len(data)):
+ audio_id = data[audio_idx]["audio_id"]
+ captions[audio_id] = []
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ captions[audio_id].append({
+ "audio_id": audio_id,
+ "id": cap_idx,
+ "caption": caption
+ })
+ tokenizer = PTBTokenizer()
+ captions = tokenizer.tokenize(captions)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ audio_id = data[audio_idx]["audio_id"]
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = captions[audio_id][cap_idx]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+ counter.update(tokens.split(" "))
+
+ if not pretokenized:
+ if output_json is None:
+ output_json = input_json
+ json.dump({ "audios": data }, open(output_json, "w"), indent=4, ensure_ascii=not zh)
+ words = [word for word, cnt in counter.items() if cnt >= threshold]
+
+    # Create a vocab wrapper and add some special tokens
+    # (token names assume the usual <pad>/<start>/<end>/<unk> convention).
+    vocab = Vocabulary()
+    vocab.add_word("<pad>")
+    vocab.add_word("<start>")
+    vocab.add_word("<end>")
+    vocab.add_word("<unk>")
+
+ # Add the words to the vocabulary.
+ for word in words:
+ vocab.add_word(word)
+ return vocab
+
+def process(input_json: str,
+ output_file: str,
+ output_json: str = None,
+ threshold: int = 1,
+ keep_punctuation: bool = False,
+ character_level: bool = False,
+ zh: bool = True):
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ logging.info("Build Vocab")
+ vocabulary = build_vocab(
+ input_json=input_json, output_json=output_json, threshold=threshold,
+ keep_punctuation=keep_punctuation, character_level=character_level, zh=zh)
+ pickle.dump(vocabulary, open(output_file, "wb"))
+ logging.info("Total vocabulary size: {}".format(len(vocabulary)))
+ logging.info("Saved vocab to '{}'".format(output_file))
+
+
+if __name__ == '__main__':
+ fire.Fire(process)
diff --git a/audio_to_text/captioning/utils/build_vocab_spacy.py b/audio_to_text/captioning/utils/build_vocab_spacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..84da679f79d9f36b288d7312fb4ad9dc04723b0d
--- /dev/null
+++ b/audio_to_text/captioning/utils/build_vocab_spacy.py
@@ -0,0 +1,152 @@
+import json
+from tqdm import tqdm
+import logging
+import pickle
+from collections import Counter
+import re
+import fire
+
+class Vocabulary(object):
+ """Simple vocabulary wrapper."""
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = {}
+ self.idx = 0
+
+ def add_word(self, word):
+ if not word in self.word2idx:
+ self.word2idx[word] = self.idx
+ self.idx2word[self.idx] = word
+ self.idx += 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx["<unk>"]
+        return self.word2idx[word]
+
+ def __len__(self):
+ return len(self.word2idx)
+
+
+def build_vocab(input_json: str,
+ output_json: str,
+ threshold: int,
+ keep_punctuation: bool,
+ host_address: str,
+ character_level: bool = False,
+ retokenize: bool = True,
+ zh: bool = True ):
+ """Build vocabulary from csv file with a given threshold to drop all counts < threshold
+
+ Args:
+ input_json(string): Preprossessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+ threshold (int): Threshold to drop all words with counts < threshold
+ keep_punctuation (bool): Includes or excludes punctuation.
+
+ Returns:
+ vocab (Vocab): Object with the processed vocabulary
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+ counter = Counter()
+ if retokenize:
+ pretokenized = False
+ else:
+ pretokenized = "tokens" in data[0]["captions"][0]
+
+ if zh:
+ from nltk.parse.corenlp import CoreNLPParser
+ from zhon.hanzi import punctuation
+ if not pretokenized:
+ parser = CoreNLPParser(host_address)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ if pretokenized:
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ else:
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ # Remove all punctuations
+ if not keep_punctuation:
+ caption = re.sub("[{}]".format(punctuation), "", caption)
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens = list(parser.tokenize(caption))
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ counter.update(tokens)
+ else:
+ if pretokenized:
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ counter.update(tokens)
+ else:
+ import spacy
+ tokenizer = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ captions = data[audio_idx]["captions"]
+ for cap_idx in range(len(captions)):
+ caption = captions[cap_idx]["caption"]
+ doc = tokenizer(caption)
+ tokens = " ".join([str(token).lower() for token in doc])
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+ counter.update(tokens.split(" "))
+
+ if not pretokenized:
+ if output_json is None:
+ json.dump({ "audios": data }, open(input_json, "w"),
+ indent=4, ensure_ascii=not zh)
+ else:
+ json.dump({ "audios": data }, open(output_json, "w"),
+ indent=4, ensure_ascii=not zh)
+
+ words = [word for word, cnt in counter.items() if cnt >= threshold]
+
+    # Create a vocab wrapper and add some special tokens
+    # (token names assume the usual <pad>/<start>/<end>/<unk> convention).
+    vocab = Vocabulary()
+    vocab.add_word("<pad>")
+    vocab.add_word("<start>")
+    vocab.add_word("<end>")
+    vocab.add_word("<unk>")
+
+ # Add the words to the vocabulary.
+ for word in words:
+ vocab.add_word(word)
+ return vocab
+
+def process(input_json: str,
+ output_file: str,
+ output_json: str = None,
+ threshold: int = 1,
+ keep_punctuation: bool = False,
+ character_level: bool = False,
+ retokenize: bool = False,
+ host_address: str = "http://localhost:9000",
+ zh: bool = True):
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ logging.info("Build Vocab")
+ vocabulary = build_vocab(
+ input_json=input_json, output_json=output_json, threshold=threshold,
+ keep_punctuation=keep_punctuation, host_address=host_address,
+ character_level=character_level, retokenize=retokenize, zh=zh)
+ pickle.dump(vocabulary, open(output_file, "wb"))
+ logging.info("Total vocabulary size: {}".format(len(vocabulary)))
+ logging.info("Saved vocab to '{}'".format(output_file))
+
+
+if __name__ == '__main__':
+ fire.Fire(process)
diff --git a/audio_to_text/captioning/utils/eval_round_robin.py b/audio_to_text/captioning/utils/eval_round_robin.py
new file mode 100644
index 0000000000000000000000000000000000000000..28603a56fe3e6603cca7da5d70c0f71b1421c7c5
--- /dev/null
+++ b/audio_to_text/captioning/utils/eval_round_robin.py
@@ -0,0 +1,182 @@
+import copy
+import json
+
+import numpy as np
+import fire
+
+
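+# Round-robin evaluation of the human annotations themselves: each caption of a
+# clip takes one turn as the "prediction" and is scored against the remaining
+# captions of the same clip; the per-round scores are averaged. This gives a
+# human reference level for the automatic metrics.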
+def evaluate_annotation(key2refs, scorer):
+ if scorer.method() == "Bleu":
+ scores = np.array([ 0.0 for n in range(4) ])
+ else:
+ scores = 0
+ num_cap_per_audio = len(next(iter(key2refs.values())))
+
+ for i in range(num_cap_per_audio):
+ if i > 0:
+ for key in key2refs:
+ key2refs[key].insert(0, res[key][0])
+ res = { key: [refs.pop(),] for key, refs in key2refs.items() }
+ score, _ = scorer.compute_score(key2refs, res)
+
+ if scorer.method() == "Bleu":
+ scores += np.array(score)
+ else:
+ scores += score
+
+ score = scores / num_cap_per_audio
+ return score
+
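+# Leave-one-out evaluation of system predictions: in round i the i-th reference
+# of every clip is held out and the prediction is scored against the remaining
+# references; scores are averaged over rounds so they are computed with the same
+# number of references as the round-robin annotation scores above.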
+def evaluate_prediction(key2pred, key2refs, scorer):
+ if scorer.method() == "Bleu":
+ scores = np.array([ 0.0 for n in range(4) ])
+ else:
+ scores = 0
+ num_cap_per_audio = len(next(iter(key2refs.values())))
+
+ for i in range(num_cap_per_audio):
+ key2refs_i = {}
+ for key, refs in key2refs.items():
+ key2refs_i[key] = refs[:i] + refs[i+1:]
+ score, _ = scorer.compute_score(key2refs_i, key2pred)
+
+ if scorer.method() == "Bleu":
+ scores += np.array(score)
+ else:
+ scores += score
+
+ score = scores / num_cap_per_audio
+ return score
+
+
+class Evaluator(object):
+
+ def eval_annotation(self, annotation, output):
+ captions = json.load(open(annotation, "r"))["audios"]
+
+ key2refs = {}
+ for audio_idx in range(len(captions)):
+ audio_id = captions[audio_idx]["audio_id"]
+ key2refs[audio_id] = []
+ for caption in captions[audio_idx]["captions"]:
+ key2refs[audio_id].append(caption["caption"])
+
+ from fense.fense import Fense
+ scores = {}
+ scorer = Fense()
+ scores[scorer.method()] = evaluate_annotation(copy.deepcopy(key2refs), scorer)
+
+ refs4eval = {}
+ for key, refs in key2refs.items():
+ refs4eval[key] = []
+ for idx, ref in enumerate(refs):
+ refs4eval[key].append({
+ "audio_id": key,
+ "id": idx,
+ "caption": ref
+ })
+
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+
+ tokenizer = PTBTokenizer()
+ key2refs = tokenizer.tokenize(refs4eval)
+
+
+ from pycocoevalcap.bleu.bleu import Bleu
+ from pycocoevalcap.cider.cider import Cider
+ from pycocoevalcap.rouge.rouge import Rouge
+ from pycocoevalcap.meteor.meteor import Meteor
+ from pycocoevalcap.spice.spice import Spice
+
+
+ scorers = [Bleu(), Rouge(), Cider(), Meteor(), Spice()]
+ for scorer in scorers:
+ scores[scorer.method()] = evaluate_annotation(copy.deepcopy(key2refs), scorer)
+
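+        # SPIDEr is the average of CIDEr and SPICE.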
+ spider = 0
+ with open(output, "w") as f:
+ for name, score in scores.items():
+ if name == "Bleu":
+ for n in range(4):
+ f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))
+ else:
+ f.write("{}: {:6.3f}\n".format(name, score))
+ if name in ["CIDEr", "SPICE"]:
+ spider += score
+ f.write("SPIDEr: {:6.3f}\n".format(spider / 2))
+
+ def eval_prediction(self, prediction, annotation, output):
+ ref_captions = json.load(open(annotation, "r"))["audios"]
+
+ key2refs = {}
+ for audio_idx in range(len(ref_captions)):
+ audio_id = ref_captions[audio_idx]["audio_id"]
+ key2refs[audio_id] = []
+ for caption in ref_captions[audio_idx]["captions"]:
+ key2refs[audio_id].append(caption["caption"])
+
+ pred_captions = json.load(open(prediction, "r"))["predictions"]
+
+ key2pred = {}
+ for audio_idx in range(len(pred_captions)):
+ item = pred_captions[audio_idx]
+ audio_id = item["filename"]
+ key2pred[audio_id] = [item["tokens"]]
+
+ from fense.fense import Fense
+ scores = {}
+ scorer = Fense()
+ scores[scorer.method()] = evaluate_prediction(key2pred, key2refs, scorer)
+
+ refs4eval = {}
+ for key, refs in key2refs.items():
+ refs4eval[key] = []
+ for idx, ref in enumerate(refs):
+ refs4eval[key].append({
+ "audio_id": key,
+ "id": idx,
+ "caption": ref
+ })
+
+ preds4eval = {}
+ for key, preds in key2pred.items():
+ preds4eval[key] = []
+ for idx, pred in enumerate(preds):
+ preds4eval[key].append({
+ "audio_id": key,
+ "id": idx,
+ "caption": pred
+ })
+
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+
+ tokenizer = PTBTokenizer()
+ key2refs = tokenizer.tokenize(refs4eval)
+ key2pred = tokenizer.tokenize(preds4eval)
+
+
+ from pycocoevalcap.bleu.bleu import Bleu
+ from pycocoevalcap.cider.cider import Cider
+ from pycocoevalcap.rouge.rouge import Rouge
+ from pycocoevalcap.meteor.meteor import Meteor
+ from pycocoevalcap.spice.spice import Spice
+
+ scorers = [Bleu(), Rouge(), Cider(), Meteor(), Spice()]
+ for scorer in scorers:
+ scores[scorer.method()] = evaluate_prediction(key2pred, key2refs, scorer)
+
+ spider = 0
+ with open(output, "w") as f:
+ for name, score in scores.items():
+ if name == "Bleu":
+ for n in range(4):
+ f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))
+ else:
+ f.write("{}: {:6.3f}\n".format(name, score))
+ if name in ["CIDEr", "SPICE"]:
+ spider += score
+ f.write("SPIDEr: {:6.3f}\n".format(spider / 2))
+
+
+if __name__ == "__main__":
+ fire.Fire(Evaluator)
diff --git a/audio_to_text/captioning/utils/fasttext/create_word_embedding.py b/audio_to_text/captioning/utils/fasttext/create_word_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..09da13a62a3462e730c8275320a6391536ff42c4
--- /dev/null
+++ b/audio_to_text/captioning/utils/fasttext/create_word_embedding.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+#!/usr/bin/env python3
+
+import numpy as np
+import pandas as pd
+import torch
+from gensim.models import FastText
+from tqdm import tqdm
+import fire
+
+import sys
+import os
+sys.path.append(os.getcwd())
+from utils.build_vocab import Vocabulary
+
+def create_embedding(caption_file: str,
+ vocab_file: str,
+ embed_size: int,
+ output: str,
+ **fasttext_kwargs):
+ caption_df = pd.read_json(caption_file)
+    caption_df["tokens"] = caption_df["tokens"].apply(lambda x: ["<start>"] + [token for token in x] + ["<end>"])
+
+ sentences = list(caption_df["tokens"].values)
+ vocabulary = torch.load(vocab_file, map_location="cpu")
+
+    # pop "epochs" so it is not passed to the FastText constructor a second time
+    epochs = fasttext_kwargs.pop("epochs", 10)
+ model = FastText(size=embed_size, min_count=1, **fasttext_kwargs)
+ model.build_vocab(sentences=sentences)
+ model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
+
+ word_embeddings = np.zeros((len(vocabulary), embed_size))
+
+ with tqdm(total=len(vocabulary), ascii=True) as pbar:
+ for word, idx in vocabulary.word2idx.items():
+            if word == "<start>" or word == "<end>":  # special tokens have no embedding
+ continue
+ word_embeddings[idx] = model.wv[word]
+ pbar.update()
+
+ np.save(output, word_embeddings)
+
+    print("Finished writing fasttext embeddings to " + output)
+
+
+if __name__ == "__main__":
+ fire.Fire(create_embedding)
+
+
+
diff --git a/audio_to_text/captioning/utils/lr_scheduler.py b/audio_to_text/captioning/utils/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46e3f0397634bcf48a6a61ab041a7ea07577eb3
--- /dev/null
+++ b/audio_to_text/captioning/utils/lr_scheduler.py
@@ -0,0 +1,128 @@
+import math
+import torch
+
+
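+# Linear warmup for `warmup_iters` steps, then exponential decay such that the
+# learning rate reaches `final_lrs` at `total_iters`:
+#     lr(t) = base_lr * (final_lr / base_lr) ** ((t - warmup_iters) / (total_iters - warmup_iters))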
+class ExponentialDecayScheduler(torch.optim.lr_scheduler._LRScheduler):
+
+ def __init__(self, optimizer, total_iters, final_lrs,
+ warmup_iters=3000, last_epoch=-1, verbose=False):
+ self.total_iters = total_iters
+ self.final_lrs = final_lrs
+ if not isinstance(self.final_lrs, list) and not isinstance(
+ self.final_lrs, tuple):
+ self.final_lrs = [self.final_lrs] * len(optimizer.param_groups)
+ self.warmup_iters = warmup_iters
+ self.bases = [0.0,] * len(optimizer.param_groups)
+ super().__init__(optimizer, last_epoch, verbose)
+ for i, (base_lr, final_lr) in enumerate(zip(self.base_lrs, self.final_lrs)):
+ base = (final_lr / base_lr) ** (1 / (
+ self.total_iters - self.warmup_iters))
+ self.bases[i] = base
+
+ def _get_closed_form_lr(self):
+ warmup_coeff = 1.0
+ current_iter = self._step_count
+ if current_iter < self.warmup_iters:
+ warmup_coeff = current_iter / self.warmup_iters
+ current_lrs = []
+ # if not self.linear_warmup:
+ # for base_lr, final_lr, base in zip(self.base_lrs, self.final_lrs, self.bases):
+ # # current_lr = warmup_coeff * base_lr * math.exp(((current_iter - self.warmup_iters) / self.total_iters) * math.log(final_lr / base_lr))
+ # current_lr = warmup_coeff * base_lr * (base ** (current_iter - self.warmup_iters))
+ # current_lrs.append(current_lr)
+ # else:
+ for base_lr, final_lr, base in zip(self.base_lrs, self.final_lrs,
+ self.bases):
+ if current_iter <= self.warmup_iters:
+ current_lr = warmup_coeff * base_lr
+ else:
+ # current_lr = warmup_coeff * base_lr * math.exp(((current_iter - self.warmup_iters) / self.total_iters) * math.log(final_lr / base_lr))
+ current_lr = base_lr * (base ** (current_iter - self.warmup_iters))
+ current_lrs.append(current_lr)
+ return current_lrs
+
+ def get_lr(self):
+ return self._get_closed_form_lr()
+
+
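+# "Noam" schedule from "Attention Is All You Need":
+#     lr(t) = factor * model_size ** -0.5 * min(t ** -0.5, t * warmup_iters ** -1.5)
+# i.e. linear warmup followed by inverse-square-root decay.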
+class NoamScheduler(torch.optim.lr_scheduler._LRScheduler):
+
+ def __init__(self, optimizer, model_size=512, factor=1, warmup_iters=3000,
+ last_epoch=-1, verbose=False):
+ self.model_size = model_size
+ self.warmup_iters = warmup_iters
+ # self.factors = [group["lr"] / (self.model_size ** (-0.5) * self.warmup_iters ** (-0.5)) for group in optimizer.param_groups]
+ self.factor = factor
+ super().__init__(optimizer, last_epoch, verbose)
+
+ def _get_closed_form_lr(self):
+ current_iter = self._step_count
+ current_lrs = []
+ for _ in self.base_lrs:
+ current_lr = self.factor * \
+ (self.model_size ** (-0.5) * min(current_iter ** (-0.5),
+ current_iter * self.warmup_iters ** (-1.5)))
+ current_lrs.append(current_lr)
+ return current_lrs
+
+ def get_lr(self):
+ return self._get_closed_form_lr()
+
+
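+# Linear warmup followed by a cosine decay towards zero over the remaining
+# (total_iters - warmup_iters) steps; with num_cycles=0.5 the multiplier follows
+# a single half-cosine from 1 down to 0.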
+class CosineWithWarmup(torch.optim.lr_scheduler._LRScheduler):
+
+ def __init__(self, optimizer, total_iters, warmup_iters,
+ num_cycles=0.5, last_epoch=-1, verbose=False):
+ self.total_iters = total_iters
+ self.warmup_iters = warmup_iters
+ self.num_cycles = num_cycles
+ super().__init__(optimizer, last_epoch, verbose)
+
+ def lr_lambda(self, iteration):
+ if iteration < self.warmup_iters:
+ return float(iteration) / float(max(1, self.warmup_iters))
+ progress = float(iteration - self.warmup_iters) / float(max(1,
+ self.total_iters - self.warmup_iters))
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(
+ self.num_cycles) * 2.0 * progress)))
+
+ def _get_closed_form_lr(self):
+ current_iter = self._step_count
+ current_lrs = []
+ for base_lr in self.base_lrs:
+ current_lr = base_lr * self.lr_lambda(current_iter)
+ current_lrs.append(current_lr)
+ return current_lrs
+
+ def get_lr(self):
+ return self._get_closed_form_lr()
+
+
+if __name__ == "__main__":
+ model = torch.nn.Linear(10, 5)
+ optimizer = torch.optim.Adam(model.parameters(), 5e-4)
+ epochs = 25
+ iters = 600
+ scheduler = CosineWithWarmup(optimizer, 600 * 25, 600 * 5,)
+ # scheduler = ExponentialDecayScheduler(optimizer, 600 * 25, 5e-7, 600 * 5)
+ criterion = torch.nn.MSELoss()
+ lrs = []
+ for epoch in range(1, epochs + 1):
+ for iteration in range(1, iters + 1):
+ optimizer.zero_grad()
+ x = torch.randn(4, 10)
+ y = torch.randn(4, 5)
+ loss = criterion(model(x), y)
+ loss.backward()
+ optimizer.step()
+ scheduler.step()
+ # print(f"lr: {scheduler.get_last_lr()}")
+ # lrs.append(scheduler.get_last_lr())
+ lrs.append(optimizer.param_groups[0]["lr"])
+ import matplotlib.pyplot as plt
+ plt.plot(list(range(1, len(lrs) + 1)), lrs, '-o', markersize=1)
+ # plt.legend(loc="best")
+ plt.xlabel("Iteration")
+ plt.ylabel("LR")
+
+ plt.savefig("lr_curve.png", dpi=100)
diff --git a/audio_to_text/captioning/utils/model_eval_diff.py b/audio_to_text/captioning/utils/model_eval_diff.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c29ef8fde2451d3f84e842d0d6a72754f0d4603
--- /dev/null
+++ b/audio_to_text/captioning/utils/model_eval_diff.py
@@ -0,0 +1,110 @@
+import os
+import sys
+import copy
+import pickle
+
+import numpy as np
+import pandas as pd
+import fire
+
+sys.path.append(os.getcwd())
+
+
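+# Returns the difference between the score obtained with the full reference set
+# and the average score when each reference is left out in turn, i.e. how much
+# the metric gains from one additional reference per clip.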
+def coco_score(refs, pred, scorer):
+ if scorer.method() == "Bleu":
+ scores = np.array([ 0.0 for n in range(4) ])
+ else:
+ scores = 0
+ num_cap_per_audio = len(refs[list(refs.keys())[0]])
+
+ for i in range(num_cap_per_audio):
+ if i > 0:
+ for key in refs:
+ refs[key].insert(0, res[key][0])
+ res = {key: [refs[key].pop(),] for key in refs}
+ score, _ = scorer.compute_score(refs, pred)
+
+ if scorer.method() == "Bleu":
+ scores += np.array(score)
+ else:
+ scores += score
+
+ score = scores / num_cap_per_audio
+
+ for key in refs:
+ refs[key].insert(0, res[key][0])
+ score_allref, _ = scorer.compute_score(refs, pred)
+ diff = score_allref - score
+ return diff
+
+def embedding_score(refs, pred, scorer):
+
+ num_cap_per_audio = len(refs[list(refs.keys())[0]])
+ scores = 0
+
+ for i in range(num_cap_per_audio):
+ res = {key: [refs[key][i],] for key in refs.keys() if len(refs[key]) == num_cap_per_audio}
+ refs_i = {key: np.concatenate([refs[key][:i], refs[key][i+1:]]) for key in refs.keys() if len(refs[key]) == num_cap_per_audio}
+ score, _ = scorer.compute_score(refs_i, pred)
+
+ scores += score
+
+ score = scores / num_cap_per_audio
+
+ score_allref, _ = scorer.compute_score(refs, pred)
+ diff = score_allref - score
+ return diff
+
+def main(output_file, eval_caption_file, eval_embedding_file, output, zh=False):
+ output_df = pd.read_json(output_file)
+ output_df["key"] = output_df["filename"].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
+ pred = output_df.groupby("key")["tokens"].apply(list).to_dict()
+
+ label_df = pd.read_json(eval_caption_file)
+ if zh:
+ refs = label_df.groupby("key")["tokens"].apply(list).to_dict()
+ else:
+ refs = label_df.groupby("key")["caption"].apply(list).to_dict()
+
+ from pycocoevalcap.bleu.bleu import Bleu
+ from pycocoevalcap.cider.cider import Cider
+ from pycocoevalcap.rouge.rouge import Rouge
+
+ scorer = Bleu(zh=zh)
+ bleu_scores = coco_score(copy.deepcopy(refs), pred, scorer)
+ scorer = Cider(zh=zh)
+ cider_score = coco_score(copy.deepcopy(refs), pred, scorer)
+ scorer = Rouge(zh=zh)
+ rouge_score = coco_score(copy.deepcopy(refs), pred, scorer)
+
+ if not zh:
+ from pycocoevalcap.meteor.meteor import Meteor
+ scorer = Meteor()
+ meteor_score = coco_score(copy.deepcopy(refs), pred, scorer)
+
+ from pycocoevalcap.spice.spice import Spice
+ scorer = Spice()
+ spice_score = coco_score(copy.deepcopy(refs), pred, scorer)
+
+ # from audiocaptioneval.sentbert.sentencebert import SentenceBert
+ # scorer = SentenceBert(zh=zh)
+ # with open(eval_embedding_file, "rb") as f:
+ # ref_embeddings = pickle.load(f)
+
+ # sent_bert = embedding_score(ref_embeddings, pred, scorer)
+
+ with open(output, "w") as f:
+ f.write("Diff:\n")
+ for n in range(4):
+ f.write("BLEU-{}: {:6.3f}\n".format(n+1, bleu_scores[n]))
+ f.write("CIDEr: {:6.3f}\n".format(cider_score))
+ f.write("ROUGE: {:6.3f}\n".format(rouge_score))
+ if not zh:
+ f.write("Meteor: {:6.3f}\n".format(meteor_score))
+ f.write("SPICE: {:6.3f}\n".format(spice_score))
+ # f.write("SentenceBert: {:6.3f}\n".format(sent_bert))
+
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/audio_to_text/captioning/utils/predict_nn.py b/audio_to_text/captioning/utils/predict_nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..699c3dcffe7ce2c6dad33a5546c707dd76ccf82c
--- /dev/null
+++ b/audio_to_text/captioning/utils/predict_nn.py
@@ -0,0 +1,49 @@
+import json
+import random
+import argparse
+import numpy as np
+from tqdm import tqdm
+from h5py import File
+import sklearn.metrics
+
+random.seed(1)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("train_feature", type=str)
+parser.add_argument("train_corpus", type=str)
+parser.add_argument("pred_feature", type=str)
+parser.add_argument("output_json", type=str)
+
+args = parser.parse_args()
+train_embs = []
+train_idx_to_audioid = []
+with File(args.train_feature, "r") as store:
+ for audio_id, embedding in tqdm(store.items(), ascii=True):
+ train_embs.append(embedding[()])
+ train_idx_to_audioid.append(audio_id)
+
+train_annotation = json.load(open(args.train_corpus, "r"))["audios"]
+train_audioid_to_tokens = {}
+for item in train_annotation:
+ audio_id = item["audio_id"]
+ train_audioid_to_tokens[audio_id] = [cap_item["tokens"] for cap_item in item["captions"]]
+train_embs = np.stack(train_embs)
+
+
+pred_data = []
+pred_embs = []
+pred_idx_to_audioids = []
+with File(args.pred_feature, "r") as store:
+ for audio_id, embedding in tqdm(store.items(), ascii=True):
+ pred_embs.append(embedding[()])
+ pred_idx_to_audioids.append(audio_id)
+pred_embs = np.stack(pred_embs)
+
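+# Nearest-neighbour "captioning": for every test clip, pick the training clip
+# with the highest cosine similarity between audio embeddings and reuse one of
+# its human captions (chosen at random) as the prediction.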
+similarity = sklearn.metrics.pairwise.cosine_similarity(pred_embs, train_embs)
+for idx, audio_id in enumerate(pred_idx_to_audioids):
+ train_idx = similarity[idx].argmax()
+ pred_data.append({
+ "filename": audio_id,
+ "tokens": random.choice(train_audioid_to_tokens[train_idx_to_audioid[train_idx]])
+ })
+json.dump({"predictions": pred_data}, open(args.output_json, "w"), ensure_ascii=False, indent=4)
diff --git a/audio_to_text/captioning/utils/remove_optimizer.py b/audio_to_text/captioning/utils/remove_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b9871ee8022c0e0814abb46173fee1a6ae4ba9c
--- /dev/null
+++ b/audio_to_text/captioning/utils/remove_optimizer.py
@@ -0,0 +1,18 @@
+import argparse
+import torch
+
+
+def main(checkpoint):
+ state_dict = torch.load(checkpoint, map_location="cpu")
+ if "optimizer" in state_dict:
+ del state_dict["optimizer"]
+ if "lr_scheduler" in state_dict:
+ del state_dict["lr_scheduler"]
+ torch.save(state_dict, checkpoint)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint", type=str)
+ args = parser.parse_args()
+ main(args.checkpoint)
diff --git a/audio_to_text/captioning/utils/report_results.py b/audio_to_text/captioning/utils/report_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b9f6ec5e8d2f253706198e0d521f73981ef3efe
--- /dev/null
+++ b/audio_to_text/captioning/utils/report_results.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+import argparse
+import numpy as np
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="input filename", type=str, nargs="+")
+parser.add_argument("--output", help="output result file", default=None)
+
+args = parser.parse_args()
+
+
+scores = {}
+for path in args.input:
+ with open(path, "r") as reader:
+ for line in reader.readlines():
+ metric, score = line.strip().split(": ")
+ score = float(score)
+ if metric not in scores:
+ scores[metric] = []
+ scores[metric].append(score)
+
+if len(scores) == 0:
+ print("No experiment directory found, wrong path?")
+ exit(1)
+
+with open(args.output, "w") as writer:
+ print("Average results: ", file=writer)
+ for metric, score in scores.items():
+ score = np.array(score)
+ mean = np.mean(score)
+ std = np.std(score)
+ print(f"{metric}: {mean:.3f} (±{std:.3f})", file=writer)
+ print("", file=writer)
+ print("Best results: ", file=writer)
+ for metric, score in scores.items():
+ score = np.max(score)
+ print(f"{metric}: {score:.3f}", file=writer)
diff --git a/audio_to_text/captioning/utils/tokenize_caption.py b/audio_to_text/captioning/utils/tokenize_caption.py
new file mode 100644
index 0000000000000000000000000000000000000000..b340068577a1d4b02e187048e6a20cb95264561f
--- /dev/null
+++ b/audio_to_text/captioning/utils/tokenize_caption.py
@@ -0,0 +1,86 @@
+import json
+from tqdm import tqdm
+import re
+import fire
+
+
+def tokenize_caption(input_json: str,
+ keep_punctuation: bool = False,
+ host_address: str = None,
+ character_level: bool = False,
+ zh: bool = True,
+ output_json: str = None):
+    """Tokenize the captions in the given json file and write the tokens back to json.
+
+ Args:
+        input_json (string): Preprocessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+        keep_punctuation (bool): Include or exclude punctuation (used for Chinese tokenization).
+        output_json (string): Where to write the tokenized data; defaults to overwriting input_json.
+
+    Returns:
+        None. The tokenized captions are written to the output json file.
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+
+ if zh:
+ from nltk.parse.corenlp import CoreNLPParser
+ from zhon.hanzi import punctuation
+ parser = CoreNLPParser(host_address)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ # Remove all punctuations
+ if not keep_punctuation:
+ caption = re.sub("[{}]".format(punctuation), "", caption)
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens = list(parser.tokenize(caption))
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ else:
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+ captions = {}
+ for audio_idx in range(len(data)):
+ audio_id = data[audio_idx]["audio_id"]
+ captions[audio_id] = []
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ captions[audio_id].append({
+ "audio_id": audio_id,
+ "id": cap_idx,
+ "caption": caption
+ })
+ tokenizer = PTBTokenizer()
+ captions = tokenizer.tokenize(captions)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ audio_id = data[audio_idx]["audio_id"]
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = captions[audio_id][cap_idx]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+
+ if output_json:
+ json.dump(
+ { "audios": data }, open(output_json, "w"),
+ indent=4, ensure_ascii=not zh)
+ else:
+ json.dump(
+ { "audios": data }, open(input_json, "w"),
+ indent=4, ensure_ascii=not zh)
+
+
+if __name__ == "__main__":
+ fire.Fire(tokenize_caption)
diff --git a/audio_to_text/captioning/utils/train_util.py b/audio_to_text/captioning/utils/train_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cd62cc36043a2db75cc6761c51fdfdd18d11392
--- /dev/null
+++ b/audio_to_text/captioning/utils/train_util.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
+import os
+import sys
+import logging
+from typing import Callable, Dict, Union
+import yaml
+import torch
+from torch.optim.swa_utils import AveragedModel as torch_average_model
+import numpy as np
+import pandas as pd
+from pprint import pformat
+
+
+def load_dict_from_csv(csv, cols):
+ df = pd.read_csv(csv, sep="\t")
+ output = dict(zip(df[cols[0]], df[cols[1]]))
+ return output
+
+
+def init_logger(filename, level="INFO"):
+ formatter = logging.Formatter(
+ "[ %(levelname)s : %(asctime)s ] - %(message)s")
+ logger = logging.getLogger(__name__ + "." + filename)
+ logger.setLevel(getattr(logging, level))
+ # Log results to std
+ # stdhandler = logging.StreamHandler(sys.stdout)
+ # stdhandler.setFormatter(formatter)
+ # Dump log to file
+ filehandler = logging.FileHandler(filename)
+ filehandler.setFormatter(formatter)
+ logger.addHandler(filehandler)
+ # logger.addHandler(stdhandler)
+ return logger
+
+
+def init_obj(module, config, **kwargs):  # e.g. module = captioning.models.encoder
+ obj_args = config["args"].copy()
+ obj_args.update(kwargs)
+ return getattr(module, config["type"])(**obj_args)
+
+
+def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'):
+ """pprint_dict
+
+    :param in_dict: dict to print
+    :param outputfun: output function to use, defaults to sys.stdout.write
+    :param formatter: 'yaml' (default) or 'pretty'
+ """
+ if formatter == 'yaml':
+ format_fun = yaml.dump
+ elif formatter == 'pretty':
+ format_fun = pformat
+ for line in format_fun(in_dict).split('\n'):
+ outputfun(line)
+
+
+def merge_a_into_b(a, b):
+ # merge dict a into dict b. values in a will overwrite b.
+ for k, v in a.items():
+ if isinstance(v, dict) and k in b:
+ assert isinstance(
+ b[k], dict
+ ), "Cannot inherit key '{}' from base!".format(k)
+ merge_a_into_b(v, b[k])
+ else:
+ b[k] = v
+
+
+def load_config(config_file):
+ with open(config_file, "r") as reader:
+ config = yaml.load(reader, Loader=yaml.FullLoader)
+ if "inherit_from" in config:
+ base_config_file = config["inherit_from"]
+ base_config_file = os.path.join(
+ os.path.dirname(config_file), base_config_file
+ )
+ assert not os.path.samefile(config_file, base_config_file), \
+ "inherit from itself"
+ base_config = load_config(base_config_file)
+ del config["inherit_from"]
+ merge_a_into_b(config, base_config)
+ return base_config
+ return config
+
+
+def parse_config_or_kwargs(config_file, **kwargs):
+ yaml_config = load_config(config_file)
+ # passed kwargs will override yaml config
+ args = dict(yaml_config, **kwargs)
+ return args
+
+
+def store_yaml(config, config_file):
+ with open(config_file, "w") as con_writer:
+ yaml.dump(config, con_writer, indent=4, default_flow_style=False)
+
+
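+# Tracks the best value of a monitored metric; __call__ returns True (and stores
+# the new best) when the value improves, under "min" or "max" mode.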
+class MetricImprover:
+
+ def __init__(self, mode):
+ assert mode in ("min", "max")
+ self.mode = mode
+ # min: lower -> better; max: higher -> better
+ self.best_value = np.inf if mode == "min" else -np.inf
+
+ def compare(self, x, best_x):
+ return x < best_x if self.mode == "min" else x > best_x
+
+ def __call__(self, x):
+ if self.compare(x, self.best_value):
+ self.best_value = x
+ return True
+ return False
+
+ def state_dict(self):
+ return self.__dict__
+
+ def load_state_dict(self, state_dict):
+ self.__dict__.update(state_dict)
+
+
+def fix_batchnorm(model: torch.nn.Module):
+ def inner(module):
+ class_name = module.__class__.__name__
+ if class_name.find("BatchNorm") != -1:
+ module.eval()
+ model.apply(inner)
+
+
+def load_pretrained_model(model: torch.nn.Module,
+ pretrained: Union[str, Dict],
+ output_fn: Callable = sys.stdout.write):
+ if not isinstance(pretrained, dict) and not os.path.exists(pretrained):
+        output_fn(f"pretrained checkpoint {pretrained} does not exist!")
+ return
+
+ if hasattr(model, "load_pretrained"):
+ model.load_pretrained(pretrained)
+ return
+
+ if isinstance(pretrained, dict):
+ state_dict = pretrained
+ else:
+ state_dict = torch.load(pretrained, map_location="cpu")
+
+ if "model" in state_dict:
+ state_dict = state_dict["model"]
+ model_dict = model.state_dict()
+ pretrained_dict = {
+ k: v for k, v in state_dict.items() if (k in model_dict) and (
+ model_dict[k].shape == v.shape)
+ }
+ output_fn(f"Loading pretrained keys {pretrained_dict.keys()}")
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict, strict=True)
+
+
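+# SWA wrapper that, in addition to the parameters, also averages the module
+# buffers (e.g. BatchNorm running statistics); the first buffer, the internal
+# n_averaged counter, is skipped.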
+class AveragedModel(torch_average_model):
+
+ def update_parameters(self, model):
+ for p_swa, p_model in zip(self.parameters(), model.parameters()):
+ device = p_swa.device
+ p_model_ = p_model.detach().to(device)
+ if self.n_averaged == 0:
+ p_swa.detach().copy_(p_model_)
+ else:
+ p_swa.detach().copy_(self.avg_fn(p_swa.detach(), p_model_,
+ self.n_averaged.to(device)))
+
+ for b_swa, b_model in zip(list(self.buffers())[1:], model.buffers()):
+ device = b_swa.device
+ b_model_ = b_model.detach().to(device)
+ if self.n_averaged == 0:
+ b_swa.detach().copy_(b_model_)
+ else:
+ b_swa.detach().copy_(self.avg_fn(b_swa.detach(), b_model_,
+ self.n_averaged.to(device)))
+ self.n_averaged += 1
diff --git a/audio_to_text/captioning/utils/word2vec/create_word_embedding.py b/audio_to_text/captioning/utils/word2vec/create_word_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..77ebe5adc6ec14bd639e78125f00c1eaea0b4dcc
--- /dev/null
+++ b/audio_to_text/captioning/utils/word2vec/create_word_embedding.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+#!/usr/bin/env python3
+
+import numpy as np
+import pandas as pd
+import torch
+import gensim
+from gensim.models import Word2Vec
+from tqdm import tqdm
+import fire
+
+import sys
+import os
+sys.path.append(os.getcwd())
+from utils.build_vocab import Vocabulary
+
+def create_embedding(vocab_file: str,
+ embed_size: int,
+ output: str,
+ caption_file: str = None,
+ pretrained_weights_path: str = None,
+ **word2vec_kwargs):
+ vocabulary = torch.load(vocab_file, map_location="cpu")
+
+ if pretrained_weights_path:
+ model = gensim.models.KeyedVectors.load_word2vec_format(
+ fname=pretrained_weights_path,
+ binary=True,
+ )
+ if model.vector_size != embed_size:
+            assert embed_size < model.vector_size, f"can only reduce the dimension, cannot increase it from {model.vector_size} to {embed_size}"
+ from sklearn.decomposition import PCA
+ pca = PCA(n_components=embed_size)
+ model.vectors = pca.fit_transform(model.vectors)
+ else:
+ caption_df = pd.read_json(caption_file)
+        caption_df["tokens"] = caption_df["tokens"].apply(lambda x: ["<start>"] + [token for token in x] + ["<end>"])
+ sentences = list(caption_df["tokens"].values)
+ epochs = word2vec_kwargs.get("epochs", 10)
+ if "epochs" in word2vec_kwargs:
+ del word2vec_kwargs["epochs"]
+ model = Word2Vec(size=embed_size, min_count=1, **word2vec_kwargs)
+ model.build_vocab(sentences=sentences)
+ model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
+
+ word_embeddings = np.random.randn(len(vocabulary), embed_size)
+
+ if isinstance(model, gensim.models.word2vec.Word2Vec):
+ model = model.wv
+ with tqdm(total=len(vocabulary), ascii=True) as pbar:
+ for word, idx in vocabulary.word2idx.items():
+ try:
+ word_embeddings[idx] = model.get_vector(word)
+ except KeyError:
+                print(f"word {word} not found in the word2vec model, it stays randomly initialized!")
+ pbar.update()
+
+ np.save(output, word_embeddings)
+
+    print("Finished writing word2vec embeddings to " + output)
+
+
+if __name__ == "__main__":
+ fire.Fire(create_embedding)
+
+
+
diff --git a/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..034751fcff1e1d3b686ae0ad1cd6346f92dacc13
--- /dev/null
+++ b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml
@@ -0,0 +1,22 @@
+model:
+ encoder:
+ type: Cnn14RnnEncoder
+ args:
+ sample_rate: 32000
+ pretrained: ./audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
+ freeze_cnn: True
+ freeze_cnn_bn: True
+ bidirectional: True
+ dropout: 0.5
+ hidden_size: 256
+ num_layers: 3
+ decoder:
+ type: TransformerDecoder
+ args:
+ attn_emb_dim: 512
+ dropout: 0.2
+ emb_dim: 256
+ fc_emb_dim: 512
+ nlayers: 2
+ type: TransformerModel
+ args: {}
diff --git a/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0f9a16de2efa334d403326acec7de5de4c3393d6
--- /dev/null
+++ b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8d341dccafcdcfb7009c402afb07f314ab1d613a5f5c42d32407d6c2a821abf
+size 41755865
diff --git a/audio_to_text/inference_waveform.py b/audio_to_text/inference_waveform.py
new file mode 100644
index 0000000000000000000000000000000000000000..aba39614c8104f62cdb8a3c7e0e3cf5dced0d95a
--- /dev/null
+++ b/audio_to_text/inference_waveform.py
@@ -0,0 +1,102 @@
+import sys
+import os
+import librosa
+import numpy as np
+import torch
+import audio_to_text.captioning.models
+import audio_to_text.captioning.models.encoder
+import audio_to_text.captioning.models.decoder
+import audio_to_text.captioning.utils.train_util as train_util
+
+
+def load_model(config, checkpoint):
+ ckpt = torch.load(checkpoint, "cpu")
+ encoder_cfg = config["model"]["encoder"]
+ encoder = train_util.init_obj(
+ audio_to_text.captioning.models.encoder,
+ encoder_cfg
+ )
+ if "pretrained" in encoder_cfg:
+ pretrained = encoder_cfg["pretrained"]
+ train_util.load_pretrained_model(encoder,
+ pretrained,
+ sys.stdout.write)
+ decoder_cfg = config["model"]["decoder"]
+ if "vocab_size" not in decoder_cfg["args"]:
+ decoder_cfg["args"]["vocab_size"] = len(ckpt["vocabulary"])
+ decoder = train_util.init_obj(
+ audio_to_text.captioning.models.decoder,
+ decoder_cfg
+ )
+ if "word_embedding" in decoder_cfg:
+ decoder.load_word_embedding(**decoder_cfg["word_embedding"])
+ if "pretrained" in decoder_cfg:
+ pretrained = decoder_cfg["pretrained"]
+ train_util.load_pretrained_model(decoder,
+ pretrained,
+ sys.stdout.write)
+ model = train_util.init_obj(audio_to_text.captioning.models, config["model"],
+ encoder=encoder, decoder=decoder)
+ train_util.load_pretrained_model(model, ckpt)
+ model.eval()
+ return {
+ "model": model,
+ "vocabulary": ckpt["vocabulary"]
+ }
+
+
+def decode_caption(word_ids, vocabulary):
+ candidate = []
+ for word_id in word_ids:
+ word = vocabulary[word_id]
+        if word == "<end>":
+            break
+        elif word == "<start>":
+            continue
+ candidate.append(word)
+ candidate = " ".join(candidate)
+ return candidate
+
+
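+# Thin inference wrapper: loads config.yaml and the swa.pth checkpoint from
+# `weight_dir`, then turns raw waveforms (numpy arrays or a path to an audio
+# file, loaded at 32 kHz) into text captions via beam search.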
+class AudioCapModel(object):
+    def __init__(self, weight_dir, device='cuda'):
+        config = os.path.join(weight_dir, 'config.yaml')
+        self.config = train_util.parse_config_or_kwargs(config)
+        checkpoint = os.path.join(weight_dir, 'swa.pth')
+ resumed = load_model(self.config, checkpoint)
+ model = resumed["model"]
+ self.vocabulary = resumed["vocabulary"]
+ self.model = model.to(device)
+ self.device = device
+
+    def caption(self, audio_list):
+        if isinstance(audio_list, np.ndarray):
+            audio_list = [audio_list]
+        elif isinstance(audio_list, str):
+            audio_list = [librosa.load(audio_list, sr=32000)[0]]
+
+ captions = []
+ for wav in audio_list:
+ inputwav = torch.as_tensor(wav).float().unsqueeze(0).to(self.device)
+ wav_len = torch.LongTensor([len(wav)])
+ input_dict = {
+ "mode": "inference",
+ "wav": inputwav,
+ "wav_len": wav_len,
+ "specaug": False,
+ "sample_method": "beam",
+ }
+ print(input_dict)
+ out_dict = self.model(input_dict)
+ caption_batch = [decode_caption(seq, self.vocabulary) for seq in \
+ out_dict["seq"].cpu().numpy()]
+ captions.extend(caption_batch)
+ return captions
+
+
+
+ def __call__(self, audio_list):
+ return self.caption(audio_list)
+
+
+
diff --git a/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth b/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8f570dc2d96679fbdecaba7d8f266368fc7fb0c9
--- /dev/null
+++ b/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c4faa86f30e77df235b5dc1fb6578a18ff2b8a1b0043f47e30acb9ccb53a336
+size 494977221
diff --git a/checkpoints/0102_xiaoma_pe/config.yaml b/checkpoints/0102_xiaoma_pe/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..69a88444205377d48573d53bb4fb500860976588
--- /dev/null
+++ b/checkpoints/0102_xiaoma_pe/config.yaml
@@ -0,0 +1,172 @@
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- configs/tts/lj/fs2.yaml
+binarization_args:
+ shuffle: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: true
+ with_spk_embed: true
+ with_txt: true
+ with_wav: false
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+binary_data_dir: data/binary/xiaoma1022_24k_128hop
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decoder_type: fft
+dict_dir: ''
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+gen_dir_name: ''
+hidden_size: 256
+hop_size: 128
+infer: false
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 5000
+max_input_tokens: 1550
+max_sentences: 100000
+max_tokens: 20000
+max_updates: 60000
+mel_loss: l1
+mel_vmax: 1.5
+mel_vmin: -6
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 5
+num_spk: 1
+num_test_samples: 20
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor_conv_layers: 2
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ forced_align: mfa
+ txt_processor: en
+ use_sox: false
+ use_tone: true
+pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: data/processed/ljspeech
+profile_infer: false
+raw_data_dir: data/raw/LJSpeech-1.1
+ref_norm_layer: bn
+reset_phone_dict: true
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: false
+save_gt: false
+seed: 1234
+sort_by_len: true
+stop_token_weight: 5.0
+task_cls: tasks.tts.pe.PitchExtractionTask
+test_ids:
+- 68
+- 70
+- 74
+- 87
+- 110
+- 172
+- 190
+- 215
+- 231
+- 294
+- 316
+- 324
+- 402
+- 422
+- 485
+- 500
+- 505
+- 508
+- 509
+- 519
+test_input_dir: ''
+test_num: 523
+test_set_name: test
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_pitch_embed: true
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 348
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+warmup_updates: 2000
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0102_xiaoma_pe
diff --git a/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt b/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..468cc81b1a95e2f3dd490a8770bd705e14855f77
--- /dev/null
+++ b/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb
+size 13047222
diff --git a/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml b/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..95fc5414ba1aff1bad8284ebfba52f5636b4d76d
--- /dev/null
+++ b/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml
@@ -0,0 +1,241 @@
+accumulate_grad_batches: 1
+adam_b1: 0.8
+adam_b2: 0.99
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+aux_context_window: 0
+#base_config:
+#- egs/egs_bases/singing/pwg.yaml
+#- egs/egs_bases/tts/vocoder/hifigan.yaml
+binarization_args:
+ reset_phone_dict: true
+ reset_word_dict: true
+ shuffle: false
+ trim_eos_bos: false
+ trim_sil: false
+ with_align: false
+ with_f0: true
+ with_f0cwt: false
+ with_linear: false
+ with_spk_embed: false
+ with_spk_id: true
+ with_txt: false
+ with_wav: true
+ with_word: false
+binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer
+binary_data_dir: data/binary/big_popcs_24k_hop128
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+datasets: []
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+ eps: 1.0e-06
+ lr: 0.0002
+ weight_decay: 0.0
+discriminator_params:
+ bias: true
+ conv_channels: 64
+ in_channels: 1
+ kernel_size: 3
+ layers: 10
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.2
+ out_channels: 1
+ use_weight_norm: true
+discriminator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+dropout: 0.1
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+generator_optimizer_params:
+ eps: 1.0e-06
+ lr: 0.0002
+ weight_decay: 0.0
+generator_params:
+ aux_channels: 80
+ dropout: 0.0
+ gate_channels: 128
+ in_channels: 1
+ kernel_size: 3
+ layers: 30
+ out_channels: 1
+ residual_channels: 64
+ skip_channels: 64
+ stacks: 3
+ upsample_net: ConvInUpsampleNetwork
+ upsample_params:
+ upsample_scales:
+ - 2
+ - 4
+ - 4
+ - 4
+ use_nsf: false
+ use_pitch_embed: true
+ use_weight_norm: true
+generator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 128
+infer: false
+lambda_adv: 1.0
+lambda_cdisc: 4.0
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+lambda_ph_dur: 0.0
+lambda_sent_dur: 0.0
+lambda_uv: 0.0
+lambda_word_dur: 0.0
+load_ckpt: ''
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_frames: 2400
+max_input_tokens: 1550
+max_samples: 8192
+max_sentences: 20
+max_tokens: 24000
+max_updates: 3000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 0
+min_level_db: -120
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: 5
+num_spk: 100
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ sox_resample: true
+ sox_to_wav: false
+ trim_sil: false
+ txt_processor: zh
+ use_tone: false
+pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign
+predictor_grad: 0.0
+print_nan_grads: false
+processed_data_dir: ''
+profile_infer: false
+raw_data_dir: ''
+ref_level_db: 20
+rename_tmux: true
+rerun_gen: true
+resblock: '1'
+resblock_dilation_sizes:
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: true
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+stft_loss_params:
+ fft_sizes:
+ - 1024
+ - 2048
+ - 512
+ hop_sizes:
+ - 120
+ - 240
+ - 50
+ win_lengths:
+ - 600
+ - 1200
+ - 240
+ window: hann_window
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 50
+test_prefixes: []
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_initial_channel: 512
+upsample_kernel_sizes:
+- 16
+- 16
+- 4
+- 4
+upsample_rates:
+- 8
+- 4
+- 2
+- 2
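+# note: the upsample_rates above multiply to 128 (8 * 4 * 2 * 2), matching hop_size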
+use_cdisc: false
+use_cond_disc: false
+use_fm_loss: false
+use_gt_dur: true
+use_gt_f0: true
+use_mel_loss: true
+use_ms_stft: false
+use_pitch_embed: true
+use_ref_enc: true
+use_spec_disc: false
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 512
+window: hann
+word_size: 3000
+work_dir: checkpoints/0109_hifigan_bigpopcs_hop128
diff --git a/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt b/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..ed55eaa98f86e3e22f4eb4e8115f254745cea155
--- /dev/null
+++ b/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1
+size 55827436
diff --git a/checkpoints/0228_opencpop_ds100_rel/config.yaml b/checkpoints/0228_opencpop_ds100_rel/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..977627b65c12e00e5dd2cc42e423f9ee4899956a
--- /dev/null
+++ b/checkpoints/0228_opencpop_ds100_rel/config.yaml
@@ -0,0 +1,342 @@
+K_step: 100
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- usr/configs/popcs_ds_beta6.yaml
+- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+binarization_args:
+ shuffle: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: true
+ with_spk_embed: false
+ with_txt: true
+ with_wav: true
+binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
+binary_data_dir: data/binary/opencpop-midi-dp
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+content_cond_steps: []
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+datasets:
+- popcs
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decay_steps: 50000
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 4
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 5
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+fs2_ckpt: ''
+gaussian_start: true
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+hidden_size: 256
+hop_size: 128
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 0.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 0.001
+max_beta: 0.06
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 8000
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 40000
+max_updates: 160000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6.0
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pe_ckpt: checkpoints/0102_xiaoma_pe
+pe_enable: true
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ forced_align: mfa
+ txt_processor: zh_g2pM
+ use_sox: true
+ use_tone: false
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 5
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: data/processed/popcs
+profile_infer: false
+raw_data_dir: data/raw/popcs
+ref_norm_layer: bn
+rel_pos: true
+reset_phone_dict: true
+residual_channels: 256
+residual_layers: 20
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: true
+save_gt: false
+schedule_type: linear
+seed: 1234
+sort_by_len: true
+spec_max:
+- -0.79453
+- -0.81116
+- -0.61631
+- -0.30679
+- -0.13863
+- -0.050652
+- -0.11563
+- -0.10679
+- -0.091068
+- -0.062174
+- -0.075302
+- -0.072217
+- -0.063815
+- -0.073299
+- 0.007361
+- -0.072508
+- -0.050234
+- -0.16534
+- -0.26928
+- -0.20782
+- -0.20823
+- -0.11702
+- -0.070128
+- -0.065868
+- -0.012675
+- 0.0015121
+- -0.089902
+- -0.21392
+- -0.23789
+- -0.28922
+- -0.30405
+- -0.23029
+- -0.22088
+- -0.21542
+- -0.29367
+- -0.30137
+- -0.38281
+- -0.4359
+- -0.28681
+- -0.46855
+- -0.57485
+- -0.47022
+- -0.54266
+- -0.44848
+- -0.6412
+- -0.687
+- -0.6486
+- -0.76436
+- -0.49971
+- -0.71068
+- -0.69724
+- -0.61487
+- -0.55843
+- -0.69773
+- -0.57502
+- -0.70919
+- -0.82431
+- -0.84213
+- -0.90431
+- -0.8284
+- -0.77945
+- -0.82758
+- -0.87699
+- -1.0532
+- -1.0766
+- -1.1198
+- -1.0185
+- -0.98983
+- -1.0001
+- -1.0756
+- -1.0024
+- -1.0304
+- -1.0579
+- -1.0188
+- -1.05
+- -1.0842
+- -1.0923
+- -1.1223
+- -1.2381
+- -1.6467
+spec_min:
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+spk_cond_steps: []
+stop_token_weight: 5.0
+task_cls: usr.diffsinger_task.DiffSingerMIDITask
+test_ids: []
+test_input_dir: ''
+test_num: 0
+test_prefixes:
+- "popcs-\u8BF4\u6563\u5C31\u6563"
+- "popcs-\u9690\u5F62\u7684\u7FC5\u8180"
+test_set_name: test
+timesteps: 100
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_midi: true
+use_nsf: true
+use_pitch_embed: false
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 0
+valid_set_name: valid
+vocoder: vocoders.hifigan.HifiGAN
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+warmup_updates: 2000
+wav2spec_eps: 1e-6
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0228_opencpop_ds100_rel
diff --git a/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt b/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..07b944d43e3bd61ebd8272c09db0011425b4af08
--- /dev/null
+++ b/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c
+size 170226367
diff --git a/checkpoints/0831_opencpop_ds1000/config.yaml b/checkpoints/0831_opencpop_ds1000/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cc2be3b17c1cab8a96f033a6370e6dbfbca1b66d
--- /dev/null
+++ b/checkpoints/0831_opencpop_ds1000/config.yaml
@@ -0,0 +1,346 @@
+K_step: 1000
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- usr/configs/popcs_ds_beta6.yaml
+- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+binarization_args:
+ shuffle: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: true
+ with_spk_embed: false
+ with_txt: true
+ with_wav: true
+binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
+binary_data_dir: data/binary/opencpop-midi-dp
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+content_cond_steps: []
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+datasets:
+- opencpop
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decay_steps: 50000
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 4
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 5
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+fs2_ckpt: ''
+gaussian_start: true
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+hidden_size: 256
+hop_size: 128
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 0.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 0.001
+max_beta: 0.02
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 8000
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 36000
+max_updates: 320000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6.0
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pe_ckpt: checkpoints/0102_xiaoma_pe
+pe_enable: true
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ forced_align: mfa
+ txt_processor: zh_g2pM
+ use_sox: true
+ use_tone: false
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 5
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: xxx
+profile_infer: false
+raw_data_dir: data/raw/opencpop/segments
+ref_norm_layer: bn
+rel_pos: true
+reset_phone_dict: true
+residual_channels: 256
+residual_layers: 20
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: true
+save_gt: false
+schedule_type: linear
+seed: 1234
+sort_by_len: true
+spec_max:
+- -0.79453
+- -0.81116
+- -0.61631
+- -0.30679
+- -0.13863
+- -0.050652
+- -0.11563
+- -0.10679
+- -0.091068
+- -0.062174
+- -0.075302
+- -0.072217
+- -0.063815
+- -0.073299
+- 0.007361
+- -0.072508
+- -0.050234
+- -0.16534
+- -0.26928
+- -0.20782
+- -0.20823
+- -0.11702
+- -0.070128
+- -0.065868
+- -0.012675
+- 0.0015121
+- -0.089902
+- -0.21392
+- -0.23789
+- -0.28922
+- -0.30405
+- -0.23029
+- -0.22088
+- -0.21542
+- -0.29367
+- -0.30137
+- -0.38281
+- -0.4359
+- -0.28681
+- -0.46855
+- -0.57485
+- -0.47022
+- -0.54266
+- -0.44848
+- -0.6412
+- -0.687
+- -0.6486
+- -0.76436
+- -0.49971
+- -0.71068
+- -0.69724
+- -0.61487
+- -0.55843
+- -0.69773
+- -0.57502
+- -0.70919
+- -0.82431
+- -0.84213
+- -0.90431
+- -0.8284
+- -0.77945
+- -0.82758
+- -0.87699
+- -1.0532
+- -1.0766
+- -1.1198
+- -1.0185
+- -0.98983
+- -1.0001
+- -1.0756
+- -1.0024
+- -1.0304
+- -1.0579
+- -1.0188
+- -1.05
+- -1.0842
+- -1.0923
+- -1.1223
+- -1.2381
+- -1.6467
+spec_min:
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+spk_cond_steps: []
+stop_token_weight: 5.0
+task_cls: usr.diffsinger_task.DiffSingerMIDITask
+test_ids: []
+test_input_dir: ''
+test_num: 0
+test_prefixes:
+- '2044'
+- '2086'
+- '2092'
+- '2093'
+- '2100'
+test_set_name: test
+timesteps: 1000
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_midi: true
+use_nsf: true
+use_pitch_embed: false
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 0
+valid_set_name: valid
+vocoder: vocoders.hifigan.HifiGAN
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+warmup_updates: 2000
+wav2spec_eps: 1e-6
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0831_opencpop_ds1000
+pndm_speedup: 10
diff --git a/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt b/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..f36846cd61ffca537611feea3166011f480a443a
--- /dev/null
+++ b/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e
+size 170269591
diff --git a/checkpoints/Emotion_encoder.pt b/checkpoints/Emotion_encoder.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ac214aba4b7a248c6742782392529b8442855805
--- /dev/null
+++ b/checkpoints/Emotion_encoder.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9de4930cbd8e5ba51efdef84c326e3728a5482dd7668f82960e4cb0f97cc8e5
+size 17095350
diff --git a/checkpoints/GenerSpeech/config.yaml b/checkpoints/GenerSpeech/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ed493feb76965929cd333ffafcb95f2d47cfc0e6
--- /dev/null
+++ b/checkpoints/GenerSpeech/config.yaml
@@ -0,0 +1,249 @@
+accumulate_grad_batches: 1
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 16000
+base_config:
+- egs/egs_bases/tts/fs2_adv.yaml
+- egs/datasets/audio/emotion/base_text2mel.yaml
+binarization_args:
+ reset_phone_dict: true
+ reset_word_dict: true
+ shuffle: true
+ trim_eos_bos: false
+ trim_sil: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: false
+ with_linear: false
+ with_spk_embed: true
+ with_spk_id: true
+ with_txt: true
+ with_wav: true
+ with_word: true
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+binary_data_dir: data/binary/training_set
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+conv_use_pos: false
+crop: false
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_dilations:
+- 1
+- 1
+- 1
+- 1
+dec_ffn_kernel_size: 9
+dec_inp_add_noise: false
+dec_kernel_size: 5
+dec_layers: 4
+dec_num_heads: 2
+decoder_rnn_dim: 0
+decoder_type: fft
+dict_dir: ''
+disc_hidden_size: 128
+disc_interval: 1
+disc_lr: 0.0001
+disc_norm: in
+disc_reduction: stack
+disc_start_steps: 0
+disc_win_num: 3
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+ eps: 1.0e-06
+ weight_decay: 0.0
+discriminator_scheduler_params:
+ gamma: 0.5
+ step_size: 60000
+dropout: 0.05
+ds_workers: 2
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path
+enc_dec_norm: ln
+enc_dilations:
+- 1
+- 1
+- 1
+- 1
+enc_ffn_kernel_size: 9
+enc_kernel_size: 5
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_hidden_size: 1024
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+forcing: 20000
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 5.0
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_mel_adv: 0.1
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+layers_in_block: 2
+load_ckpt: ''
+loud_norm: false
+lr: 1.0
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_sentences: 100000
+max_tokens: 30000
+max_updates: 300000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_disc_hidden_size: 128
+mel_gan: true
+mel_hidden_size: 256
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 128
+min_level_db: -100
+nVQ: 128
+noise_scale: 0.8
+num_ckpt_keep: 2
+num_heads: 2
+num_sanity_val_steps: -1
+num_spk: 500
+num_test_samples: 72
+num_valid_plots: 10
+optimizer_adam_beta1: 0.5
+optimizer_adam_beta2: 0.999
+out_wav_norm: false
+pitch_ar: false
+pitch_embed_type: 0
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: standard
+pitch_ssim_win: 11
+pitch_type: frame
+post_glow_hidden: 128
+post_glow_kernel_size: 3
+post_glow_n_block_layers: 3
+post_glow_n_blocks: 8
+post_share_cond_layers: false
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ sox_resample: false
+ sox_to_wav: false
+ trim_sil: false
+ txt_processor: en
+ use_tone: true
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+predictor_dropout: 0.5
+predictor_grad: 1.0
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+preprocess_args:
+ add_eos_bos: true
+ mfa_group_shuffle: false
+ mfa_offset: 0.02
+ nsample_per_mfa_group: 1000
+ reset_phone_dict: true
+ reset_word_dict: true
+ save_sil_mask: true
+ txt_processor: en
+ use_mfa: true
+ vad_max_silence_length: 12
+ wav_processors: []
+ with_phsep: true
+preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
+pretrain_fs_ckpt: ''
+print_nan_grads: false
+processed_data_dir: data/processed/emotion
+profile_infer: false
+raw_data_dir: data/raw/ESD
+ref_audio: ''
+ref_hidden_stride_kernel:
+- 0,3,5
+- 0,3,5
+- 0,2,5
+- 0,2,5
+- 0,2,5
+ref_level_db: 20
+ref_norm_layer: bn
+rename_tmux: true
+rerun_gen: false
+resume_from_checkpoint: 0
+save_best: false
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+share_wn_layers: 4
+sigmoid_scale: false
+sil_add_noise: false
+sort_by_len: true
+task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 200
+test_set_name: test
+text: ''
+train_set_name: train
+train_sets: ''
+use_cond_disc: false
+use_emotion: true
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_latent_cond: true
+use_pitch_embed: true
+use_pos_embed: true
+use_ref_enc: false
+use_spk_embed: true
+use_spk_id: false
+use_split_spk_id: false
+use_txt_cond: true
+use_uv: true
+use_var_enc: false
+use_word: true
+vae_dropout: 0.0
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+var_enc_vq_codes: 64
+vocoder: hifigan
+vocoder_ckpt: checkpoints/trainset_hifigan
+vocoder_denoise_c: 0.0
+vq_start: 20500
+warmup_updates: 2000
+weight_decay: 0
+win_size: 1024
+word_size: 30000
+work_dir: checkpoints/GenerSpeech
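
Editor's note: keys such as task_cls, binarizer_cls, and pre_align_cls in the config above are dotted import paths that the training framework resolves to Python classes at runtime. A minimal, generic sketch of reading the config and resolving one of those strings, assuming only PyYAML and the standard library (the project ships its own hparams utilities, which this does not reproduce):

```python
import importlib
import yaml

def load_config(path: str) -> dict:
    """Read a checkpoint config such as checkpoints/GenerSpeech/config.yaml."""
    with open(path) as f:
        return yaml.safe_load(f)

def resolve_class(dotted_path: str):
    """Turn a dotted string like 'modules.GenerSpeech.task.generspeech.GenerSpeechTask'
    into the class object it names."""
    module_name, cls_name = dotted_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), cls_name)

# Example (paths/classes as listed in the config above):
# cfg = load_config("checkpoints/GenerSpeech/config.yaml")
# task_cls = resolve_class(cfg["task_cls"])
```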
diff --git a/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt b/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..def291d926fe008dc220e775ee525cdfe501d7c8
--- /dev/null
+++ b/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b872bb686013cee2a98cc610b8b66b788c46ff4c33130682b63af4ac005405ea
+size 619582860
diff --git a/checkpoints/trainset_hifigan/config.yaml b/checkpoints/trainset_hifigan/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..df5c3117c000dd2d20637f52dc11b87b653142e2
--- /dev/null
+++ b/checkpoints/trainset_hifigan/config.yaml
@@ -0,0 +1,178 @@
+accumulate_grad_batches: 1
+adam_b1: 0.8
+adam_b2: 0.99
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 16000
+aux_context_window: 0
+base_config:
+- egs/egs_bases/tts/vocoder/hifigan.yaml
+- egs/datasets/audio/emotion/base_text2mel.yaml
+binarization_args:
+ reset_phone_dict: true
+ reset_word_dict: true
+ shuffle: true
+ trim_eos_bos: false
+ trim_sil: false
+ with_align: false
+ with_f0: true
+ with_f0cwt: false
+ with_linear: false
+ with_spk_embed: false
+ with_spk_id: true
+ with_txt: false
+ with_wav: true
+ with_word: false
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+binary_data_dir: data/binary/training_set
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+ lr: 0.0002
+discriminator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+dropout: 0.1
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+generator_optimizer_params:
+ lr: 0.0002
+generator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+lambda_adv: 1.0
+lambda_cdisc: 4.0
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+load_ckpt: ''
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_samples: 8192
+max_sentences: 24
+max_tokens: 30000
+max_updates: 1000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 128
+min_level_db: -100
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: -1
+num_spk: 10
+num_test_samples: 30
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ sox_resample: false
+ sox_to_wav: false
+ trim_sil: false
+ txt_processor: en
+ use_tone: true
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+print_nan_grads: false
+processed_data_dir: data/processed/emotion,data/processed/LibriTTS
+profile_infer: false
+raw_data_dir: data/raw/ESD
+ref_level_db: 20
+rename_tmux: true
+resblock: '1'
+resblock_dilation_sizes:
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 200
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_initial_channel: 512
+upsample_kernel_sizes:
+- 16
+- 16
+- 4
+- 4
+upsample_rates:
+- 8
+- 8
+- 2
+- 2
+use_cdisc: false
+use_cond_disc: false
+use_emotion: true
+use_fm_loss: false
+use_ms_stft: false
+use_pitch_embed: false
+use_spec_disc: false
+use_spk_embed: false
+use_spk_id: true
+use_split_spk_id: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 1024
+window: hann
+word_size: 30000
+work_dir: checkpoints/trainset_hifigan
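
Editor's note: for a HiFi-GAN vocoder config like the one above, the generator's total upsampling factor (the product of upsample_rates, here 8 * 8 * 2 * 2 = 256) has to equal hop_size so that one mel frame expands to exactly hop_size waveform samples, and upsample_kernel_sizes needs one entry per rate. A small consistency check under those assumptions (the function name is illustrative):

```python
import math
import yaml

def check_hifigan_config(path: str) -> None:
    """Sanity-check the vocoder geometry declared in a config.yaml."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    upsample_total = math.prod(cfg["upsample_rates"])        # 8 * 8 * 2 * 2 = 256
    assert upsample_total == cfg["hop_size"], (upsample_total, cfg["hop_size"])
    assert len(cfg["upsample_rates"]) == len(cfg["upsample_kernel_sizes"])

# check_hifigan_config("checkpoints/trainset_hifigan/config.yaml")
```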
diff --git a/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt b/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..9c71c2b0d75bd2867111cf7401bf8c7e0b77b03c
--- /dev/null
+++ b/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a2577919899400a111ef42a2aba65797d282c259d083d2c276539dda9d17870
+size 1016199247
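
Editor's note: once the LFS blobs have been fetched, these .ckpt files are ordinary PyTorch checkpoints. A hedged sketch of inspecting one on CPU; it makes no assumption about the key layout the training code saved:

```python
import torch

# Assumes `git lfs pull` has replaced the 3-line pointer with the real ~1 GB blob.
ckpt = torch.load(
    "checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt",
    map_location="cpu",
)
print(type(ckpt), list(ckpt)[:10] if isinstance(ckpt, dict) else "non-dict checkpoint")
```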