diff --git a/audio_detection/__init__.py b/audio_detection/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_detection/audio_infer/__init__.py b/audio_detection/audio_infer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc b/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0be763e4f05595b0b8fc1819a5ce5d665e6a7e6d
Binary files /dev/null and b/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv
new file mode 100644
index 0000000000000000000000000000000000000000..48d8522774b0127d4b585c18fb7da54a9fcbc248
--- /dev/null
+++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv
@@ -0,0 +1,1350 @@
+-JMT0mK0Dbg_30.000_40.000.wav 30.000 40.000 Train horn
+3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train horn
+3S2-TODd__k_90.000_100.000.wav 90.000 100.000 Train horn
+3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train horn
+3jXAh3V2FO8_30.000_40.000.wav 30.000 40.000 Train horn
+53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train horn
+8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train horn
+8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train horn
+9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train horn
+AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train horn
+Ag_zT74ZGNc_9.000_19.000.wav 9.000 19.000 Train horn
+BQpa8whzwAE_30.000_40.000.wav 30.000 40.000 Train horn
+CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train horn
+CLIdVCUO_Vw_30.000_40.000.wav 30.000 40.000 Train horn
+D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train horn
+GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train horn
+I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train horn
+IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train horn
+L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train horn
+LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train horn
+MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train horn
+MPSf7dJpV5w_30.000_40.000.wav 30.000 40.000 Train horn
+NdCr5IDnkxc_30.000_40.000.wav 30.000 40.000 Train horn
+P54KKbTA_TE_0.000_7.000.wav 0.000 7.000 Train horn
+PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train horn
+QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train horn
+R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train horn
+Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train horn
+TBjrN1aMRrM_30.000_40.000.wav 30.000 40.000 Train horn
+XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train horn
+XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train horn
+Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train horn
+Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train horn
+YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Train horn
+ZcTI8fQgEZE_240.000_250.000.wav 240.000 250.000 Train horn
+_8MvhMlbwiE_40.000_50.000.wav 40.000 50.000 Train horn
+_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train horn
+aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train horn
+arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train horn
+d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train horn
+dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train horn
+ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Train horn
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train horn
+g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train horn
+gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train horn
+hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train horn
+iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train horn
+k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train horn
+lKQ-I_P7TEM_20.000_30.000.wav 20.000 30.000 Train horn
+nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train horn
+pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train horn
+pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train horn
+q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train horn
+qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train horn
+stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train horn
+tdRMxc4UWRk_30.000_40.000.wav 30.000 40.000 Train horn
+tu-cxDG2mW8_0.000_10.000.wav 0.000 10.000 Train horn
+txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train horn
+xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train horn
+yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train horn
+-WoudI3gGvk_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+0_gci63CtFY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+3NX4HaOVBoo_240.000_250.000.wav 240.000 250.000 Air horn, truck horn
+9NPKQDaNCRk_0.000_6.000.wav 0.000 6.000 Air horn, truck horn
+9ct4w4aYWdc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+9l9QXgsJSfo_120.000_130.000.wav 120.000 130.000 Air horn, truck horn
+CN0Bi4MDpA4_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+Cg-DWc9nPfQ_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
+D62L3husEa0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+Hk7HqLBHWng_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+IpyingiCwV8_0.000_3.000.wav 0.000 3.000 Air horn, truck horn
+Isuh9pOuH6I_300.000_310.000.wav 300.000 310.000 Air horn, truck horn
+IuTfMfzkr5Y_120.000_130.000.wav 120.000 130.000 Air horn, truck horn
+MFxsgcZZtFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+N3osL4QmOL8_49.000_59.000.wav 49.000 59.000 Air horn, truck horn
+NOZsDTFLm7M_0.000_9.000.wav 0.000 9.000 Air horn, truck horn
+OjVY3oM1jEU_40.000_50.000.wav 40.000 50.000 Air horn, truck horn
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
+TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+UdHR1P_NIbo_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
+YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
+Yt4ZWNjvJOY_50.000_60.000.wav 50.000 60.000 Air horn, truck horn
+Z5M3fGT3Xjk_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
+ZauRsP1uH74_12.000_22.000.wav 12.000 22.000 Air horn, truck horn
+a_6CZ2JaEuc_0.000_2.000.wav 0.000 2.000 Air horn, truck horn
+b7m5Kt5U7Vc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+bIObkrK06rk_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
+cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
+ckSYn557ZyE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+cs-RPPsg_ks_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+ctsq33oUBT8_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+eCFUwyU9ZWA_9.000_19.000.wav 9.000 19.000 Air horn, truck horn
+ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Air horn, truck horn
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+gjlo4evwjlE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Air horn, truck horn
+ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+jko48cNdvFA_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+kUrb38hMwPs_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+km_hVyma2vo_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+m1e9aOwRiDQ_0.000_9.000.wav 0.000 9.000 Air horn, truck horn
+mQJcObz1k_E_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+pk75WDyNZKc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
+suuYwAifIAQ_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+wDdEZ46B-tM_460.000_470.000.wav 460.000 470.000 Air horn, truck horn
+wHISHmuP58s_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
+xwqIKDz1bT4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+y4Ko6VNiqB0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+yhcmPrU3QSk_61.000_71.000.wav 61.000 71.000 Air horn, truck horn
+3FWHjjZGT9U_80.000_90.000.wav 80.000 90.000 Car alarm
+3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car alarm
+3YRkin3bMlQ_170.000_180.000.wav 170.000 180.000 Car alarm
+4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Car alarm
+4JDah6Ckr9k_5.000_15.000.wav 5.000 15.000 Car alarm
+5hL1uGb4sas_30.000_40.000.wav 30.000 40.000 Car alarm
+969Zfj4IoPk_20.000_30.000.wav 20.000 30.000 Car alarm
+AyfuBDN3Vdw_40.000_50.000.wav 40.000 50.000 Car alarm
+B-ZqhRg3km4_60.000_70.000.wav 60.000 70.000 Car alarm
+BDnwA3AaclE_10.000_20.000.wav 10.000 20.000 Car alarm
+ES-rjFfuxq4_120.000_130.000.wav 120.000 130.000 Car alarm
+EWbZq5ruCpg_0.000_10.000.wav 0.000 10.000 Car alarm
+F50h9HiyC3k_40.000_50.000.wav 40.000 50.000 Car alarm
+F5AP8kQvogM_30.000_40.000.wav 30.000 40.000 Car alarm
+FKJuDOAumSk_20.000_30.000.wav 20.000 30.000 Car alarm
+GmbNjZi4xBw_30.000_40.000.wav 30.000 40.000 Car alarm
+H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Car alarm
+Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car alarm
+IziTYkSwq9Q_30.000_40.000.wav 30.000 40.000 Car alarm
+JcO2TTtiplA_30.000_40.000.wav 30.000 40.000 Car alarm
+KKx7dWRg8s8_8.000_18.000.wav 8.000 18.000 Car alarm
+Kf9Kr69mwOA_14.000_24.000.wav 14.000 24.000 Car alarm
+L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car alarm
+LOjT44tFx1A_0.000_10.000.wav 0.000 10.000 Car alarm
+Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car alarm
+Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Car alarm
+QNKo1W1WRbc_22.000_32.000.wav 22.000 32.000 Car alarm
+R0VxYDfjyAU_60.000_70.000.wav 60.000 70.000 Car alarm
+TJ58vMpSy1w_30.000_40.000.wav 30.000 40.000 Car alarm
+ToU1kRagUjY_0.000_10.000.wav 0.000 10.000 Car alarm
+TrQGIZqrW0s_30.000_40.000.wav 30.000 40.000 Car alarm
+ULFhHR0OLSE_30.000_40.000.wav 30.000 40.000 Car alarm
+ULS3ffQkCW4_30.000_40.000.wav 30.000 40.000 Car alarm
+U_9NuNORYQM_1.000_11.000.wav 1.000 11.000 Car alarm
+UkCEuwYUW8c_110.000_120.000.wav 110.000 120.000 Car alarm
+Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Car alarm
+XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car alarm
+Y-4dtrP-RNo_7.000_17.000.wav 7.000 17.000 Car alarm
+Zltlj0fDeS4_30.000_40.000.wav 30.000 40.000 Car alarm
+cB1jkzgH2es_150.000_160.000.wav 150.000 160.000 Car alarm
+eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Car alarm
+eL7s5CoW0UA_0.000_7.000.wav 0.000 7.000 Car alarm
+i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Car alarm
+iWl-5LNURFc_30.000_40.000.wav 30.000 40.000 Car alarm
+iX34nDCq9NU_10.000_20.000.wav 10.000 20.000 Car alarm
+ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car alarm
+l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car alarm
+lhedRVb85Fk_30.000_40.000.wav 30.000 40.000 Car alarm
+monelE7hnwI_20.000_30.000.wav 20.000 30.000 Car alarm
+o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car alarm
+pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car alarm
+stnVta2ip9g_30.000_40.000.wav 30.000 40.000 Car alarm
+uvuVg9Cl0n0_30.000_40.000.wav 30.000 40.000 Car alarm
+vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car alarm
+vN7dJyt-nj0_20.000_30.000.wav 20.000 30.000 Car alarm
+w8Md65mE5Vc_30.000_40.000.wav 30.000 40.000 Car alarm
+ySqfMcFk5LM_30.000_40.000.wav 30.000 40.000 Car alarm
+ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Car alarm
+za8KPcQ0dTw_30.000_40.000.wav 30.000 40.000 Car alarm
+-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-oSzD8P2BtU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-pzwalZ0ub0_5.000_15.000.wav 5.000 15.000 Reversing beeps
+-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
+10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+1n_s2Gb5R1Q_30.000_40.000.wav 30.000 40.000 Reversing beeps
+2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+2WTk_j_fivY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+38F6eeIR-s0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4MIHbR4QZhE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4XMY2IvVSf0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4t1VqRz4w2g_30.000_40.000.wav 30.000 40.000 Reversing beeps
+4tKvAMmAUMM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+5-x2pk3YYAs_11.000_21.000.wav 11.000 21.000 Reversing beeps
+5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Reversing beeps
+5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Reversing beeps
+5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Reversing beeps
+7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Reversing beeps
+7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+96a4smrM_30_30.000_40.000.wav 30.000 40.000 Reversing beeps
+9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+9OcAwC8y-eQ_30.000_40.000.wav 30.000 40.000 Reversing beeps
+9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Reversing beeps
+9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Reversing beeps
+A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+AFwmMFq_xlc_390.000_400.000.wav 390.000 400.000 Reversing beeps
+AvhBRiwWJU4_30.000_40.000.wav 30.000 40.000 Reversing beeps
+CL5vkiMs2c0_10.000_20.000.wav 10.000 20.000 Reversing beeps
+DcU6AzN7imA_210.000_220.000.wav 210.000 220.000 Reversing beeps
+ISBJKY8hwnM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Reversing beeps
+NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Reversing beeps
+PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Reversing beeps
+Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Reversing beeps
+_gG0KNGD47M_30.000_40.000.wav 30.000 40.000 Reversing beeps
+ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Reversing beeps
+kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Reversing beeps
+mCJ0aqIygWE_24.000_34.000.wav 24.000 34.000 Reversing beeps
+nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Reversing beeps
+nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Reversing beeps
+oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Reversing beeps
+saPU2JNoytU_0.000_10.000.wav 0.000 10.000 Reversing beeps
+tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Reversing beeps
+vzP6soELj2Q_0.000_10.000.wav 0.000 10.000 Reversing beeps
+0x82_HySIVU_30.000_40.000.wav 30.000 40.000 Bicycle
+1IQdvfm9SDY_30.000_40.000.wav 30.000 40.000 Bicycle
+1_hGvbEiYAs_30.000_40.000.wav 30.000 40.000 Bicycle
+26CM8IXODG4_2.000_12.000.wav 2.000 12.000 Bicycle
+2f7Ad-XpbnY_30.000_40.000.wav 30.000 40.000 Bicycle
+3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Bicycle
+7KiTXYwaD04_7.000_17.000.wav 7.000 17.000 Bicycle
+7gkjn-LLInI_30.000_40.000.wav 30.000 40.000 Bicycle
+84flVacRHUI_21.000_31.000.wav 21.000 31.000 Bicycle
+9VziOIkNXsE_30.000_40.000.wav 30.000 40.000 Bicycle
+ANofTuuN0W0_160.000_170.000.wav 160.000 170.000 Bicycle
+B6n0op0sLPA_30.000_40.000.wav 30.000 40.000 Bicycle
+D4_zTwsCRds_60.000_70.000.wav 60.000 70.000 Bicycle
+DEs_Sp9S1Nw_30.000_40.000.wav 30.000 40.000 Bicycle
+GjsxrMRRdfQ_3.000_13.000.wav 3.000 13.000 Bicycle
+GkpUU3VX4wQ_30.000_40.000.wav 30.000 40.000 Bicycle
+H9HNXYxRmv8_30.000_40.000.wav 30.000 40.000 Bicycle
+HPWRKwrs-rY_370.000_380.000.wav 370.000 380.000 Bicycle
+HrQxbNO5jXU_6.000_16.000.wav 6.000 16.000 Bicycle
+IYaEZkAO0LU_30.000_40.000.wav 30.000 40.000 Bicycle
+Idzfy0XbZRo_7.000_17.000.wav 7.000 17.000 Bicycle
+Iigfz_GeXVs_30.000_40.000.wav 30.000 40.000 Bicycle
+JWCtQ_94YoQ_30.000_40.000.wav 30.000 40.000 Bicycle
+JXmBrD4b4EI_30.000_40.000.wav 30.000 40.000 Bicycle
+LSZPNwZex9s_30.000_40.000.wav 30.000 40.000 Bicycle
+M5kwg1kx4q0_30.000_40.000.wav 30.000 40.000 Bicycle
+NrR1wmCpqAk_12.000_22.000.wav 12.000 22.000 Bicycle
+O1_Rw2dHb1I_2.000_12.000.wav 2.000 12.000 Bicycle
+OEN0TySl1Jw_10.000_20.000.wav 10.000 20.000 Bicycle
+PF7uY9ydMYc_30.000_40.000.wav 30.000 40.000 Bicycle
+SDl0tWf9Q44_30.000_40.000.wav 30.000 40.000 Bicycle
+SkXXjcw9sJI_30.000_40.000.wav 30.000 40.000 Bicycle
+Ssa1m5Mnllw_0.000_9.000.wav 0.000 9.000 Bicycle
+UB-A1oyNyyg_0.000_6.000.wav 0.000 6.000 Bicycle
+UqyvFyQthHo_30.000_40.000.wav 30.000 40.000 Bicycle
+Wg4ik5zZxBc_250.000_260.000.wav 250.000 260.000 Bicycle
+WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bicycle
+YIJBuXUi64U_30.000_40.000.wav 30.000 40.000 Bicycle
+aBHdl_TiseI_30.000_40.000.wav 30.000 40.000 Bicycle
+aeHCq6fFkNo_30.000_40.000.wav 30.000 40.000 Bicycle
+amKDjVcs1Vg_30.000_40.000.wav 30.000 40.000 Bicycle
+ehYwty_G2L4_13.000_23.000.wav 13.000 23.000 Bicycle
+jOlVJv7jAHg_30.000_40.000.wav 30.000 40.000 Bicycle
+lGFDQ-ZwUfk_30.000_40.000.wav 30.000 40.000 Bicycle
+lmTHvLGQy3g_50.000_60.000.wav 50.000 60.000 Bicycle
+nNHW3Uxlb-g_30.000_40.000.wav 30.000 40.000 Bicycle
+o98R4ruf8kw_30.000_40.000.wav 30.000 40.000 Bicycle
+oiLHBkHgkAo_0.000_8.000.wav 0.000 8.000 Bicycle
+qL0ESQcaPhM_30.000_40.000.wav 30.000 40.000 Bicycle
+qjz5t9M4YCw_30.000_40.000.wav 30.000 40.000 Bicycle
+qrCWPsqG9vA_30.000_40.000.wav 30.000 40.000 Bicycle
+r06tmeUDgc8_3.000_13.000.wav 3.000 13.000 Bicycle
+sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Bicycle
+tKdRlWz-1pg_30.000_40.000.wav 30.000 40.000 Bicycle
+uNpSMpqlkMA_0.000_10.000.wav 0.000 10.000 Bicycle
+vOYj9W7Jsxk_8.000_18.000.wav 8.000 18.000 Bicycle
+xBKrmKdjAIA_0.000_10.000.wav 0.000 10.000 Bicycle
+xfNeZaw4o3U_17.000_27.000.wav 17.000 27.000 Bicycle
+xgiJqbhhU3c_30.000_40.000.wav 30.000 40.000 Bicycle
+0vg9qxNKXOw_30.000_40.000.wav 30.000 40.000 Skateboard
+10YXuv9Go0E_140.000_150.000.wav 140.000 150.000 Skateboard
+3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Skateboard
+6kXUG1Zo6VA_0.000_10.000.wav 0.000 10.000 Skateboard
+84fDGWoRtsU_210.000_220.000.wav 210.000 220.000 Skateboard
+8kbHA22EWd0_330.000_340.000.wav 330.000 340.000 Skateboard
+8m-a_6wLTkU_230.000_240.000.wav 230.000 240.000 Skateboard
+9QwaP-cvdeU_360.000_370.000.wav 360.000 370.000 Skateboard
+9ZYj5toEbGA_0.000_10.000.wav 0.000 10.000 Skateboard
+9gkppwB5CXA_30.000_40.000.wav 30.000 40.000 Skateboard
+9hlXgXWXYXQ_0.000_6.000.wav 0.000 6.000 Skateboard
+ALxn5-2bVyI_30.000_40.000.wav 30.000 40.000 Skateboard
+ANPjV_rudog_30.000_40.000.wav 30.000 40.000 Skateboard
+ATAL-_Dblvg_0.000_7.000.wav 0.000 7.000 Skateboard
+An-4jPvUT14_60.000_70.000.wav 60.000 70.000 Skateboard
+BGR0QnX4k6w_30.000_40.000.wav 30.000 40.000 Skateboard
+BlhUt8AJJO8_30.000_40.000.wav 30.000 40.000 Skateboard
+CD7INyI79fM_170.000_180.000.wav 170.000 180.000 Skateboard
+CNcxzB9F-Q8_100.000_110.000.wav 100.000 110.000 Skateboard
+DqOGYyFVnKk_200.000_210.000.wav 200.000 210.000 Skateboard
+E0gBwPTHxqE_30.000_40.000.wav 30.000 40.000 Skateboard
+E3XIdP8kxwg_110.000_120.000.wav 110.000 120.000 Skateboard
+FQZnQhiM41U_0.000_6.000.wav 0.000 6.000 Skateboard
+FRwFfq3Tl1g_310.000_320.000.wav 310.000 320.000 Skateboard
+JJo971B_eDg_30.000_40.000.wav 30.000 40.000 Skateboard
+KXkxqxoCylc_30.000_40.000.wav 30.000 40.000 Skateboard
+L4Z7XkS6CtA_30.000_40.000.wav 30.000 40.000 Skateboard
+LjEqr0Z7xm0_0.000_6.000.wav 0.000 6.000 Skateboard
+MAbDEeLF4cQ_30.000_40.000.wav 30.000 40.000 Skateboard
+MUBbiivNYZs_30.000_40.000.wav 30.000 40.000 Skateboard
+Nq8GyBrTI8Y_30.000_40.000.wav 30.000 40.000 Skateboard
+PPq9QZmV7jc_25.000_35.000.wav 25.000 35.000 Skateboard
+PVgL5wFOKMs_30.000_40.000.wav 30.000 40.000 Skateboard
+Tcq_xAdCMr4_30.000_40.000.wav 30.000 40.000 Skateboard
+UtZofZjccBs_290.000_300.000.wav 290.000 300.000 Skateboard
+VZfrDZhI7BU_30.000_40.000.wav 30.000 40.000 Skateboard
+WxChkRrVOIs_0.000_7.000.wav 0.000 7.000 Skateboard
+YV0noe1sZAs_150.000_160.000.wav 150.000 160.000 Skateboard
+YjScrri_F7U_0.000_10.000.wav 0.000 10.000 Skateboard
+YrGQKTbiG1g_30.000_40.000.wav 30.000 40.000 Skateboard
+ZM67kt6G-d4_30.000_40.000.wav 30.000 40.000 Skateboard
+ZaUaqnLdg6k_30.000_40.000.wav 30.000 40.000 Skateboard
+ZhpkRcAEJzc_3.000_13.000.wav 3.000 13.000 Skateboard
+_43OOP6UEw0_30.000_40.000.wav 30.000 40.000 Skateboard
+_6Fyave4jqA_260.000_270.000.wav 260.000 270.000 Skateboard
+aOoZ0bCoaZw_30.000_40.000.wav 30.000 40.000 Skateboard
+gV6y9L24wWg_0.000_10.000.wav 0.000 10.000 Skateboard
+hHb0Eq1I7Fk_0.000_10.000.wav 0.000 10.000 Skateboard
+lGf_L6i6AZI_20.000_30.000.wav 20.000 30.000 Skateboard
+leOH87itNWM_30.000_40.000.wav 30.000 40.000 Skateboard
+mIkW7mWlnXw_30.000_40.000.wav 30.000 40.000 Skateboard
+qadmKrM0ppo_20.000_30.000.wav 20.000 30.000 Skateboard
+rLUIHCc4b9A_0.000_7.000.wav 0.000 7.000 Skateboard
+u3vBJgEVJvk_0.000_10.000.wav 0.000 10.000 Skateboard
+vHKBrtPDSvA_150.000_160.000.wav 150.000 160.000 Skateboard
+wWmydRt0Z-w_21.000_31.000.wav 21.000 31.000 Skateboard
+xeHt-R5ScmI_0.000_10.000.wav 0.000 10.000 Skateboard
+xqGtIVeeXY4_330.000_340.000.wav 330.000 340.000 Skateboard
+y_lfY0uzmr0_30.000_40.000.wav 30.000 40.000 Skateboard
+02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+7eeN-fXbso8_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+8qMHvgA9mGw_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+BGp9-Ro5h8Y_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+F9Dbcxr-lAI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+R8G5Y0HASxY_60.000_70.000.wav 60.000 70.000 Ambulance (siren)
+RVTKY5KR3ME_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+Sm0pPvXPA9U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+ZxlbI2Rj1VY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+bA8mt0JI0Ko_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+cR79KnWpiQA_70.000_80.000.wav 70.000 80.000 Ambulance (siren)
+dPcw4R5lczw_500.000_510.000.wav 500.000 510.000 Ambulance (siren)
+epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Ambulance (siren)
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+iSnWMz4FUAg_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ke35yF1LHs4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+oPR7tUEUptk_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+s0iddDFzL9s_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+tcKlq7_cOkw_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
+u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+zbiJEml563w_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Fire engine, fire truck (siren)
+-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+0K1mroXg8bs_9.000_19.000.wav 9.000 19.000 Fire engine, fire truck (siren)
+0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Fire engine, fire truck (siren)
+3h3_IZWhX0g_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
+5fjy_2ajEkg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Fire engine, fire truck (siren)
+ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Fire engine, fire truck (siren)
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Bs2KqqI9F_k_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+D4M3YT75ZrQ_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
+DWXQ_cSUW98_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+DpagxUQwXDo_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+FFSI6Bg2M-Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+GbIuxmaiCOk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Fire engine, fire truck (siren)
+VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+Xggsbzzes3M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
+ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+k2a30--j37Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+pvYwIdGrS90_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+wD0P-doqkXo_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
+xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
+z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+zpzJKMG5iGc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0CJFt950vOk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0phl6nlC-n0_10.000_20.000.wav 10.000 20.000 Civil defense siren
+1jhbNtCWC9w_50.000_60.000.wav 50.000 60.000 Civil defense siren
+4Ukj2TTJxHM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+4XAVaSz_P7c_150.000_160.000.wav 150.000 160.000 Civil defense siren
+69AIBPnJN5E_0.000_10.000.wav 0.000 10.000 Civil defense siren
+8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Civil defense siren
+8ILgvaJVPCI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+9MWHXCLAX8I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+A5y-aZc0CiM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+AVBUh6qeHrQ_30.000_40.000.wav 30.000 40.000 Civil defense siren
+BhQPDafekdw_30.000_40.000.wav 30.000 40.000 Civil defense siren
+CJXNdudcJrs_30.000_40.000.wav 30.000 40.000 Civil defense siren
+CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Civil defense siren
+DdZw0XDv0JI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+DgWHUawAGnI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+Do9Dffb6vHA_30.000_40.000.wav 30.000 40.000 Civil defense siren
+GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Civil defense siren
+GeRgy4of730_30.000_40.000.wav 30.000 40.000 Civil defense siren
+IIypdzgZAaI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Civil defense siren
+JqHJ7015aWM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+K7a1P4RX_5w_30.000_40.000.wav 30.000 40.000 Civil defense siren
+KrTocA-I550_190.000_200.000.wav 190.000 200.000 Civil defense siren
+KumYcZVLOVU_350.000_360.000.wav 350.000 360.000 Civil defense siren
+L60HS_jbZu0_30.000_40.000.wav 30.000 40.000 Civil defense siren
+MZ1Yh6mRC-E_30.000_40.000.wav 30.000 40.000 Civil defense siren
+R8XUrRCFkzs_30.000_40.000.wav 30.000 40.000 Civil defense siren
+SyWbolNFst4_60.000_70.000.wav 60.000 70.000 Civil defense siren
+TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Civil defense siren
+Tx6eSkU2lKc_30.000_40.000.wav 30.000 40.000 Civil defense siren
+VcflBZLflSU_130.000_140.000.wav 130.000 140.000 Civil defense siren
+WXsTHg_DiYA_30.000_40.000.wav 30.000 40.000 Civil defense siren
+Wz5ffJxCElQ_10.000_20.000.wav 10.000 20.000 Civil defense siren
+X2MlmcY8UZU_30.000_40.000.wav 30.000 40.000 Civil defense siren
+XYLheTmlEYI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+YyxlD_FwZXM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+adCuLs-4nmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+cPjtrTq3F-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+eHDm93tI4Ok_30.000_40.000.wav 30.000 40.000 Civil defense siren
+etppP5Sdo14_30.000_40.000.wav 30.000 40.000 Civil defense siren
+fRKxUc1gQBw_50.000_60.000.wav 50.000 60.000 Civil defense siren
+feIue4LHzfM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+gr-Yen6Sj_Q_0.000_10.000.wav 0.000 10.000 Civil defense siren
+hl3Kqi9Wi_g_30.000_40.000.wav 30.000 40.000 Civil defense siren
+iKca2cbowd4_30.000_40.000.wav 30.000 40.000 Civil defense siren
+kzFyGWdj6MI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+m3LGopSVju4_30.000_40.000.wav 30.000 40.000 Civil defense siren
+ne4IMxs-hMk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+nuu2iNisoQc_6.000_16.000.wav 6.000 16.000 Civil defense siren
+oYeql9xE19k_30.000_40.000.wav 30.000 40.000 Civil defense siren
+rGUrM19BnJ8_110.000_120.000.wav 110.000 120.000 Civil defense siren
+u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+uCRAnDBXxgI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+vQG4HZR2KSk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+vjsG5b2yNzc_190.000_200.000.wav 190.000 200.000 Civil defense siren
+yO7guxGY-_k_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-9GUUhB3QV0_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Police car (siren)
+-UBVqmhbT50_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Police car (siren)
+145N68nh4m0_120.000_130.000.wav 120.000 130.000 Police car (siren)
+2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Police car (siren)
+31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Police car (siren)
+5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Police car (siren)
+6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Police car (siren)
+8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+9OFUd38sBNM_0.000_8.000.wav 0.000 8.000 Police car (siren)
+AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Police car (siren)
+CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Police car (siren)
+DK_6C29B2zs_14.000_24.000.wav 14.000 24.000 Police car (siren)
+GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Police car (siren)
+H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Police car (siren)
+JgDuU9kpHpM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Police car (siren)
+LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Police car (siren)
+QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Police car (siren)
+Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Police car (siren)
+YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Police car (siren)
+Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Police car (siren)
+ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Police car (siren)
+ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Police car (siren)
+ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Police car (siren)
+bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Police car (siren)
+eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Police car (siren)
+epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Police car (siren)
+fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+fNcrlqPrAqM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Police car (siren)
+gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Police car (siren)
+iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Police car (siren)
+iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Police car (siren)
+kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Police car (siren)
+lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Police car (siren)
+mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Police car (siren)
+mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Police car (siren)
+pzup58Eyhuo_30.000_40.000.wav 30.000 40.000 Police car (siren)
+rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Police car (siren)
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Police car (siren)
+u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Police car (siren)
+u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Police car (siren)
+uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Police car (siren)
+vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Police car (siren)
+xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Police car (siren)
+z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-FKrYTj_eCU_0.000_10.000.wav 0.000 10.000 Screaming
+0G50t4FlbIA_60.000_70.000.wav 60.000 70.000 Screaming
+1LTxZ2aNytc_30.000_40.000.wav 30.000 40.000 Screaming
+2FEhG1UXb_E_370.000_380.000.wav 370.000 380.000 Screaming
+45vBbOhzS6g_50.000_60.000.wav 50.000 60.000 Screaming
+4PYTtp78Ig0_60.000_70.000.wav 60.000 70.000 Screaming
+5QNq0IEPICQ_30.000_40.000.wav 30.000 40.000 Screaming
+5YcIJuYQECc_0.000_6.000.wav 0.000 6.000 Screaming
+5kQF4r03yRI_0.000_6.000.wav 0.000 6.000 Screaming
+7ARVgI_wx5Y_30.000_40.000.wav 30.000 40.000 Screaming
+AIFvFuZPr68_30.000_40.000.wav 30.000 40.000 Screaming
+Aw43FUCkIb8_20.000_30.000.wav 20.000 30.000 Screaming
+AxM2BofYfPY_30.000_40.000.wav 30.000 40.000 Screaming
+BFqHyCoypfM_16.000_26.000.wav 16.000 26.000 Screaming
+Bk_xS_fKCpk_30.000_40.000.wav 30.000 40.000 Screaming
+C4YMjmJ7tt4_90.000_100.000.wav 90.000 100.000 Screaming
+CMWoAvgD0A0_9.000_19.000.wav 9.000 19.000 Screaming
+DZfYFhywhRs_30.000_40.000.wav 30.000 40.000 Screaming
+ElJFYwRtrH4_30.000_40.000.wav 30.000 40.000 Screaming
+FcUVtXJMkJs_30.000_40.000.wav 30.000 40.000 Screaming
+G--718JDmAQ_0.000_10.000.wav 0.000 10.000 Screaming
+GPJ1uQwmNHk_30.000_40.000.wav 30.000 40.000 Screaming
+H3vSRzkG82U_30.000_40.000.wav 30.000 40.000 Screaming
+HS28EUWt8dE_110.000_120.000.wav 110.000 120.000 Screaming
+KkGTB8ESMCM_0.000_10.000.wav 0.000 10.000 Screaming
+MQ0YasvMcuQ_1.000_11.000.wav 1.000 11.000 Screaming
+Msl9dI5yweA_90.000_100.000.wav 90.000 100.000 Screaming
+Ntn6YvZM3kA_0.000_10.000.wav 0.000 10.000 Screaming
+NwTHlpXdk4M_30.000_40.000.wav 30.000 40.000 Screaming
+OHjfSfqa804_0.000_10.000.wav 0.000 10.000 Screaming
+OzWJuqG2F3Y_30.000_40.000.wav 30.000 40.000 Screaming
+QDW_uCMnMMU_0.000_8.000.wav 0.000 8.000 Screaming
+SxI3Lnzzmkw_110.000_120.000.wav 110.000 120.000 Screaming
+TVvbfuGu9eM_70.000_80.000.wav 70.000 80.000 Screaming
+YCk9F0Uq3BE_70.000_80.000.wav 70.000 80.000 Screaming
+Z54pSnNw2iM_30.000_40.000.wav 30.000 40.000 Screaming
+a59ivTlYoNk_310.000_320.000.wav 310.000 320.000 Screaming
+auC_LgwFF8g_30.000_40.000.wav 30.000 40.000 Screaming
+bi8R9JbF2cc_80.000_90.000.wav 80.000 90.000 Screaming
+cdbYsoEasio_70.000_80.000.wav 70.000 80.000 Screaming
+dfsvT5xImNg_80.000_90.000.wav 80.000 90.000 Screaming
+e2AaF6siR1A_540.000_550.000.wav 540.000 550.000 Screaming
+gB1ytjgpcW4_190.000_200.000.wav 190.000 200.000 Screaming
+gE-0JxMtUh0_20.000_30.000.wav 20.000 30.000 Screaming
+hWiGgsuGnzs_100.000_110.000.wav 100.000 110.000 Screaming
+l-iIfi3SNpw_120.000_130.000.wav 120.000 130.000 Screaming
+mT-f0lGk-JM_30.000_40.000.wav 30.000 40.000 Screaming
+nApE_Biu13k_10.000_20.000.wav 10.000 20.000 Screaming
+nRMmafPUAEU_80.000_90.000.wav 80.000 90.000 Screaming
+nYAbLuyqPis_30.000_40.000.wav 30.000 40.000 Screaming
+nlYlNF30bVg_30.000_40.000.wav 30.000 40.000 Screaming
+sUp-UXzgmrA_0.000_10.000.wav 0.000 10.000 Screaming
+syIwNMo2TUA_0.000_7.000.wav 0.000 7.000 Screaming
+uTu0a1wd9-M_21.000_31.000.wav 21.000 31.000 Screaming
+xVG7dfH5DL0_320.000_330.000.wav 320.000 330.000 Screaming
+xvAQ44hx3_k_220.000_230.000.wav 220.000 230.000 Screaming
+yNTkb2zgA_M_70.000_80.000.wav 70.000 80.000 Screaming
+zCdOEvduBTo_30.000_40.000.wav 30.000 40.000 Screaming
+zMICvbCJ6zc_550.000_560.000.wav 550.000 560.000 Screaming
+-0RWZT-miFs_420.000_430.000.wav 420.000 430.000 Car
+-1pRmoJIGQc_11.000_21.000.wav 11.000 21.000 Car
+-7eDqv-6AKQ_30.000_40.000.wav 30.000 40.000 Car
+-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car
+-HWygXWSNRA_30.000_40.000.wav 30.000 40.000 Car
+-PVEno65928_30.000_40.000.wav 30.000 40.000 Car
+-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car
+0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Car
+0QwxnzHf_0E_30.000_40.000.wav 30.000 40.000 Car
+0bg1nzEVdgY_0.000_10.000.wav 0.000 10.000 Car
+0lpPdWvg7Eo_0.000_10.000.wav 0.000 10.000 Car
+11Pn3yJifSQ_4.000_14.000.wav 4.000 14.000 Car
+1BgqrhbyRFw_30.000_40.000.wav 30.000 40.000 Car
+1F9zCsJyw6k_430.000_440.000.wav 430.000 440.000 Car
+1HayoASR-54_80.000_90.000.wav 80.000 90.000 Car
+1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Car
+1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Car
+27m49pmJ8Og_370.000_380.000.wav 370.000 380.000 Car
+2E_N8lnoVKE_30.000_40.000.wav 30.000 40.000 Car
+2Fdau5KTEls_30.000_40.000.wav 30.000 40.000 Car
+2STASUlGAjs_30.000_40.000.wav 30.000 40.000 Car
+2fi0m8ei_B4_30.000_40.000.wav 30.000 40.000 Car
+2uMXfAIMeN0_180.000_190.000.wav 180.000 190.000 Car
+32V2zsK7GME_110.000_120.000.wav 110.000 120.000 Car
+3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car
+3_OLj6XChvM_30.000_40.000.wav 30.000 40.000 Car
+3hLxPQpmfQo_30.000_40.000.wav 30.000 40.000 Car
+3mDPQ_CPopw_30.000_40.000.wav 30.000 40.000 Car
+3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car
+3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Car
+40s88hEcn5I_170.000_180.000.wav 170.000 180.000 Car
+42P93B_GzGA_30.000_40.000.wav 30.000 40.000 Car
+4KZWpXlcpM4_60.000_70.000.wav 60.000 70.000 Car
+4TshFWSsrn8_290.000_300.000.wav 290.000 300.000 Car
+4WRgvRI06zc_30.000_40.000.wav 30.000 40.000 Car
+4aJfQpHt9lY_160.000_170.000.wav 160.000 170.000 Car
+4hd2CLrzCZs_30.000_40.000.wav 30.000 40.000 Car
+4zCHl7pRsNY_30.000_40.000.wav 30.000 40.000 Car
+5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Car
+5oirFKi6Sfo_190.000_200.000.wav 190.000 200.000 Car
+5vmxFp1r1ZM_30.000_40.000.wav 30.000 40.000 Car
+5z1rE_l-0Ow_0.000_8.000.wav 0.000 8.000 Car
+620GoTv5Ic8_30.000_40.000.wav 30.000 40.000 Car
+6BitLl5Bnxw_30.000_40.000.wav 30.000 40.000 Car
+6FVA4hqp1Ro_30.000_40.000.wav 30.000 40.000 Car
+6U942AYlcXA_30.000_40.000.wav 30.000 40.000 Car
+6b2ZMMrLTz8_5.000_15.000.wav 5.000 15.000 Car
+6ibh38autyA_30.000_40.000.wav 30.000 40.000 Car
+6kuESYFcEqw_30.000_40.000.wav 30.000 40.000 Car
+73cuZZq-J3w_20.000_30.000.wav 20.000 30.000 Car
+764IcMEMVUk_90.000_100.000.wav 90.000 100.000 Car
+7NH1WJlSiYI_30.000_40.000.wav 30.000 40.000 Car
+7lJu9wEsErY_220.000_230.000.wav 220.000 230.000 Car
+8CqqK9CzuXM_30.000_40.000.wav 30.000 40.000 Car
+8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car
+8Wk-ZmlsUqY_28.000_38.000.wav 28.000 38.000 Car
+8q8JrJNAa-Q_30.000_40.000.wav 30.000 40.000 Car
+8rMlNbKlp_s_0.000_10.000.wav 0.000 10.000 Car
+8sGJFPr2Nmc_30.000_40.000.wav 30.000 40.000 Car
+8yRROnG0-lA_30.000_40.000.wav 30.000 40.000 Car
+9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Car
+9fzAWj5YJ9c_30.000_40.000.wav 30.000 40.000 Car
+9rq8h4oMJ98_30.000_40.000.wav 30.000 40.000 Car
+9ye2Fn62xDc_60.000_70.000.wav 60.000 70.000 Car
+ACGuC6SH4V4_150.000_160.000.wav 150.000 160.000 Car
+AFz5TIs_Gug_30.000_40.000.wav 30.000 40.000 Car
+AedlWfHafgw_21.000_31.000.wav 21.000 31.000 Car
+AlsDSDTiaWI_30.000_40.000.wav 30.000 40.000 Car
+B3SkK0wuOhY_130.000_140.000.wav 130.000 140.000 Car
+B9n4a5ciI48_16.000_26.000.wav 16.000 26.000 Car
+BAekfGvUtFM_30.000_40.000.wav 30.000 40.000 Car
+BNLOvQbrPdc_290.000_300.000.wav 290.000 300.000 Car
+BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Car
+Bqx_SZgCzZw_10.000_20.000.wav 10.000 20.000 Car
+CZB6WXDuM1g_30.000_40.000.wav 30.000 40.000 Car
+C_pnsyNXphA_30.000_40.000.wav 30.000 40.000 Car
+Ck5ZjBf1nLM_30.000_40.000.wav 30.000 40.000 Car
+CqNyeZeHb8Y_30.000_40.000.wav 30.000 40.000 Car
+Cs1d7Ibk8CA_220.000_230.000.wav 220.000 230.000 Car
+CuS-ok0xG9g_0.000_10.000.wav 0.000 10.000 Car
+CuaBHNKycvI_30.000_40.000.wav 30.000 40.000 Car
+Cwur_jvxMzY_360.000_370.000.wav 360.000 370.000 Car
+DEGSyVygE98_110.000_120.000.wav 110.000 120.000 Car
+DLxTYAUifjU_30.000_40.000.wav 30.000 40.000 Car
+DkKpnvJk9u0_30.000_40.000.wav 30.000 40.000 Car
+DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Car
+Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car
+E8NgxTz1d90_30.000_40.000.wav 30.000 40.000 Car
+ExqedxdXuBc_70.000_80.000.wav 70.000 80.000 Car
+FCxEMSNSEuI_160.000_170.000.wav 160.000 170.000 Car
+FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Car
+FFSWmryaZ60_30.000_40.000.wav 30.000 40.000 Car
+FYk2paHPSdg_30.000_40.000.wav 30.000 40.000 Car
+Fo_FDiZhzDo_30.000_40.000.wav 30.000 40.000 Car
+GteozUDpJRc_30.000_40.000.wav 30.000 40.000 Car
+GwBS2NzjAvA_30.000_40.000.wav 30.000 40.000 Car
+H8d1mZOqb1c_110.000_120.000.wav 110.000 120.000 Car
+HFF_PpqLQ9w_30.000_40.000.wav 30.000 40.000 Car
+HHlb-h2Pc7o_30.000_40.000.wav 30.000 40.000 Car
+Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car
+I-HlrcP6Qg4_30.000_40.000.wav 30.000 40.000 Car
+I7vs2H-Htt8_480.000_490.000.wav 480.000 490.000 Car
+IblhEF_MiH8_400.000_410.000.wav 400.000 410.000 Car
+JgXnbgS_XBk_480.000_490.000.wav 480.000 490.000 Car
+Ju7Kg_H2iZQ_30.000_40.000.wav 30.000 40.000 Car
+KiCB6pP6EEo_100.000_110.000.wav 100.000 110.000 Car
+Kwpn3utYEHM_30.000_40.000.wav 30.000 40.000 Car
+Ky9Kw-0XwAs_30.000_40.000.wav 30.000 40.000 Car
+KzKDk-UgS54_30.000_40.000.wav 30.000 40.000 Car
+L1qC8DicAZE_70.000_80.000.wav 70.000 80.000 Car
+L4N0LOYZrFo_30.000_40.000.wav 30.000 40.000 Car
+L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car
+L9YtOeck3A0_0.000_10.000.wav 0.000 10.000 Car
+LEtkHiZZugk_30.000_40.000.wav 30.000 40.000 Car
+LLkNFGrrgUo_30.000_40.000.wav 30.000 40.000 Car
+LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Car
+M7NvD1WJQ7o_70.000_80.000.wav 70.000 80.000 Car
+M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car
+Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car
+NMqSBlEq14Q_30.000_40.000.wav 30.000 40.000 Car
+NoPbk9fy6uw_10.000_20.000.wav 10.000 20.000 Car
+O36torHptH4_30.000_40.000.wav 30.000 40.000 Car
+OBwh-KGukE8_30.000_40.000.wav 30.000 40.000 Car
+Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Car
+PfXdcsW8dJI_540.000_550.000.wav 540.000 550.000 Car
+QAWuHvVCI6g_30.000_40.000.wav 30.000 40.000 Car
+QBMDnMRwQCc_70.000_80.000.wav 70.000 80.000 Car
+QzrS-S7OerE_370.000_380.000.wav 370.000 380.000 Car
+R0BtkTm_CPI_30.000_40.000.wav 30.000 40.000 Car
+SEHxfje9Eio_30.000_40.000.wav 30.000 40.000 Car
+Sb3V17F8xU8_360.000_370.000.wav 360.000 370.000 Car
+SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car
+SqWkV-UQ6CI_30.000_40.000.wav 30.000 40.000 Car
+TWDytzefXXc_10.000_20.000.wav 10.000 20.000 Car
+Tv67JhZDAYs_30.000_40.000.wav 30.000 40.000 Car
+VTwVF3xRSWg_12.000_22.000.wav 12.000 22.000 Car
+VulCKZgWspc_570.000_580.000.wav 570.000 580.000 Car
+Vx6mttDHWfo_30.000_40.000.wav 30.000 40.000 Car
+W11cJ9HZNaY_30.000_40.000.wav 30.000 40.000 Car
+WLXQgcx8qTI_30.000_40.000.wav 30.000 40.000 Car
+WMbdMQ7rdFs_30.000_40.000.wav 30.000 40.000 Car
+WZoQD6cInx8_360.000_370.000.wav 360.000 370.000 Car
+WffmaOr2p8I_30.000_40.000.wav 30.000 40.000 Car
+WoynilrteLU_30.000_40.000.wav 30.000 40.000 Car
+WxrKq0aI0iM_130.000_140.000.wav 130.000 140.000 Car
+X60eVxecY3I_30.000_40.000.wav 30.000 40.000 Car
+X8fEzx-fA0U_80.000_90.000.wav 80.000 90.000 Car
+XVxlZqwWcBI_10.000_20.000.wav 10.000 20.000 Car
+Xnd8ERrynEo_120.000_130.000.wav 120.000 130.000 Car
+XqXLI7bDb-I_0.000_7.000.wav 0.000 7.000 Car
+XyCjByHuDIk_260.000_270.000.wav 260.000 270.000 Car
+XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car
+Y5e8BW513ww_20.000_30.000.wav 20.000 30.000 Car
+YJdBwuIn4Ec_30.000_40.000.wav 30.000 40.000 Car
+YTFJUFWcRns_30.000_40.000.wav 30.000 40.000 Car
+YY9aConw2QE_0.000_10.000.wav 0.000 10.000 Car
+Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car
+Ys_rO2Ieg1U_30.000_40.000.wav 30.000 40.000 Car
+Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Car
+Z8cigemT5_g_210.000_220.000.wav 210.000 220.000 Car
+ZJW7ymsioQc_16.000_26.000.wav 16.000 26.000 Car
+ZY6A9ZDkudg_130.000_140.000.wav 130.000 140.000 Car
+_Mw9lKigni4_30.000_40.000.wav 30.000 40.000 Car
+_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Car
+_yU0-fmspFY_210.000_220.000.wav 210.000 220.000 Car
+a5vTn5286-A_80.000_90.000.wav 80.000 90.000 Car
+aCX6vJhHO2c_30.000_40.000.wav 30.000 40.000 Car
+aHEAK0iWqKk_180.000_190.000.wav 180.000 190.000 Car
+aOVPHKqKjyQ_90.000_100.000.wav 90.000 100.000 Car
+aUq4glO5ryE_30.000_40.000.wav 30.000 40.000 Car
+aW3DY8XDrmw_22.000_32.000.wav 22.000 32.000 Car
+aa4uhPvKviY_30.000_40.000.wav 30.000 40.000 Car
+akgqVmFFDiY_30.000_40.000.wav 30.000 40.000 Car
+buOEFwXhoe0_310.000_320.000.wav 310.000 320.000 Car
+cHCIoXF7moA_30.000_40.000.wav 30.000 40.000 Car
+cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car
+cbYZQRz09bc_390.000_400.000.wav 390.000 400.000 Car
+d-do1XZ8f_E_30.000_40.000.wav 30.000 40.000 Car
+d3gMwtMK6Gs_30.000_40.000.wav 30.000 40.000 Car
+d6AioJ8CkTc_30.000_40.000.wav 30.000 40.000 Car
+dAud19zNZyw_190.000_200.000.wav 190.000 200.000 Car
+dC1TVxwiitc_30.000_40.000.wav 30.000 40.000 Car
+dFqOBLxhEl8_20.000_30.000.wav 20.000 30.000 Car
+dSfcznv4KLo_30.000_40.000.wav 30.000 40.000 Car
+dThSTe35jb0_50.000_60.000.wav 50.000 60.000 Car
+dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car
+dmJH84FnQa8_30.000_40.000.wav 30.000 40.000 Car
+e9xPBfEJni8_230.000_240.000.wav 230.000 240.000 Car
+eAl9WwRaWUE_30.000_40.000.wav 30.000 40.000 Car
+eAt6si6k65c_30.000_40.000.wav 30.000 40.000 Car
+eHiqCLHmoxI_0.000_8.000.wav 0.000 8.000 Car
+eV5JX81GzqA_150.000_160.000.wav 150.000 160.000 Car
+er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Car
+eyFPHlybqDg_30.000_40.000.wav 30.000 40.000 Car
+f70nsY7ThBA_220.000_230.000.wav 220.000 230.000 Car
+fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car
+fZMPDCNyQxE_30.000_40.000.wav 30.000 40.000 Car
+f__6chtFRM0_30.000_40.000.wav 30.000 40.000 Car
+fdDTuo_COG8_90.000_100.000.wav 90.000 100.000 Car
+gFJjYWXeBn0_30.000_40.000.wav 30.000 40.000 Car
+g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Car
+gaFQgJLQHtU_90.000_100.000.wav 90.000 100.000 Car
+gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car
+hN1ykzC8kZM_30.000_40.000.wav 30.000 40.000 Car
+hQ_yyPI46FI_11.000_21.000.wav 11.000 21.000 Car
+haiMRJEH-Aw_0.000_9.000.wav 0.000 9.000 Car
+hsC_sT0A4XM_30.000_40.000.wav 30.000 40.000 Car
+ihQDd1CqFBw_70.000_80.000.wav 70.000 80.000 Car
+ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car
+j2R1zurR39E_30.000_40.000.wav 30.000 40.000 Car
+j42ETHcp044_0.000_10.000.wav 0.000 10.000 Car
+j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car
+jCeUZwd8b2w_0.000_10.000.wav 0.000 10.000 Car
+jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car
+kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Car
+l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car
+lRrv5m9Xu4k_30.000_40.000.wav 30.000 40.000 Car
+lb1awXgoyQE_0.000_10.000.wav 0.000 10.000 Car
+llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car
+lu5teS1j1RQ_0.000_10.000.wav 0.000 10.000 Car
+mCmjh_EJtb4_30.000_40.000.wav 30.000 40.000 Car
+nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Car
+njodYtK0Hqg_30.000_40.000.wav 30.000 40.000 Car
+noymXcxyxis_30.000_40.000.wav 30.000 40.000 Car
+o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car
+oPJVdi0cqNE_30.000_40.000.wav 30.000 40.000 Car
+oxJYMzEmtk4_10.000_20.000.wav 10.000 20.000 Car
+pPnLErF3GOY_30.000_40.000.wav 30.000 40.000 Car
+pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car
+qC5M7BAsKOA_0.000_10.000.wav 0.000 10.000 Car
+qg4WxBm8h_w_510.000_520.000.wav 510.000 520.000 Car
+qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car
+rgeu0Gtf3Es_40.000_50.000.wav 40.000 50.000 Car
+s3-i5eUpe6c_30.000_40.000.wav 30.000 40.000 Car
+s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car
+syCQldBsAtg_30.000_40.000.wav 30.000 40.000 Car
+tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Car
+teoER4j9H14_290.000_300.000.wav 290.000 300.000 Car
+uFSkczD2i14_30.000_40.000.wav 30.000 40.000 Car
+uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car
+uYqlVTlSgbM_40.000_50.000.wav 40.000 50.000 Car
+v8Kry1CbTkM_310.000_320.000.wav 310.000 320.000 Car
+vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car
+vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Car
+vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car
+vW1nk4o9u5g_30.000_40.000.wav 30.000 40.000 Car
+vdFYBSlmsXw_30.000_40.000.wav 30.000 40.000 Car
+vtE1J8HsCUs_30.000_40.000.wav 30.000 40.000 Car
+w0vy1YvNcOg_30.000_40.000.wav 30.000 40.000 Car
+wDKrcZ7xLY8_80.000_90.000.wav 80.000 90.000 Car
+wM-sBzIDzok_30.000_40.000.wav 30.000 40.000 Car
+wUY4eWJt17w_30.000_40.000.wav 30.000 40.000 Car
+we66pU0MN1M_30.000_40.000.wav 30.000 40.000 Car
+wjfMWiYLDWA_30.000_40.000.wav 30.000 40.000 Car
+wu3-_VKULZU_30.000_40.000.wav 30.000 40.000 Car
+wwNIm8bgzKc_30.000_40.000.wav 30.000 40.000 Car
+xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car
+xsT5ZJUnBg0_160.000_170.000.wav 160.000 170.000 Car
+y9DFJEsiTLk_110.000_120.000.wav 110.000 120.000 Car
+yESwp_fg0Po_70.000_80.000.wav 70.000 80.000 Car
+yQg3eMb0QKU_30.000_40.000.wav 30.000 40.000 Car
+yQjnNR7fXKo_50.000_60.000.wav 50.000 60.000 Car
+zCuKYr_oMlE_60.000_70.000.wav 60.000 70.000 Car
+zz35Va7tYmA_30.000_40.000.wav 30.000 40.000 Car
+-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car passing by
+-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car passing by
+-iAAxJkoqcM_0.000_6.000.wav 0.000 6.000 Car passing by
+0mQcGLpc8to_30.000_40.000.wav 30.000 40.000 Car passing by
+1HtGgZnlKjU_30.000_40.000.wav 30.000 40.000 Car passing by
+2IsAlhq0XFc_30.000_40.000.wav 30.000 40.000 Car passing by
+2UvEmetE__I_30.000_40.000.wav 30.000 40.000 Car passing by
+2oHGIzH_XzA_30.000_40.000.wav 30.000 40.000 Car passing by
+3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car passing by
+8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car passing by
+8rzhhvS0tGc_30.000_40.000.wav 30.000 40.000 Car passing by
+8v377AXrgac_30.000_40.000.wav 30.000 40.000 Car passing by
+9lMtTDKyDEk_30.000_40.000.wav 30.000 40.000 Car passing by
+BWoL8oKoTFI_30.000_40.000.wav 30.000 40.000 Car passing by
+BsvD806qNM8_10.000_20.000.wav 10.000 20.000 Car passing by
+C3LLtToB2zA_30.000_40.000.wav 30.000 40.000 Car passing by
+Dk6b9dVD0i8_6.000_16.000.wav 6.000 16.000 Car passing by
+Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car passing by
+EqFuY_U0Yz0_30.000_40.000.wav 30.000 40.000 Car passing by
+FjpOboRcrNc_10.000_20.000.wav 10.000 20.000 Car passing by
+FjyZV8zIJ0k_30.000_40.000.wav 30.000 40.000 Car passing by
+Fn7eSPVvgCQ_30.000_40.000.wav 30.000 40.000 Car passing by
+G6A-sT2DOjY_30.000_40.000.wav 30.000 40.000 Car passing by
+GBXRuYIvhfM_30.000_40.000.wav 30.000 40.000 Car passing by
+HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Car passing by
+HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Car passing by
+If-V0XO-mpo_30.000_40.000.wav 30.000 40.000 Car passing by
+JtuNiusRRLk_30.000_40.000.wav 30.000 40.000 Car passing by
+M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car passing by
+NKPAwhwZmqs_30.000_40.000.wav 30.000 40.000 Car passing by
+Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car passing by
+QcLfJE-YfJY_30.000_40.000.wav 30.000 40.000 Car passing by
+SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car passing by
+VAiH1LX8guk_17.000_27.000.wav 17.000 27.000 Car passing by
+Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car passing by
+Yd10enP9ykM_30.000_40.000.wav 30.000 40.000 Car passing by
+_HGGCwtyNxM_30.000_40.000.wav 30.000 40.000 Car passing by
+a2U10_mi5as_30.000_40.000.wav 30.000 40.000 Car passing by
+aB6FDPKAPus_30.000_40.000.wav 30.000 40.000 Car passing by
+bDFQWubN4x4_30.000_40.000.wav 30.000 40.000 Car passing by
+cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car passing by
+dDTvjXXFkDg_30.000_40.000.wav 30.000 40.000 Car passing by
+dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car passing by
+fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car passing by
+gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car passing by
+gd_KjDM4fi8_0.000_10.000.wav 0.000 10.000 Car passing by
+j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car passing by
+jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car passing by
+llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car passing by
+m_dCO5bBCic_26.000_36.000.wav 26.000 36.000 Car passing by
+qDQX7Xi3GsQ_30.000_40.000.wav 30.000 40.000 Car passing by
+qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car passing by
+reP-OOWiLWU_30.000_40.000.wav 30.000 40.000 Car passing by
+s4jG5ZJYCvQ_30.000_40.000.wav 30.000 40.000 Car passing by
+s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car passing by
+uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car passing by
+vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car passing by
+wD4QouhX8zo_30.000_40.000.wav 30.000 40.000 Car passing by
+xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car passing by
+zd67ihUZ1u4_25.000_35.000.wav 25.000 35.000 Car passing by
+-3z5mFRgbxc_30.000_40.000.wav 30.000 40.000 Bus
+0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Bus
+0lPcHRhXlWk_30.000_40.000.wav 30.000 40.000 Bus
+1E1evA4T_Tk_30.000_40.000.wav 30.000 40.000 Bus
+1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Bus
+6-yQsEH2WYA_30.000_40.000.wav 30.000 40.000 Bus
+6Y8wSI1l-Lw_30.000_40.000.wav 30.000 40.000 Bus
+7T04388Ijk8_30.000_40.000.wav 30.000 40.000 Bus
+8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Bus
+8oEdgb8iXYA_1.000_11.000.wav 1.000 11.000 Bus
+AdpNSGX2_Pk_10.000_20.000.wav 10.000 20.000 Bus
+AwJ8orGuOXg_2.000_12.000.wav 2.000 12.000 Bus
+BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Bus
+CoFbRc1OxFU_9.000_19.000.wav 9.000 19.000 Bus
+DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Bus
+DYcXvyBFc5w_30.000_40.000.wav 30.000 40.000 Bus
+DYdalOQnx1Y_30.000_40.000.wav 30.000 40.000 Bus
+DkwFXd5nYLE_40.000_50.000.wav 40.000 50.000 Bus
+FBMR3pW9H9o_30.000_40.000.wav 30.000 40.000 Bus
+FEGa4e6RAlw_30.000_40.000.wav 30.000 40.000 Bus
+Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Bus
+HxMoMMrA6Eo_30.000_40.000.wav 30.000 40.000 Bus
+I7esm6vqqZ4_30.000_40.000.wav 30.000 40.000 Bus
+JLj11umr1CE_0.000_10.000.wav 0.000 10.000 Bus
+JwAhcHHF2qg_30.000_40.000.wav 30.000 40.000 Bus
+LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Bus
+LzZ_nxuZ8Co_30.000_40.000.wav 30.000 40.000 Bus
+LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Bus
+Nyi9_-u6-w0_30.000_40.000.wav 30.000 40.000 Bus
+O_SKumO328I_30.000_40.000.wav 30.000 40.000 Bus
+Owg_XU9XmRM_30.000_40.000.wav 30.000 40.000 Bus
+P94rcZSuTT8_30.000_40.000.wav 30.000 40.000 Bus
+PP741kd2vRM_30.000_40.000.wav 30.000 40.000 Bus
+Qna9qrV8_go_30.000_40.000.wav 30.000 40.000 Bus
+Qt7FJkuqWPE_30.000_40.000.wav 30.000 40.000 Bus
+UcQ7cVukaxY_21.000_31.000.wav 21.000 31.000 Bus
+W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Bus
+WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Bus
+WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bus
+a9B_HA3y8WQ_30.000_40.000.wav 30.000 40.000 Bus
+cEEoKQ38fHY_30.000_40.000.wav 30.000 40.000 Bus
+er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Bus
+fLvM4bbpg6w_0.000_10.000.wav 0.000 10.000 Bus
+fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Bus
+gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Bus
+jaSK_t8QP1E_30.000_40.000.wav 30.000 40.000 Bus
+ji_YCMygNHQ_8.000_18.000.wav 8.000 18.000 Bus
+kNKfoDp0uUw_30.000_40.000.wav 30.000 40.000 Bus
+kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Bus
+lHP0q2sQzPQ_30.000_40.000.wav 30.000 40.000 Bus
+mGG8rop4Jig_30.000_40.000.wav 30.000 40.000 Bus
+oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Bus
+tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Bus
+tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Bus
+ucICmff0K-Q_30.000_40.000.wav 30.000 40.000 Bus
+x-2Abohj8VY_30.000_40.000.wav 30.000 40.000 Bus
+xFr2xX6PulQ_70.000_80.000.wav 70.000 80.000 Bus
+yfSBqp5IZSM_10.000_20.000.wav 10.000 20.000 Bus
+-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Truck
+-BY64_p-vtM_30.000_40.000.wav 30.000 40.000 Truck
+-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Truck
+-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Truck
+-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Truck
+01WuUBxFBp4_30.000_40.000.wav 30.000 40.000 Truck
+077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Truck
+0Ga7T-2e490_17.000_27.000.wav 17.000 27.000 Truck
+0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Truck
+10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Truck
+2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Truck
+2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Truck
+2Tmi7EqpGZQ_0.000_10.000.wav 0.000 10.000 Truck
+4DlKNmVcoek_20.000_30.000.wav 20.000 30.000 Truck
+4MRzQbAIyV4_90.000_100.000.wav 90.000 100.000 Truck
+4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Truck
+4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Truck
+5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Truck
+5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Truck
+5QP1Tc3XbDc_30.000_40.000.wav 30.000 40.000 Truck
+5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Truck
+5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Truck
+6HL_DKWK-WA_10.000_20.000.wav 10.000 20.000 Truck
+6VQGk8IrV-4_30.000_40.000.wav 30.000 40.000 Truck
+6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Truck
+6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Truck
+6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Truck
+7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Truck
+7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Truck
+81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Truck
+84E9i9_ELBs_30.000_40.000.wav 30.000 40.000 Truck
+8jblPMBafKE_30.000_40.000.wav 30.000 40.000 Truck
+8k17D6qiuqI_30.000_40.000.wav 30.000 40.000 Truck
+9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Truck
+9LJnjmcRcb8_280.000_290.000.wav 280.000 290.000 Truck
+9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Truck
+A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Truck
+ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Truck
+AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Truck
+BQVXzH6YK8g_30.000_40.000.wav 30.000 40.000 Truck
+CnYWJp2bknU_50.000_60.000.wav 50.000 60.000 Truck
+DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Truck
+DXlTakKvLzg_30.000_40.000.wav 30.000 40.000 Truck
+DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Truck
+Dmy4EjohxxU_60.000_70.000.wav 60.000 70.000 Truck
+DvMFQ64YwcI_30.000_40.000.wav 30.000 40.000 Truck
+FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Truck
+GTk_6JDmtCY_230.000_240.000.wav 230.000 240.000 Truck
+HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Truck
+HQkLVac7z9Q_70.000_80.000.wav 70.000 80.000 Truck
+I4VDcVTE4YA_30.000_40.000.wav 30.000 40.000 Truck
+IxlvxvG8zOE_110.000_120.000.wav 110.000 120.000 Truck
+JLzD44Im1Ec_30.000_40.000.wav 30.000 40.000 Truck
+K4Hcb00hTTY_30.000_40.000.wav 30.000 40.000 Truck
+L2M3xanqQP8_30.000_40.000.wav 30.000 40.000 Truck
+LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Truck
+LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Truck
+MWTTe0M9vi4_30.000_40.000.wav 30.000 40.000 Truck
+Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Truck
+NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Truck
+OPd0cz1hRqc_30.000_40.000.wav 30.000 40.000 Truck
+PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Truck
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Truck
+PO1eaJ7tQOg_180.000_190.000.wav 180.000 190.000 Truck
+PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Truck
+Pef6g19i5iI_30.000_40.000.wav 30.000 40.000 Truck
+Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Truck
+SiBIYAiIajM_30.000_40.000.wav 30.000 40.000 Truck
+T6oYCFRafPs_30.000_40.000.wav 30.000 40.000 Truck
+WdubBeFntYQ_460.000_470.000.wav 460.000 470.000 Truck
+_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Truck
+_jfv_ziZWII_60.000_70.000.wav 60.000 70.000 Truck
+acvV6yYNc7Y_30.000_40.000.wav 30.000 40.000 Truck
+bQSaQ0iX_vk_30.000_40.000.wav 30.000 40.000 Truck
+bhxN5w03yS0_30.000_40.000.wav 30.000 40.000 Truck
+ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Truck
+eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Truck
+gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Truck
+hDVNQOJCvOk_30.000_40.000.wav 30.000 40.000 Truck
+ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Truck
+ikmE_kRvDAc_30.000_40.000.wav 30.000 40.000 Truck
+jwZTKNsbf58_70.000_80.000.wav 70.000 80.000 Truck
+kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Truck
+kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Truck
+lp66EaEOOoU_30.000_40.000.wav 30.000 40.000 Truck
+n4o1r8Ai66o_30.000_40.000.wav 30.000 40.000 Truck
+nDtrUUc2J2U_0.000_10.000.wav 0.000 10.000 Truck
+nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Truck
+p70IcMwsW9M_30.000_40.000.wav 30.000 40.000 Truck
+pJ1fore8JbQ_30.000_40.000.wav 30.000 40.000 Truck
+pt-J_L-OFI8_0.000_10.000.wav 0.000 10.000 Truck
+rdanJP7Usrg_30.000_40.000.wav 30.000 40.000 Truck
+srTX18ikXkE_10.000_20.000.wav 10.000 20.000 Truck
+tuplsUUDXKw_30.000_40.000.wav 30.000 40.000 Truck
+x6vuWsdeS3s_30.000_40.000.wav 30.000 40.000 Truck
+xMClk12ouB8_30.000_40.000.wav 30.000 40.000 Truck
+ycqDMKTrvLY_30.000_40.000.wav 30.000 40.000 Truck
+yk5LqHTtHLo_30.000_40.000.wav 30.000 40.000 Truck
+yrscqyUOIlI_30.000_40.000.wav 30.000 40.000 Truck
+zM3chsL-B7U_30.000_40.000.wav 30.000 40.000 Truck
+06si40RVDco_30.000_40.000.wav 30.000 40.000 Motorcycle
+0DzsPL-xElE_20.000_30.000.wav 20.000 30.000 Motorcycle
+145N68nh4m0_120.000_130.000.wav 120.000 130.000 Motorcycle
+16vw4K9qJnY_30.000_40.000.wav 30.000 40.000 Motorcycle
+21QlKF17ipc_30.000_40.000.wav 30.000 40.000 Motorcycle
+3LulQoOXNB0_30.000_40.000.wav 30.000 40.000 Motorcycle
+45JHcLU57B8_20.000_30.000.wav 20.000 30.000 Motorcycle
+4NZkW-XaIa4_30.000_40.000.wav 30.000 40.000 Motorcycle
+506I6LfdDuk_50.000_60.000.wav 50.000 60.000 Motorcycle
+6MCy1lh4qaw_20.000_30.000.wav 20.000 30.000 Motorcycle
+6R8cO4ARzkY_30.000_40.000.wav 30.000 40.000 Motorcycle
+6taAP7SFewI_30.000_40.000.wav 30.000 40.000 Motorcycle
+7g6aZTBe2xE_30.000_40.000.wav 30.000 40.000 Motorcycle
+9HcahqYUVoc_90.000_100.000.wav 90.000 100.000 Motorcycle
+9N1iw5Vdim8_20.000_30.000.wav 20.000 30.000 Motorcycle
+ANWU9Hiy_5k_40.000_50.000.wav 40.000 50.000 Motorcycle
+BTNz6NftP34_30.000_40.000.wav 30.000 40.000 Motorcycle
+BxnLAGsByCI_10.000_20.000.wav 10.000 20.000 Motorcycle
+CZgx_6XaEkg_30.000_40.000.wav 30.000 40.000 Motorcycle
+D3BJuOwltoI_10.000_20.000.wav 10.000 20.000 Motorcycle
+FgN9v1jYqjA_30.000_40.000.wav 30.000 40.000 Motorcycle
+HQ8eR2lvjSE_30.000_40.000.wav 30.000 40.000 Motorcycle
+Mb-GyQEKoEc_30.000_40.000.wav 30.000 40.000 Motorcycle
+Pair_NsHdTc_30.000_40.000.wav 30.000 40.000 Motorcycle
+UFIBEBkm7ao_30.000_40.000.wav 30.000 40.000 Motorcycle
+UWz5OIijWM4_30.000_40.000.wav 30.000 40.000 Motorcycle
+WLX3Db60418_20.000_30.000.wav 20.000 30.000 Motorcycle
+X5Xs8Y1cJK0_30.000_40.000.wav 30.000 40.000 Motorcycle
+ZGf0vrZStwI_30.000_40.000.wav 30.000 40.000 Motorcycle
+ZfkO1HlI0zM_30.000_40.000.wav 30.000 40.000 Motorcycle
+bhtB2Zgh9Q8_110.000_120.000.wav 110.000 120.000 Motorcycle
+d-m8eXCpeDg_30.000_40.000.wav 30.000 40.000 Motorcycle
+d21IwtH2oHI_30.000_40.000.wav 30.000 40.000 Motorcycle
+dhaKGPCgtfw_30.000_40.000.wav 30.000 40.000 Motorcycle
+ee-0JGvEIng_30.000_40.000.wav 30.000 40.000 Motorcycle
+epGDNMrsQb8_40.000_50.000.wav 40.000 50.000 Motorcycle
+ezUkPETm6cs_30.000_40.000.wav 30.000 40.000 Motorcycle
+f724u5z_UDw_30.000_40.000.wav 30.000 40.000 Motorcycle
+gGmWm1i6pVo_30.000_40.000.wav 30.000 40.000 Motorcycle
+i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Motorcycle
+iMp8nODaotA_580.000_590.000.wav 580.000 590.000 Motorcycle
+lVW2CqsHJ4Y_30.000_40.000.wav 30.000 40.000 Motorcycle
+lj7hzmz19-M_30.000_40.000.wav 30.000 40.000 Motorcycle
+mX45CiTjf8I_30.000_40.000.wav 30.000 40.000 Motorcycle
+mbLiZ_jpgeY_20.000_30.000.wav 20.000 30.000 Motorcycle
+owZDBEq6WdU_30.000_40.000.wav 30.000 40.000 Motorcycle
+pNMBIqvbyB4_30.000_40.000.wav 30.000 40.000 Motorcycle
+po-tnKZAzdg_40.000_50.000.wav 40.000 50.000 Motorcycle
+qAQuljp-atA_30.000_40.000.wav 30.000 40.000 Motorcycle
+r0Oll28wmXs_30.000_40.000.wav 30.000 40.000 Motorcycle
+sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Motorcycle
+vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Motorcycle
+wPfv8ifzzyg_30.000_40.000.wav 30.000 40.000 Motorcycle
+wyhurCZbKQU_30.000_40.000.wav 30.000 40.000 Motorcycle
+xQTPEQDb0Gg_30.000_40.000.wav 30.000 40.000 Motorcycle
+xTPmoYwgKf4_30.000_40.000.wav 30.000 40.000 Motorcycle
+xXGIKM4daMU_30.000_40.000.wav 30.000 40.000 Motorcycle
+xZ8hQliZqhg_160.000_170.000.wav 160.000 170.000 Motorcycle
+xuMBy2NoROI_30.000_40.000.wav 30.000 40.000 Motorcycle
+z_8yGVO1qws_30.000_40.000.wav 30.000 40.000 Motorcycle
+-BaVEk1zS2g_50.000_60.000.wav 50.000 60.000 Train
+-Q4fBQ4egrs_0.000_10.000.wav 0.000 10.000 Train
+-QxSFr1cYuQ_20.000_30.000.wav 20.000 30.000 Train
+-ZdReI9dL6M_530.000_540.000.wav 530.000 540.000 Train
+0YIyGEM0yG0_550.000_560.000.wav 550.000 560.000 Train
+1Mk2MJDhLJQ_20.000_30.000.wav 20.000 30.000 Train
+2nejPPEWqJ8_320.000_330.000.wav 320.000 330.000 Train
+3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train
+3RfrTU1p5SA_500.000_510.000.wav 500.000 510.000 Train
+3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train
+3ZZDuYU2HM4_150.000_160.000.wav 150.000 160.000 Train
+3fPX1LaGwJo_60.000_70.000.wav 60.000 70.000 Train
+4_gyCWuPxRg_170.000_180.000.wav 170.000 180.000 Train
+4l4vGrMD4Tw_550.000_560.000.wav 550.000 560.000 Train
+4oT0bxldS80_30.000_40.000.wav 30.000 40.000 Train
+4t7Mi3pnSA4_210.000_220.000.wav 210.000 220.000 Train
+53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train
+6OgSNQOTw2U_30.000_40.000.wav 30.000 40.000 Train
+6_TGlFO0DCk_10.000_20.000.wav 10.000 20.000 Train
+7KdSGBzXvz8_420.000_430.000.wav 420.000 430.000 Train
+7W_kcu0CJqI_310.000_320.000.wav 310.000 320.000 Train
+8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train
+8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train
+9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train
+9NT6gEiqpWA_30.000_40.000.wav 30.000 40.000 Train
+AFhll08KM98_30.000_40.000.wav 30.000 40.000 Train
+AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train
+AK0kZUDk294_2.000_12.000.wav 2.000 12.000 Train
+AKPC4rEGoyI_30.000_40.000.wav 30.000 40.000 Train
+APsvUzw7bWA_60.000_70.000.wav 60.000 70.000 Train
+AshwkKUV07s_23.000_33.000.wav 23.000 33.000 Train
+BI2Tol64na0_30.000_40.000.wav 30.000 40.000 Train
+BmS2NiuT2c0_160.000_170.000.wav 160.000 170.000 Train
+CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train
+D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train
+F-JFxERdA2w_30.000_40.000.wav 30.000 40.000 Train
+FoIBRxw0tyE_30.000_40.000.wav 30.000 40.000 Train
+G958vjLYBcI_110.000_120.000.wav 110.000 120.000 Train
+GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train
+GKc8PCTen8Q_310.000_320.000.wav 310.000 320.000 Train
+I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train
+IIIxN_ziy_I_60.000_70.000.wav 60.000 70.000 Train
+IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train
+K-i81KrH8BQ_30.000_40.000.wav 30.000 40.000 Train
+K9pSRLw6FNc_40.000_50.000.wav 40.000 50.000 Train
+KPyYUly5xCc_90.000_100.000.wav 90.000 100.000 Train
+L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train
+LK4b2eJpy24_30.000_40.000.wav 30.000 40.000 Train
+LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train
+MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train
+MDF2vsjm8jU_10.000_20.000.wav 10.000 20.000 Train
+MMfiWJVftMA_60.000_70.000.wav 60.000 70.000 Train
+MYzVHespZ-E_30.000_40.000.wav 30.000 40.000 Train
+Mbe4rlNiM84_0.000_7.000.wav 0.000 7.000 Train
+MczH_PWBNeI_360.000_370.000.wav 360.000 370.000 Train
+Mfkif49LLc4_30.000_40.000.wav 30.000 40.000 Train
+MwSbYICrYj8_290.000_300.000.wav 290.000 300.000 Train
+PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train
+QDTbchu0LrU_30.000_40.000.wav 30.000 40.000 Train
+QZJ5WAYIUh8_70.000_80.000.wav 70.000 80.000 Train
+QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train
+RN-_agT8_Cg_0.000_10.000.wav 0.000 10.000 Train
+R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train
+Rhvy7V4F95Q_40.000_50.000.wav 40.000 50.000 Train
+Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train
+RrlgSfQrqQc_20.000_30.000.wav 20.000 30.000 Train
+RwBKGPEg6uA_340.000_350.000.wav 340.000 350.000 Train
+T73runykdnE_25.000_35.000.wav 25.000 35.000 Train
+T8M6W4yOzI4_30.000_40.000.wav 30.000 40.000 Train
+Tmm4H6alHCE_30.000_40.000.wav 30.000 40.000 Train
+TyTORMEourg_270.000_280.000.wav 270.000 280.000 Train
+UQx0EMXtLZA_60.000_70.000.wav 60.000 70.000 Train
+UZx7OAgRMRY_90.000_100.000.wav 90.000 100.000 Train
+UerX5Bv2hcs_70.000_80.000.wav 70.000 80.000 Train
+UxSUGCvpskM_340.000_350.000.wav 340.000 350.000 Train
+V2hln47cP78_130.000_140.000.wav 130.000 140.000 Train
+VIe_Qkg5RJI_130.000_140.000.wav 130.000 140.000 Train
+WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Train
+WFdpQCtpBB4_30.000_40.000.wav 30.000 40.000 Train
+XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train
+XDTlBb3aYqo_30.000_40.000.wav 30.000 40.000 Train
+XKvLkIM8dck_40.000_50.000.wav 40.000 50.000 Train
+XQbeLJYzY9k_90.000_100.000.wav 90.000 100.000 Train
+XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train
+XeYiNanFS_M_120.000_130.000.wav 120.000 130.000 Train
+Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train
+YDGf-razgyU_250.000_260.000.wav 250.000 260.000 Train
+YFD1Qrlskrg_60.000_70.000.wav 60.000 70.000 Train
+Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train
+Y_ynIwm3qm0_370.000_380.000.wav 370.000 380.000 Train
+Zy0goYEHPHU_30.000_40.000.wav 30.000 40.000 Train
+_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train
+aNO2KEXBCOk_30.000_40.000.wav 30.000 40.000 Train
+aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train
+ahct5yzUtdE_20.000_30.000.wav 20.000 30.000 Train
+arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train
+bCGtzspNbNo_30.000_40.000.wav 30.000 40.000 Train
+bI6wPI9kAm8_70.000_80.000.wav 70.000 80.000 Train
+bpdCMWWiB_0_30.000_40.000.wav 30.000 40.000 Train
+cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Train
+d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train
+dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train
+eRclX9l0F_c_150.000_160.000.wav 150.000 160.000 Train
+fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Train
+fWVfi9pAh_4_10.000_20.000.wav 10.000 20.000 Train
+fztkF47lVQg_0.000_10.000.wav 0.000 10.000 Train
+g0ICxHjC9Uc_30.000_40.000.wav 30.000 40.000 Train
+g2scd3YVgwQ_30.000_40.000.wav 30.000 40.000 Train
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train
+g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train
+gKMpowHeyKc_30.000_40.000.wav 30.000 40.000 Train
+gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train
+gU0mD2fSh4c_500.000_510.000.wav 500.000 510.000 Train
+gkH_Zxasn8o_40.000_50.000.wav 40.000 50.000 Train
+gvnM4kK4r70_10.000_20.000.wav 10.000 20.000 Train
+hH_M56EnnDk_30.000_40.000.wav 30.000 40.000 Train
+hVvtTC9AmNs_30.000_40.000.wav 30.000 40.000 Train
+hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train
+hdYQzH2E-e4_310.000_320.000.wav 310.000 320.000 Train
+iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train
+j9Z63H5hvrQ_0.000_10.000.wav 0.000 10.000 Train
+jbW2ew8VMfU_50.000_60.000.wav 50.000 60.000 Train
+jlz7r-NSUuA_50.000_60.000.wav 50.000 60.000 Train
+k0vRZm7ZnQk_280.000_290.000.wav 280.000 290.000 Train
+k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train
+kbfkq3TuAe0_470.000_480.000.wav 470.000 480.000 Train
+lf1Sblrda3A_560.000_570.000.wav 560.000 570.000 Train
+m4DS9-5Gkds_30.000_40.000.wav 30.000 40.000 Train
+m5HeCy87QYY_380.000_390.000.wav 380.000 390.000 Train
+nKM4MUAsVzg_100.000_110.000.wav 100.000 110.000 Train
+nY1gcEMzsWI_10.000_20.000.wav 10.000 20.000 Train
+nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train
+oogrnx-_LBA_60.000_70.000.wav 60.000 70.000 Train
+pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train
+pbOZLMrJy0A_0.000_10.000.wav 0.000 10.000 Train
+pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train
+q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train
+qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train
+r6mHSfFkY_8_30.000_40.000.wav 30.000 40.000 Train
+rNNPQ9DD4no_30.000_40.000.wav 30.000 40.000 Train
+rSrBDAgLUoI_460.000_470.000.wav 460.000 470.000 Train
+stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train
+t_lFhyZaZR0_150.000_160.000.wav 150.000 160.000 Train
+txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train
+uZfsEDo3elY_20.000_30.000.wav 20.000 30.000 Train
+umcnfA9veOw_160.000_170.000.wav 160.000 170.000 Train
+uysTr0SfhLI_10.000_20.000.wav 10.000 20.000 Train
+wM9wNgY8d4g_150.000_160.000.wav 150.000 160.000 Train
+xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train
+xshKOSEF_6o_0.000_10.000.wav 0.000 10.000 Train
+yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train
+yH1r2Bblluw_240.000_250.000.wav 240.000 250.000 Train
+yywGJu6jp8U_30.000_40.000.wav 30.000 40.000 Train
+z5uKFGeTtNg_30.000_40.000.wav 30.000 40.000 Train
diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d98569b2bb2a47882ab09081c204bc66823b5053
--- /dev/null
+++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv
@@ -0,0 +1,606 @@
+-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn
+-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn
+-GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn
+-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn
+-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn
+-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn
+-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn
+-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn
+-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train horn
+-u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn
+-zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn
+02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn
+0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn
+0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn
+0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn
+0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn
+0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn
+10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn
+1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn
+1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn
+1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn
+1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn
+1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn
+1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn
+1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn
+26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn
+2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn
+2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn
+2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn
+2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn
+-8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn
+-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
+-v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
+04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn
+0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn
+0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
+1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
+26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn
+2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn
+2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn
+2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn
+35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
+3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
+3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
+46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn
+4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
+-ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm
+-hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm
+-rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm
+0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm
+0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm
+0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car alarm
+0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm
+17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm
+3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm
+3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm
+4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm
+4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm
+4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm
+5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm
+54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm
+5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm
+5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm
+7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm
+7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm
+7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm
+7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm
+7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm
+8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm
+9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm
+9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm
+9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm
+A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm
+A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm
+APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm
+AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm
+-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps
+03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps
+0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps
+0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps
+0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps
+-5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle
+-D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle
+-F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle
+-FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle
+-FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle
+-Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle
+-Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle
+-Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle
+-KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle
+-NCcqKWiGus_30.000_40.000.wav 30.000 40.000 Bicycle
+-NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle
+-OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle
+-RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle
+-XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle
+-XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle
+-bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle
+-e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle
+-fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle
+-fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle
+-fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle
+-ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle
+-k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle
+-k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle
+-lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle
+-mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle
+-ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle
+-nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle
+-oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle
+-p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle
+-qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle
+-0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard
+-53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard
+-5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard
+-81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard
+-9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard
+-9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard
+-AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard
+-DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard
+-DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard
+-E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard
+-E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard
+-McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard
+-N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard
+-O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard
+-Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard
+-Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard
+-aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard
+-cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard
+-esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard
+-iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard
+-lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard
+-mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard
+-nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard
+-oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard
+-s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard
+-sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard
+-saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard
+-sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard
+-tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard
+-x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard
+--ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
+-LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
+-Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+-d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren)
+-dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
+-fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren)
+-fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
+-jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren)
+-lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren)
+-xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren)
+-yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
+06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
+0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+-0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren)
+-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
+-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
+-8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren)
+-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren)
+-QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren)
+-RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren)
+-USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
+-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren)
+-eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
+-hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
+-nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren)
+-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+-09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren
+-6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren
+-wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
+03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren
+06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren
+-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren)
+-6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren)
+-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren)
+-AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren)
+-F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren)
+-Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren)
+-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren)
+-MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren)
+-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren)
+-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren)
+-XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren)
+-XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren)
+-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren)
+-bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren)
+-haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren)
+-rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren)
+-rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren)
+-20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming
+-3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming
+-4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming
+-7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming
+-7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming
+-GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming
+-MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming
+-WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming
+-YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming
+-g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming
+-gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming
+-iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming
+-nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming
+-pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming
+-pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming
+-sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming
+-uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming
+-vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming
+0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming
+09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming
+0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming
+0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming
+0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming
+0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming
+0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming
+0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming
+0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming
+0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming
+0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming
+0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming
+---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car
+--330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car
+--8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car
+--9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car
+--F70LWypIg_30.000_40.000.wav 30.000 40.000 Car
+--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car
+--QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car
+--SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car
+--Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car
+--e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car
+--iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car
+--jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car
+--jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car
+--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car
+--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car
+--yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car
+--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car
+-0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car
+-03ld83JliM_29.000_39.000.wav 29.000 39.000 Car
+-0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car
+-0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car
+-0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car
+-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car
+-0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car
+-0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car
+-0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car
+-0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car
+-0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car
+-0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car
+-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car
+---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by
+--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by
+--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by
+--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by
+--zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by
+-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by
+-0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by
+-0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by
+-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by
+-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by
+-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by
+-15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by
+-19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by
+-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by
+-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by
+-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by
+-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by
+-2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by
+-21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by
+-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by
+-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by
+-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by
+-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by
+-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by
+-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by
+-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by
+-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by
+-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by
+-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by
+-3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by
+--0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus
+-0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus
+-5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus
+-5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus
+-79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus
+-7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus
+-8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus
+-93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus
+-9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus
+-9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus
+-AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus
+-AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus
+-BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus
+-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus
+-CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus
+-CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus
+-DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus
+-FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus
+-Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus
+-GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus
+-Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus
+-H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus
+-HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus
+-HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus
+-I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus
+-II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus
+-LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus
+-OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus
+-P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus
+-PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus
+--12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck
+--73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck
+--J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck
+--bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck
+--ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck
+--wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck
+-0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck
+-0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck
+-0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck
+-0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck
+-0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck
+-0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck
+-14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck
+-1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck
+-1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck
+-1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck
+-1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck
+-1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck
+-26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck
+-2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck
+-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck
+-2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck
+-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck
+-2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck
+-2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck
+-39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck
+-3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck
+-3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck
+-3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck
+-3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck
+--p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle
+-1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle
+-1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle
+-3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle
+-4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle
+-4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle
+-5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle
+-6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle
+-6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle
+-7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle
+-7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle
+-85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle
+-9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle
+-9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle
+-BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle
+-DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle
+-HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle
+-H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle
+-HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle
+-IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle
+-Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle
+-J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle
+-KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle
+-KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle
+-P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle
+-QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle
+-S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle
+-SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle
+-US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle
+-VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle
+--H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train
+-1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train
+-1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train
+-3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train
+-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train
+-6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train
+-97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train
+-9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train
+-CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train
+-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train
+-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train
+-JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train
+-K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train
+-KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train
+-NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train
+-PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train
+-P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train
+-Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train
+-QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train
+-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train
+-RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train
+-VH414svzI0_30.000_40.000.wav 30.000 40.000 Train
+-WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train
+-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train
+-XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train
+-Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train
+-ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train
+-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train
+-aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train
+-abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train
+1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
+7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren)
+-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren)
+4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
+35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren
+06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren)
+0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren)
+0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren)
+17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren)
+4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren)
+-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car
+-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car
+-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car
+-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car
+-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car
+-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car
+-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car
+-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car
+-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car
+-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car
+-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car
+-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car
+-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car
+-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car
+-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car
+-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car
+-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car
+-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car
+-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car
+-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car
+-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car
+-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car
+06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car
+0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car
+0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car
+4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car
+5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car
+7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car
+9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car
+9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car
+-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by
+9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by
+-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus
+-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck
+-4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck
+-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck
+-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck
+-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck
+-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck
+-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck
+-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck
+-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck
+-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck
+-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck
+-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck
+-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck
+-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck
+-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck
+-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck
+-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck
+-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck
+-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck
+-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck
+-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck
+-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck
+-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck
+-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck
+0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck
+0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck
+0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck
+0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck
+0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck
+3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck
+-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train
+02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train
+0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train
+0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train
+0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train
+0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train
+0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train
+10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train
+1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train
+1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train
+1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train
+1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train
+1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train
+1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train
+26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train
+2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train
+2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train
+2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train
+2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train
+3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train
diff --git a/audio_detection/audio_infer/metadata/class_labels_indices.csv b/audio_detection/audio_infer/metadata/class_labels_indices.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3a2767e81114adecde59992cf6607f31c1862f4c
--- /dev/null
+++ b/audio_detection/audio_infer/metadata/class_labels_indices.csv
@@ -0,0 +1,528 @@
+index,mid,display_name
+0,/m/09x0r,"Speech"
+1,/m/05zppz,"Male speech, man speaking"
+2,/m/02zsn,"Female speech, woman speaking"
+3,/m/0ytgt,"Child speech, kid speaking"
+4,/m/01h8n0,"Conversation"
+5,/m/02qldy,"Narration, monologue"
+6,/m/0261r1,"Babbling"
+7,/m/0brhx,"Speech synthesizer"
+8,/m/07p6fty,"Shout"
+9,/m/07q4ntr,"Bellow"
+10,/m/07rwj3x,"Whoop"
+11,/m/07sr1lc,"Yell"
+12,/m/04gy_2,"Battle cry"
+13,/t/dd00135,"Children shouting"
+14,/m/03qc9zr,"Screaming"
+15,/m/02rtxlg,"Whispering"
+16,/m/01j3sz,"Laughter"
+17,/t/dd00001,"Baby laughter"
+18,/m/07r660_,"Giggle"
+19,/m/07s04w4,"Snicker"
+20,/m/07sq110,"Belly laugh"
+21,/m/07rgt08,"Chuckle, chortle"
+22,/m/0463cq4,"Crying, sobbing"
+23,/t/dd00002,"Baby cry, infant cry"
+24,/m/07qz6j3,"Whimper"
+25,/m/07qw_06,"Wail, moan"
+26,/m/07plz5l,"Sigh"
+27,/m/015lz1,"Singing"
+28,/m/0l14jd,"Choir"
+29,/m/01swy6,"Yodeling"
+30,/m/02bk07,"Chant"
+31,/m/01c194,"Mantra"
+32,/t/dd00003,"Male singing"
+33,/t/dd00004,"Female singing"
+34,/t/dd00005,"Child singing"
+35,/t/dd00006,"Synthetic singing"
+36,/m/06bxc,"Rapping"
+37,/m/02fxyj,"Humming"
+38,/m/07s2xch,"Groan"
+39,/m/07r4k75,"Grunt"
+40,/m/01w250,"Whistling"
+41,/m/0lyf6,"Breathing"
+42,/m/07mzm6,"Wheeze"
+43,/m/01d3sd,"Snoring"
+44,/m/07s0dtb,"Gasp"
+45,/m/07pyy8b,"Pant"
+46,/m/07q0yl5,"Snort"
+47,/m/01b_21,"Cough"
+48,/m/0dl9sf8,"Throat clearing"
+49,/m/01hsr_,"Sneeze"
+50,/m/07ppn3j,"Sniff"
+51,/m/06h7j,"Run"
+52,/m/07qv_x_,"Shuffle"
+53,/m/07pbtc8,"Walk, footsteps"
+54,/m/03cczk,"Chewing, mastication"
+55,/m/07pdhp0,"Biting"
+56,/m/0939n_,"Gargling"
+57,/m/01g90h,"Stomach rumble"
+58,/m/03q5_w,"Burping, eructation"
+59,/m/02p3nc,"Hiccup"
+60,/m/02_nn,"Fart"
+61,/m/0k65p,"Hands"
+62,/m/025_jnm,"Finger snapping"
+63,/m/0l15bq,"Clapping"
+64,/m/01jg02,"Heart sounds, heartbeat"
+65,/m/01jg1z,"Heart murmur"
+66,/m/053hz1,"Cheering"
+67,/m/028ght,"Applause"
+68,/m/07rkbfh,"Chatter"
+69,/m/03qtwd,"Crowd"
+70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
+71,/t/dd00013,"Children playing"
+72,/m/0jbk,"Animal"
+73,/m/068hy,"Domestic animals, pets"
+74,/m/0bt9lr,"Dog"
+75,/m/05tny_,"Bark"
+76,/m/07r_k2n,"Yip"
+77,/m/07qf0zm,"Howl"
+78,/m/07rc7d9,"Bow-wow"
+79,/m/0ghcn6,"Growling"
+80,/t/dd00136,"Whimper (dog)"
+81,/m/01yrx,"Cat"
+82,/m/02yds9,"Purr"
+83,/m/07qrkrw,"Meow"
+84,/m/07rjwbb,"Hiss"
+85,/m/07r81j2,"Caterwaul"
+86,/m/0ch8v,"Livestock, farm animals, working animals"
+87,/m/03k3r,"Horse"
+88,/m/07rv9rh,"Clip-clop"
+89,/m/07q5rw0,"Neigh, whinny"
+90,/m/01xq0k1,"Cattle, bovinae"
+91,/m/07rpkh9,"Moo"
+92,/m/0239kh,"Cowbell"
+93,/m/068zj,"Pig"
+94,/t/dd00018,"Oink"
+95,/m/03fwl,"Goat"
+96,/m/07q0h5t,"Bleat"
+97,/m/07bgp,"Sheep"
+98,/m/025rv6n,"Fowl"
+99,/m/09b5t,"Chicken, rooster"
+100,/m/07st89h,"Cluck"
+101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
+102,/m/01rd7k,"Turkey"
+103,/m/07svc2k,"Gobble"
+104,/m/09ddx,"Duck"
+105,/m/07qdb04,"Quack"
+106,/m/0dbvp,"Goose"
+107,/m/07qwf61,"Honk"
+108,/m/01280g,"Wild animals"
+109,/m/0cdnk,"Roaring cats (lions, tigers)"
+110,/m/04cvmfc,"Roar"
+111,/m/015p6,"Bird"
+112,/m/020bb7,"Bird vocalization, bird call, bird song"
+113,/m/07pggtn,"Chirp, tweet"
+114,/m/07sx8x_,"Squawk"
+115,/m/0h0rv,"Pigeon, dove"
+116,/m/07r_25d,"Coo"
+117,/m/04s8yn,"Crow"
+118,/m/07r5c2p,"Caw"
+119,/m/09d5_,"Owl"
+120,/m/07r_80w,"Hoot"
+121,/m/05_wcq,"Bird flight, flapping wings"
+122,/m/01z5f,"Canidae, dogs, wolves"
+123,/m/06hps,"Rodents, rats, mice"
+124,/m/04rmv,"Mouse"
+125,/m/07r4gkf,"Patter"
+126,/m/03vt0,"Insect"
+127,/m/09xqv,"Cricket"
+128,/m/09f96,"Mosquito"
+129,/m/0h2mp,"Fly, housefly"
+130,/m/07pjwq1,"Buzz"
+131,/m/01h3n,"Bee, wasp, etc."
+132,/m/09ld4,"Frog"
+133,/m/07st88b,"Croak"
+134,/m/078jl,"Snake"
+135,/m/07qn4z3,"Rattle"
+136,/m/032n05,"Whale vocalization"
+137,/m/04rlf,"Music"
+138,/m/04szw,"Musical instrument"
+139,/m/0fx80y,"Plucked string instrument"
+140,/m/0342h,"Guitar"
+141,/m/02sgy,"Electric guitar"
+142,/m/018vs,"Bass guitar"
+143,/m/042v_gx,"Acoustic guitar"
+144,/m/06w87,"Steel guitar, slide guitar"
+145,/m/01glhc,"Tapping (guitar technique)"
+146,/m/07s0s5r,"Strum"
+147,/m/018j2,"Banjo"
+148,/m/0jtg0,"Sitar"
+149,/m/04rzd,"Mandolin"
+150,/m/01bns_,"Zither"
+151,/m/07xzm,"Ukulele"
+152,/m/05148p4,"Keyboard (musical)"
+153,/m/05r5c,"Piano"
+154,/m/01s0ps,"Electric piano"
+155,/m/013y1f,"Organ"
+156,/m/03xq_f,"Electronic organ"
+157,/m/03gvt,"Hammond organ"
+158,/m/0l14qv,"Synthesizer"
+159,/m/01v1d8,"Sampler"
+160,/m/03q5t,"Harpsichord"
+161,/m/0l14md,"Percussion"
+162,/m/02hnl,"Drum kit"
+163,/m/0cfdd,"Drum machine"
+164,/m/026t6,"Drum"
+165,/m/06rvn,"Snare drum"
+166,/m/03t3fj,"Rimshot"
+167,/m/02k_mr,"Drum roll"
+168,/m/0bm02,"Bass drum"
+169,/m/011k_j,"Timpani"
+170,/m/01p970,"Tabla"
+171,/m/01qbl,"Cymbal"
+172,/m/03qtq,"Hi-hat"
+173,/m/01sm1g,"Wood block"
+174,/m/07brj,"Tambourine"
+175,/m/05r5wn,"Rattle (instrument)"
+176,/m/0xzly,"Maraca"
+177,/m/0mbct,"Gong"
+178,/m/016622,"Tubular bells"
+179,/m/0j45pbj,"Mallet percussion"
+180,/m/0dwsp,"Marimba, xylophone"
+181,/m/0dwtp,"Glockenspiel"
+182,/m/0dwt5,"Vibraphone"
+183,/m/0l156b,"Steelpan"
+184,/m/05pd6,"Orchestra"
+185,/m/01kcd,"Brass instrument"
+186,/m/0319l,"French horn"
+187,/m/07gql,"Trumpet"
+188,/m/07c6l,"Trombone"
+189,/m/0l14_3,"Bowed string instrument"
+190,/m/02qmj0d,"String section"
+191,/m/07y_7,"Violin, fiddle"
+192,/m/0d8_n,"Pizzicato"
+193,/m/01xqw,"Cello"
+194,/m/02fsn,"Double bass"
+195,/m/085jw,"Wind instrument, woodwind instrument"
+196,/m/0l14j_,"Flute"
+197,/m/06ncr,"Saxophone"
+198,/m/01wy6,"Clarinet"
+199,/m/03m5k,"Harp"
+200,/m/0395lw,"Bell"
+201,/m/03w41f,"Church bell"
+202,/m/027m70_,"Jingle bell"
+203,/m/0gy1t2s,"Bicycle bell"
+204,/m/07n_g,"Tuning fork"
+205,/m/0f8s22,"Chime"
+206,/m/026fgl,"Wind chime"
+207,/m/0150b9,"Change ringing (campanology)"
+208,/m/03qjg,"Harmonica"
+209,/m/0mkg,"Accordion"
+210,/m/0192l,"Bagpipes"
+211,/m/02bxd,"Didgeridoo"
+212,/m/0l14l2,"Shofar"
+213,/m/07kc_,"Theremin"
+214,/m/0l14t7,"Singing bowl"
+215,/m/01hgjl,"Scratching (performance technique)"
+216,/m/064t9,"Pop music"
+217,/m/0glt670,"Hip hop music"
+218,/m/02cz_7,"Beatboxing"
+219,/m/06by7,"Rock music"
+220,/m/03lty,"Heavy metal"
+221,/m/05r6t,"Punk rock"
+222,/m/0dls3,"Grunge"
+223,/m/0dl5d,"Progressive rock"
+224,/m/07sbbz2,"Rock and roll"
+225,/m/05w3f,"Psychedelic rock"
+226,/m/06j6l,"Rhythm and blues"
+227,/m/0gywn,"Soul music"
+228,/m/06cqb,"Reggae"
+229,/m/01lyv,"Country"
+230,/m/015y_n,"Swing music"
+231,/m/0gg8l,"Bluegrass"
+232,/m/02x8m,"Funk"
+233,/m/02w4v,"Folk music"
+234,/m/06j64v,"Middle Eastern music"
+235,/m/03_d0,"Jazz"
+236,/m/026z9,"Disco"
+237,/m/0ggq0m,"Classical music"
+238,/m/05lls,"Opera"
+239,/m/02lkt,"Electronic music"
+240,/m/03mb9,"House music"
+241,/m/07gxw,"Techno"
+242,/m/07s72n,"Dubstep"
+243,/m/0283d,"Drum and bass"
+244,/m/0m0jc,"Electronica"
+245,/m/08cyft,"Electronic dance music"
+246,/m/0fd3y,"Ambient music"
+247,/m/07lnk,"Trance music"
+248,/m/0g293,"Music of Latin America"
+249,/m/0ln16,"Salsa music"
+250,/m/0326g,"Flamenco"
+251,/m/0155w,"Blues"
+252,/m/05fw6t,"Music for children"
+253,/m/02v2lh,"New-age music"
+254,/m/0y4f8,"Vocal music"
+255,/m/0z9c,"A capella"
+256,/m/0164x2,"Music of Africa"
+257,/m/0145m,"Afrobeat"
+258,/m/02mscn,"Christian music"
+259,/m/016cjb,"Gospel music"
+260,/m/028sqc,"Music of Asia"
+261,/m/015vgc,"Carnatic music"
+262,/m/0dq0md,"Music of Bollywood"
+263,/m/06rqw,"Ska"
+264,/m/02p0sh1,"Traditional music"
+265,/m/05rwpb,"Independent music"
+266,/m/074ft,"Song"
+267,/m/025td0t,"Background music"
+268,/m/02cjck,"Theme music"
+269,/m/03r5q_,"Jingle (music)"
+270,/m/0l14gg,"Soundtrack music"
+271,/m/07pkxdp,"Lullaby"
+272,/m/01z7dr,"Video game music"
+273,/m/0140xf,"Christmas music"
+274,/m/0ggx5q,"Dance music"
+275,/m/04wptg,"Wedding music"
+276,/t/dd00031,"Happy music"
+277,/t/dd00032,"Funny music"
+278,/t/dd00033,"Sad music"
+279,/t/dd00034,"Tender music"
+280,/t/dd00035,"Exciting music"
+281,/t/dd00036,"Angry music"
+282,/t/dd00037,"Scary music"
+283,/m/03m9d0z,"Wind"
+284,/m/09t49,"Rustling leaves"
+285,/t/dd00092,"Wind noise (microphone)"
+286,/m/0jb2l,"Thunderstorm"
+287,/m/0ngt1,"Thunder"
+288,/m/0838f,"Water"
+289,/m/06mb1,"Rain"
+290,/m/07r10fb,"Raindrop"
+291,/t/dd00038,"Rain on surface"
+292,/m/0j6m2,"Stream"
+293,/m/0j2kx,"Waterfall"
+294,/m/05kq4,"Ocean"
+295,/m/034srq,"Waves, surf"
+296,/m/06wzb,"Steam"
+297,/m/07swgks,"Gurgling"
+298,/m/02_41,"Fire"
+299,/m/07pzfmf,"Crackle"
+300,/m/07yv9,"Vehicle"
+301,/m/019jd,"Boat, Water vehicle"
+302,/m/0hsrw,"Sailboat, sailing ship"
+303,/m/056ks2,"Rowboat, canoe, kayak"
+304,/m/02rlv9,"Motorboat, speedboat"
+305,/m/06q74,"Ship"
+306,/m/012f08,"Motor vehicle (road)"
+307,/m/0k4j,"Car"
+308,/m/0912c9,"Vehicle horn, car horn, honking"
+309,/m/07qv_d5,"Toot"
+310,/m/02mfyn,"Car alarm"
+311,/m/04gxbd,"Power windows, electric windows"
+312,/m/07rknqz,"Skidding"
+313,/m/0h9mv,"Tire squeal"
+314,/t/dd00134,"Car passing by"
+315,/m/0ltv,"Race car, auto racing"
+316,/m/07r04,"Truck"
+317,/m/0gvgw0,"Air brake"
+318,/m/05x_td,"Air horn, truck horn"
+319,/m/02rhddq,"Reversing beeps"
+320,/m/03cl9h,"Ice cream truck, ice cream van"
+321,/m/01bjv,"Bus"
+322,/m/03j1ly,"Emergency vehicle"
+323,/m/04qvtq,"Police car (siren)"
+324,/m/012n7d,"Ambulance (siren)"
+325,/m/012ndj,"Fire engine, fire truck (siren)"
+326,/m/04_sv,"Motorcycle"
+327,/m/0btp2,"Traffic noise, roadway noise"
+328,/m/06d_3,"Rail transport"
+329,/m/07jdr,"Train"
+330,/m/04zmvq,"Train whistle"
+331,/m/0284vy3,"Train horn"
+332,/m/01g50p,"Railroad car, train wagon"
+333,/t/dd00048,"Train wheels squealing"
+334,/m/0195fx,"Subway, metro, underground"
+335,/m/0k5j,"Aircraft"
+336,/m/014yck,"Aircraft engine"
+337,/m/04229,"Jet engine"
+338,/m/02l6bg,"Propeller, airscrew"
+339,/m/09ct_,"Helicopter"
+340,/m/0cmf2,"Fixed-wing aircraft, airplane"
+341,/m/0199g,"Bicycle"
+342,/m/06_fw,"Skateboard"
+343,/m/02mk9,"Engine"
+344,/t/dd00065,"Light engine (high frequency)"
+345,/m/08j51y,"Dental drill, dentist's drill"
+346,/m/01yg9g,"Lawn mower"
+347,/m/01j4z9,"Chainsaw"
+348,/t/dd00066,"Medium engine (mid frequency)"
+349,/t/dd00067,"Heavy engine (low frequency)"
+350,/m/01h82_,"Engine knocking"
+351,/t/dd00130,"Engine starting"
+352,/m/07pb8fc,"Idling"
+353,/m/07q2z82,"Accelerating, revving, vroom"
+354,/m/02dgv,"Door"
+355,/m/03wwcy,"Doorbell"
+356,/m/07r67yg,"Ding-dong"
+357,/m/02y_763,"Sliding door"
+358,/m/07rjzl8,"Slam"
+359,/m/07r4wb8,"Knock"
+360,/m/07qcpgn,"Tap"
+361,/m/07q6cd_,"Squeak"
+362,/m/0642b4,"Cupboard open or close"
+363,/m/0fqfqc,"Drawer open or close"
+364,/m/04brg2,"Dishes, pots, and pans"
+365,/m/023pjk,"Cutlery, silverware"
+366,/m/07pn_8q,"Chopping (food)"
+367,/m/0dxrf,"Frying (food)"
+368,/m/0fx9l,"Microwave oven"
+369,/m/02pjr4,"Blender"
+370,/m/02jz0l,"Water tap, faucet"
+371,/m/0130jx,"Sink (filling or washing)"
+372,/m/03dnzn,"Bathtub (filling or washing)"
+373,/m/03wvsk,"Hair dryer"
+374,/m/01jt3m,"Toilet flush"
+375,/m/012xff,"Toothbrush"
+376,/m/04fgwm,"Electric toothbrush"
+377,/m/0d31p,"Vacuum cleaner"
+378,/m/01s0vc,"Zipper (clothing)"
+379,/m/03v3yw,"Keys jangling"
+380,/m/0242l,"Coin (dropping)"
+381,/m/01lsmm,"Scissors"
+382,/m/02g901,"Electric shaver, electric razor"
+383,/m/05rj2,"Shuffling cards"
+384,/m/0316dw,"Typing"
+385,/m/0c2wf,"Typewriter"
+386,/m/01m2v,"Computer keyboard"
+387,/m/081rb,"Writing"
+388,/m/07pp_mv,"Alarm"
+389,/m/07cx4,"Telephone"
+390,/m/07pp8cl,"Telephone bell ringing"
+391,/m/01hnzm,"Ringtone"
+392,/m/02c8p,"Telephone dialing, DTMF"
+393,/m/015jpf,"Dial tone"
+394,/m/01z47d,"Busy signal"
+395,/m/046dlr,"Alarm clock"
+396,/m/03kmc9,"Siren"
+397,/m/0dgbq,"Civil defense siren"
+398,/m/030rvx,"Buzzer"
+399,/m/01y3hg,"Smoke detector, smoke alarm"
+400,/m/0c3f7m,"Fire alarm"
+401,/m/04fq5q,"Foghorn"
+402,/m/0l156k,"Whistle"
+403,/m/06hck5,"Steam whistle"
+404,/t/dd00077,"Mechanisms"
+405,/m/02bm9n,"Ratchet, pawl"
+406,/m/01x3z,"Clock"
+407,/m/07qjznt,"Tick"
+408,/m/07qjznl,"Tick-tock"
+409,/m/0l7xg,"Gears"
+410,/m/05zc1,"Pulleys"
+411,/m/0llzx,"Sewing machine"
+412,/m/02x984l,"Mechanical fan"
+413,/m/025wky1,"Air conditioning"
+414,/m/024dl,"Cash register"
+415,/m/01m4t,"Printer"
+416,/m/0dv5r,"Camera"
+417,/m/07bjf,"Single-lens reflex camera"
+418,/m/07k1x,"Tools"
+419,/m/03l9g,"Hammer"
+420,/m/03p19w,"Jackhammer"
+421,/m/01b82r,"Sawing"
+422,/m/02p01q,"Filing (rasp)"
+423,/m/023vsd,"Sanding"
+424,/m/0_ksk,"Power tool"
+425,/m/01d380,"Drill"
+426,/m/014zdl,"Explosion"
+427,/m/032s66,"Gunshot, gunfire"
+428,/m/04zjc,"Machine gun"
+429,/m/02z32qm,"Fusillade"
+430,/m/0_1c,"Artillery fire"
+431,/m/073cg4,"Cap gun"
+432,/m/0g6b5,"Fireworks"
+433,/g/122z_qxw,"Firecracker"
+434,/m/07qsvvw,"Burst, pop"
+435,/m/07pxg6y,"Eruption"
+436,/m/07qqyl4,"Boom"
+437,/m/083vt,"Wood"
+438,/m/07pczhz,"Chop"
+439,/m/07pl1bw,"Splinter"
+440,/m/07qs1cx,"Crack"
+441,/m/039jq,"Glass"
+442,/m/07q7njn,"Chink, clink"
+443,/m/07rn7sz,"Shatter"
+444,/m/04k94,"Liquid"
+445,/m/07rrlb6,"Splash, splatter"
+446,/m/07p6mqd,"Slosh"
+447,/m/07qlwh6,"Squish"
+448,/m/07r5v4s,"Drip"
+449,/m/07prgkl,"Pour"
+450,/m/07pqc89,"Trickle, dribble"
+451,/t/dd00088,"Gush"
+452,/m/07p7b8y,"Fill (with liquid)"
+453,/m/07qlf79,"Spray"
+454,/m/07ptzwd,"Pump (liquid)"
+455,/m/07ptfmf,"Stir"
+456,/m/0dv3j,"Boiling"
+457,/m/0790c,"Sonar"
+458,/m/0dl83,"Arrow"
+459,/m/07rqsjt,"Whoosh, swoosh, swish"
+460,/m/07qnq_y,"Thump, thud"
+461,/m/07rrh0c,"Thunk"
+462,/m/0b_fwt,"Electronic tuner"
+463,/m/02rr_,"Effects unit"
+464,/m/07m2kt,"Chorus effect"
+465,/m/018w8,"Basketball bounce"
+466,/m/07pws3f,"Bang"
+467,/m/07ryjzk,"Slap, smack"
+468,/m/07rdhzs,"Whack, thwack"
+469,/m/07pjjrj,"Smash, crash"
+470,/m/07pc8lb,"Breaking"
+471,/m/07pqn27,"Bouncing"
+472,/m/07rbp7_,"Whip"
+473,/m/07pyf11,"Flap"
+474,/m/07qb_dv,"Scratch"
+475,/m/07qv4k0,"Scrape"
+476,/m/07pdjhy,"Rub"
+477,/m/07s8j8t,"Roll"
+478,/m/07plct2,"Crushing"
+479,/t/dd00112,"Crumpling, crinkling"
+480,/m/07qcx4z,"Tearing"
+481,/m/02fs_r,"Beep, bleep"
+482,/m/07qwdck,"Ping"
+483,/m/07phxs1,"Ding"
+484,/m/07rv4dm,"Clang"
+485,/m/07s02z0,"Squeal"
+486,/m/07qh7jl,"Creak"
+487,/m/07qwyj0,"Rustle"
+488,/m/07s34ls,"Whir"
+489,/m/07qmpdm,"Clatter"
+490,/m/07p9k1k,"Sizzle"
+491,/m/07qc9xj,"Clicking"
+492,/m/07rwm0c,"Clickety-clack"
+493,/m/07phhsh,"Rumble"
+494,/m/07qyrcz,"Plop"
+495,/m/07qfgpx,"Jingle, tinkle"
+496,/m/07rcgpl,"Hum"
+497,/m/07p78v5,"Zing"
+498,/t/dd00121,"Boing"
+499,/m/07s12q4,"Crunch"
+500,/m/028v0c,"Silence"
+501,/m/01v_m0,"Sine wave"
+502,/m/0b9m1,"Harmonic"
+503,/m/0hdsk,"Chirp tone"
+504,/m/0c1dj,"Sound effect"
+505,/m/07pt_g0,"Pulse"
+506,/t/dd00125,"Inside, small room"
+507,/t/dd00126,"Inside, large room or hall"
+508,/t/dd00127,"Inside, public space"
+509,/t/dd00128,"Outside, urban or manmade"
+510,/t/dd00129,"Outside, rural or natural"
+511,/m/01b9nn,"Reverberation"
+512,/m/01jnbd,"Echo"
+513,/m/096m7z,"Noise"
+514,/m/06_y0by,"Environmental noise"
+515,/m/07rgkc5,"Static"
+516,/m/06xkwv,"Mains hum"
+517,/m/0g12c5,"Distortion"
+518,/m/08p9q4,"Sidetone"
+519,/m/07szfh9,"Cacophony"
+520,/m/0chx_,"White noise"
+521,/m/0cj0r,"Pink noise"
+522,/m/07p_0gm,"Throbbing"
+523,/m/01jwx6,"Vibration"
+524,/m/07c52,"Television"
+525,/m/06bz3,"Radio"
+526,/m/07hvw1,"Field recording"
diff --git a/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc b/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4699888fb861c2ffee9c8575b4116eba8e7a41b6
Binary files /dev/null and b/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc b/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4489e8b01c6cced77a08735295746c01e8f831
Binary files /dev/null and b/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/pytorch/evaluate.py b/audio_detection/audio_infer/pytorch/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f1fa38eedd9e9cd2580143ceb92aba8f81becf3
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/evaluate.py
@@ -0,0 +1,42 @@
+from sklearn import metrics
+
+from pytorch_utils import forward
+
+
+class Evaluator(object):
+ def __init__(self, model):
+ """Evaluator.
+
+ Args:
+ model: object
+ """
+ self.model = model
+
+ def evaluate(self, data_loader):
+ """Forward evaluation data and calculate statistics.
+
+ Args:
+ data_loader: object
+
+ Returns:
+ statistics: dict,
+ {'average_precision': (classes_num,), 'auc': (classes_num,)}
+ """
+
+ # Forward
+ output_dict = forward(
+ model=self.model,
+ generator=data_loader,
+ return_target=True)
+
+ clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num)
+ target = output_dict['target'] # (audios_num, classes_num)
+
+ average_precision = metrics.average_precision_score(
+ target, clipwise_output, average=None)
+
+ auc = metrics.roc_auc_score(target, clipwise_output, average=None)
+
+ statistics = {'average_precision': average_precision, 'auc': auc}
+
+ return statistics
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/finetune_template.py b/audio_detection/audio_infer/pytorch/finetune_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd43e462c47857f805b1ef4d345711354a1cff3d
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/finetune_template.py
@@ -0,0 +1,127 @@
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
+import numpy as np
+import argparse
+import h5py
+import math
+import time
+import logging
+import matplotlib.pyplot as plt
+
+import torch
+torch.backends.cudnn.benchmark=True
+torch.manual_seed(0)
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data
+
+from utilities import get_filename
+from models import *
+import config
+
+
+class Transfer_Cnn14(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num, freeze_base):
+ """Classifier for a new task using pretrained Cnn14 as a sub module.
+ """
+ super(Transfer_Cnn14, self).__init__()
+ audioset_classes_num = 527
+
+ self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, audioset_classes_num)
+
+ # Transfer to another task layer
+ self.fc_transfer = nn.Linear(2048, classes_num, bias=True)
+
+ if freeze_base:
+ # Freeze AudioSet pretrained layers
+ for param in self.base.parameters():
+ param.requires_grad = False
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_layer(self.fc_transfer)
+
+ def load_from_pretrain(self, pretrained_checkpoint_path):
+ checkpoint = torch.load(pretrained_checkpoint_path)
+ self.base.load_state_dict(checkpoint['model'])
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, data_length)
+ """
+ output_dict = self.base(input, mixup_lambda)
+ embedding = output_dict['embedding']
+
+ clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1)
+ output_dict['clipwise_output'] = clipwise_output
+
+ return output_dict
+
+
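+# A minimal usage sketch (the hyperparameter values and the checkpoint filename
+# below are illustrative placeholders, not recommended settings):
+#
+#     model = Transfer_Cnn14(sample_rate=32000, window_size=1024, hop_size=320,
+#         mel_bins=64, fmin=50, fmax=14000, classes_num=10, freeze_base=True)
+#     model.load_from_pretrain('<pretrained_checkpoint.pth>')
+#     output_dict = model(torch.zeros(1, 32000))
+
+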
+def train(args):
+
+    # Arguments & parameters
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ pretrained_checkpoint_path = args.pretrained_checkpoint_path
+ freeze_base = args.freeze_base
+ device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu'
+
+ classes_num = config.classes_num
+ pretrain = True if pretrained_checkpoint_path else False
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax,
+ classes_num, freeze_base)
+
+ # Load pretrained model
+ if pretrain:
+ logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path))
+ model.load_from_pretrain(pretrained_checkpoint_path)
+
+ # Parallel
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+
+ if 'cuda' in device:
+ model.to(device)
+
+    print('Loaded pretrained model successfully!')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Example of parser. ')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ # Train
+ parser_train = subparsers.add_parser('train')
+ parser_train.add_argument('--sample_rate', type=int, required=True)
+ parser_train.add_argument('--window_size', type=int, required=True)
+ parser_train.add_argument('--hop_size', type=int, required=True)
+ parser_train.add_argument('--mel_bins', type=int, required=True)
+ parser_train.add_argument('--fmin', type=int, required=True)
+ parser_train.add_argument('--fmax', type=int, required=True)
+ parser_train.add_argument('--model_type', type=str, required=True)
+ parser_train.add_argument('--pretrained_checkpoint_path', type=str)
+ parser_train.add_argument('--freeze_base', action='store_true', default=False)
+ parser_train.add_argument('--cuda', action='store_true', default=False)
+
+ # Parse arguments
+ args = parser.parse_args()
+ args.filename = get_filename(__file__)
+
+ if args.mode == 'train':
+ train(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/inference.py b/audio_detection/audio_infer/pytorch/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..49dc75f740aec7be287eab70bae1f7677ccc4662
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/inference.py
@@ -0,0 +1,206 @@
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
+import numpy as np
+import argparse
+import librosa
+import matplotlib.pyplot as plt
+import torch
+
+from utilities import create_folder, get_filename
+from models import *
+from pytorch_utils import move_data_to_device
+import config
+
+def audio_tagging(args):
+ """Inference audio tagging result of an audio clip.
+ """
+
+    # Arguments & parameters
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ checkpoint_path = args.checkpoint_path
+ audio_path = args.audio_path
+ device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
+
+ classes_num = config.classes_num
+ labels = config.labels
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate=sample_rate, window_size=window_size,
+ hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
+ classes_num=classes_num)
+
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+ model.load_state_dict(checkpoint['model'])
+
+ # Parallel
+ if 'cuda' in str(device):
+ model.to(device)
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+ else:
+ print('Using CPU.')
+
+ # Load audio
+ (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+
+ waveform = waveform[None, :] # (1, audio_length)
+ waveform = move_data_to_device(waveform, device)
+
+ # Forward
+ with torch.no_grad():
+ model.eval()
+ batch_output_dict = model(waveform, None)
+
+ clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0]
+ """(classes_num,)"""
+
+ sorted_indexes = np.argsort(clipwise_output)[::-1]
+
+ # Print audio tagging top probabilities
+ for k in range(10):
+ print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
+ clipwise_output[sorted_indexes[k]]))
+
+ # Print embedding
+ if 'embedding' in batch_output_dict.keys():
+ embedding = batch_output_dict['embedding'].data.cpu().numpy()[0]
+ print('embedding: {}'.format(embedding.shape))
+
+ return clipwise_output, labels
+
+
+def sound_event_detection(args):
+ """Inference sound event detection result of an audio clip.
+ """
+
+    # Arguments & parameters
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ checkpoint_path = args.checkpoint_path
+ audio_path = args.audio_path
+ device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
+
+ classes_num = config.classes_num
+ labels = config.labels
+ frames_per_second = sample_rate // hop_size
+
+ # Paths
+ fig_path = os.path.join('results', '{}.png'.format(get_filename(audio_path)))
+ create_folder(os.path.dirname(fig_path))
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate=sample_rate, window_size=window_size,
+ hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
+ classes_num=classes_num)
+
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+ model.load_state_dict(checkpoint['model'])
+
+ # Parallel
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+
+ if 'cuda' in str(device):
+ model.to(device)
+
+ # Load audio
+ (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+
+ waveform = waveform[None, :] # (1, audio_length)
+ waveform = move_data_to_device(waveform, device)
+
+ # Forward
+ with torch.no_grad():
+ model.eval()
+ batch_output_dict = model(waveform, None)
+
+ framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
+ """(time_steps, classes_num)"""
+
+ print('Sound event detection result (time_steps x classes_num): {}'.format(
+ framewise_output.shape))
+
+ sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]
+
+ top_k = 10 # Show top results
+ top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]]
+ """(time_steps, top_k)"""
+
+ # Plot result
+ stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=window_size,
+ hop_length=hop_size, window='hann', center=True)
+ frames_num = stft.shape[-1]
+
+ fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
+ axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
+ axs[0].set_ylabel('Frequency bins')
+ axs[0].set_title('Log spectrogram')
+ axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
+ axs[1].xaxis.set_ticks(np.arange(0, frames_num, frames_per_second))
+ axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / frames_per_second))
+ axs[1].yaxis.set_ticks(np.arange(0, top_k))
+ axs[1].yaxis.set_ticklabels(np.array(labels)[sorted_indexes[0 : top_k]])
+ axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
+ axs[1].set_xlabel('Seconds')
+ axs[1].xaxis.set_ticks_position('bottom')
+
+ plt.tight_layout()
+ plt.savefig(fig_path)
+ print('Save sound event detection visualization to {}'.format(fig_path))
+
+ return framewise_output, labels
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='Example of parser. ')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_at = subparsers.add_parser('audio_tagging')
+ parser_at.add_argument('--sample_rate', type=int, default=32000)
+ parser_at.add_argument('--window_size', type=int, default=1024)
+ parser_at.add_argument('--hop_size', type=int, default=320)
+ parser_at.add_argument('--mel_bins', type=int, default=64)
+ parser_at.add_argument('--fmin', type=int, default=50)
+ parser_at.add_argument('--fmax', type=int, default=14000)
+ parser_at.add_argument('--model_type', type=str, required=True)
+ parser_at.add_argument('--checkpoint_path', type=str, required=True)
+ parser_at.add_argument('--audio_path', type=str, required=True)
+ parser_at.add_argument('--cuda', action='store_true', default=False)
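+
+    # Example invocation for this sub-command (checkpoint and audio paths are
+    # placeholders; the model type can be any class defined in models.py):
+    #   python inference.py audio_tagging --model_type=Cnn14 \
+    #       --checkpoint_path=<checkpoint.pth> --audio_path=<clip.wav> --cuda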
+
+ parser_sed = subparsers.add_parser('sound_event_detection')
+ parser_sed.add_argument('--sample_rate', type=int, default=32000)
+ parser_sed.add_argument('--window_size', type=int, default=1024)
+ parser_sed.add_argument('--hop_size', type=int, default=320)
+ parser_sed.add_argument('--mel_bins', type=int, default=64)
+ parser_sed.add_argument('--fmin', type=int, default=50)
+ parser_sed.add_argument('--fmax', type=int, default=14000)
+ parser_sed.add_argument('--model_type', type=str, required=True)
+ parser_sed.add_argument('--checkpoint_path', type=str, required=True)
+ parser_sed.add_argument('--audio_path', type=str, required=True)
+ parser_sed.add_argument('--cuda', action='store_true', default=False)
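+
+    # Example invocation (placeholders as above; the chosen model should expose
+    # a 'framewise_output', e.g. a decision-level variant such as
+    # Cnn14_DecisionLevelMax):
+    #   python inference.py sound_event_detection --model_type=Cnn14_DecisionLevelMax \
+    #       --checkpoint_path=<checkpoint.pth> --audio_path=<clip.wav> --cuda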
+
+ args = parser.parse_args()
+
+ if args.mode == 'audio_tagging':
+ audio_tagging(args)
+
+ elif args.mode == 'sound_event_detection':
+ sound_event_detection(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/losses.py b/audio_detection/audio_infer/pytorch/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..587e8a64f2593e4a72c1a29cf374c1e24e20c366
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/losses.py
@@ -0,0 +1,14 @@
+import torch
+import torch.nn.functional as F
+
+
+def clip_bce(output_dict, target_dict):
+ """Binary crossentropy loss.
+ """
+ return F.binary_cross_entropy(
+ output_dict['clipwise_output'], target_dict['target'])
+
+
+def get_loss_func(loss_type):
+ if loss_type == 'clip_bce':
+ return clip_bce
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/main.py b/audio_detection/audio_infer/pytorch/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..358293521706ff525f6f1b1274085a08236394ff
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/main.py
@@ -0,0 +1,378 @@
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
+import numpy as np
+import argparse
+import time
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data
+
+from utilities import (create_folder, get_filename, create_logging, Mixup,
+ StatisticsContainer)
+from models import (PVT, PVT2, PVT_lr, PVT_nopretrain, PVT_2layer, Cnn14, Cnn14_no_specaug, Cnn14_no_dropout,
+ Cnn6, Cnn10, ResNet22, ResNet38, ResNet54, Cnn14_emb512, Cnn14_emb128,
+ Cnn14_emb32, MobileNetV1, MobileNetV2, LeeNet11, LeeNet24, DaiNet19,
+ Res1dNet31, Res1dNet51, Wavegram_Cnn14, Wavegram_Logmel_Cnn14,
+ Wavegram_Logmel128_Cnn14, Cnn14_16k, Cnn14_8k, Cnn14_mel32, Cnn14_mel128,
+ Cnn14_mixup_time_domain, Cnn14_DecisionLevelMax, Cnn14_DecisionLevelAtt, Cnn6_Transformer, GLAM, GLAM2, GLAM3, Cnn4, EAT)
+#from models_test import (PVT_test)
+#from models1 import (PVT1)
+#from models_vig import (VIG, VIG2)
+#from models_vvt import (VVT)
+#from models2 import (MPVIT, MPVIT2)
+#from models_reshape import (PVT_reshape, PVT_tscam)
+#from models_swin import (Swin, Swin_nopretrain)
+#from models_swin2 import (Swin2)
+#from models_van import (Van, Van_tiny)
+#from models_focal import (Focal)
+#from models_cross import (Cross)
+#from models_cov import (Cov)
+#from models_cnn import (Cnn_light)
+#from models_twins import (Twins)
+#from models_cmt import (Cmt, Cmt1)
+#from models_shunted import (Shunted)
+#from models_quadtree import (Quadtree, Quadtree2, Quadtree_nopretrain)
+#from models_davit import (Davit_tscam, Davit, Davit_nopretrain)
+from pytorch_utils import (move_data_to_device, count_parameters, count_flops,
+ do_mixup)
+from data_generator import (AudioSetDataset, TrainSampler, BalancedTrainSampler,
+ AlternateTrainSampler, EvaluateSampler, collate_fn)
+from evaluate import Evaluator
+import config
+from losses import get_loss_func
+
+
+def train(args):
+ """Train AudioSet tagging model.
+
+ Args:
+ dataset_dir: str
+ workspace: str
+ data_type: 'balanced_train' | 'full_train'
+ window_size: int
+ hop_size: int
+ mel_bins: int
+ model_type: str
+ loss_type: 'clip_bce'
+ balanced: 'none' | 'balanced' | 'alternate'
+ augmentation: 'none' | 'mixup'
+ batch_size: int
+ learning_rate: float
+ resume_iteration: int
+ early_stop: int
+ accumulation_steps: int
+ cuda: bool
+ """
+
+    # Arguments & parameters
+ workspace = args.workspace
+ data_type = args.data_type
+ sample_rate = args.sample_rate
+ window_size = args.window_size
+ hop_size = args.hop_size
+ mel_bins = args.mel_bins
+ fmin = args.fmin
+ fmax = args.fmax
+ model_type = args.model_type
+ loss_type = args.loss_type
+ balanced = args.balanced
+ augmentation = args.augmentation
+ batch_size = args.batch_size
+ learning_rate = args.learning_rate
+ resume_iteration = args.resume_iteration
+ early_stop = args.early_stop
+ device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
+ filename = args.filename
+
+ num_workers = 8
+ clip_samples = config.clip_samples
+ classes_num = config.classes_num
+ loss_func = get_loss_func(loss_type)
+
+ # Paths
+ black_list_csv = None
+
+ train_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
+ '{}.h5'.format(data_type))
+
+ eval_bal_indexes_hdf5_path = os.path.join(workspace,
+ 'hdf5s', 'indexes', 'balanced_train.h5')
+
+ eval_test_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
+ 'eval.h5')
+
+ checkpoints_dir = os.path.join(workspace, 'checkpoints', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
+ create_folder(checkpoints_dir)
+
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+ create_folder(os.path.dirname(statistics_path))
+
+ logs_dir = os.path.join(workspace, 'logs', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
+
+ create_logging(logs_dir, filemode='w')
+ logging.info(args)
+
+ if 'cuda' in str(device):
+ logging.info('Using GPU.')
+ device = 'cuda'
+ else:
+ logging.info('Using CPU. Set --cuda flag to use GPU.')
+ device = 'cpu'
+
+ # Model
+ Model = eval(model_type)
+ model = Model(sample_rate=sample_rate, window_size=window_size,
+ hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
+ classes_num=classes_num)
+ total = sum(p.numel() for p in model.parameters())
+ print("Total params: %.2fM" % (total/1e6))
+ logging.info("Total params: %.2fM" % (total/1e6))
+ #params_num = count_parameters(model)
+ # flops_num = count_flops(model, clip_samples)
+ #logging.info('Parameters num: {}'.format(params_num))
+ # logging.info('Flops num: {:.3f} G'.format(flops_num / 1e9))
+
+    # Dataset will be used by DataLoader later. Dataset takes a meta dict as
+    # input and returns a waveform and a target.
+ dataset = AudioSetDataset(sample_rate=sample_rate)
+
+ # Train sampler
+ if balanced == 'none':
+ Sampler = TrainSampler
+ elif balanced == 'balanced':
+ Sampler = BalancedTrainSampler
+ elif balanced == 'alternate':
+ Sampler = AlternateTrainSampler
+
+ train_sampler = Sampler(
+ indexes_hdf5_path=train_indexes_hdf5_path,
+ batch_size=batch_size * 2 if 'mixup' in augmentation else batch_size,
+ black_list_csv=black_list_csv)
+
+ # Evaluate sampler
+ eval_bal_sampler = EvaluateSampler(
+ indexes_hdf5_path=eval_bal_indexes_hdf5_path, batch_size=batch_size)
+
+ eval_test_sampler = EvaluateSampler(
+ indexes_hdf5_path=eval_test_indexes_hdf5_path, batch_size=batch_size)
+
+ # Data loader
+ train_loader = torch.utils.data.DataLoader(dataset=dataset,
+ batch_sampler=train_sampler, collate_fn=collate_fn,
+ num_workers=num_workers, pin_memory=True)
+
+ eval_bal_loader = torch.utils.data.DataLoader(dataset=dataset,
+ batch_sampler=eval_bal_sampler, collate_fn=collate_fn,
+ num_workers=num_workers, pin_memory=True)
+
+ eval_test_loader = torch.utils.data.DataLoader(dataset=dataset,
+ batch_sampler=eval_test_sampler, collate_fn=collate_fn,
+ num_workers=num_workers, pin_memory=True)
+    mix = 0.5
+ if 'mixup' in augmentation:
+ mixup_augmenter = Mixup(mixup_alpha=mix)
+ print(mix)
+ logging.info(mix)
+
+ # Evaluator
+ evaluator = Evaluator(model=model)
+
+ # Statistics
+ statistics_container = StatisticsContainer(statistics_path)
+
+ # Optimizer
+ optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.05, amsgrad=True)
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, min_lr=1e-06, verbose=True)
+ train_bgn_time = time.time()
+
+ # Resume training
+ if resume_iteration > 0:
+ resume_checkpoint_path = os.path.join(workspace, 'checkpoints', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ '{}_iterations.pth'.format(resume_iteration))
+
+ logging.info('Loading checkpoint {}'.format(resume_checkpoint_path))
+ checkpoint = torch.load(resume_checkpoint_path)
+ model.load_state_dict(checkpoint['model'])
+ train_sampler.load_state_dict(checkpoint['sampler'])
+ statistics_container.load_state_dict(resume_iteration)
+ iteration = checkpoint['iteration']
+
+ else:
+ iteration = 0
+
+ # Parallel
+ print('GPU number: {}'.format(torch.cuda.device_count()))
+ model = torch.nn.DataParallel(model)
+
+ if 'cuda' in str(device):
+ model.to(device)
+
+ if resume_iteration:
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ scheduler.load_state_dict(checkpoint['scheduler'])
+ print(optimizer.state_dict()['param_groups'][0]['lr'])
+
+ time1 = time.time()
+
+ for batch_data_dict in train_loader:
+ """batch_data_dict: {
+ 'audio_name': (batch_size [*2 if mixup],),
+ 'waveform': (batch_size [*2 if mixup], clip_samples),
+ 'target': (batch_size [*2 if mixup], classes_num),
+        (if exists) 'mixup_lambda': (batch_size * 2,)}
+ """
+
+ # Evaluate
+ if (iteration % 2000 == 0 and iteration >= resume_iteration) or (iteration == 0):
+ train_fin_time = time.time()
+
+ bal_statistics = evaluator.evaluate(eval_bal_loader)
+ test_statistics = evaluator.evaluate(eval_test_loader)
+
+ logging.info('Validate bal mAP: {:.3f}'.format(
+ np.mean(bal_statistics['average_precision'])))
+
+ logging.info('Validate test mAP: {:.3f}'.format(
+ np.mean(test_statistics['average_precision'])))
+
+ statistics_container.append(iteration, bal_statistics, data_type='bal')
+ statistics_container.append(iteration, test_statistics, data_type='test')
+ statistics_container.dump()
+
+ train_time = train_fin_time - train_bgn_time
+ validate_time = time.time() - train_fin_time
+
+ logging.info(
+ 'iteration: {}, train time: {:.3f} s, validate time: {:.3f} s'
+ ''.format(iteration, train_time, validate_time))
+
+ logging.info('------------------------------------')
+
+ train_bgn_time = time.time()
+
+ # Save model
+ if iteration % 2000 == 0:
+ checkpoint = {
+ 'iteration': iteration,
+ 'model': model.module.state_dict(),
+ 'sampler': train_sampler.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'scheduler': scheduler.state_dict()}
+
+ checkpoint_path = os.path.join(
+ checkpoints_dir, '{}_iterations.pth'.format(iteration))
+
+ torch.save(checkpoint, checkpoint_path)
+ logging.info('Model saved to {}'.format(checkpoint_path))
+
+ # Mixup lambda
+ if 'mixup' in augmentation:
+ batch_data_dict['mixup_lambda'] = mixup_augmenter.get_lambda(
+ batch_size=len(batch_data_dict['waveform']))
+
+ # Move data to device
+ for key in batch_data_dict.keys():
+ batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device)
+
+ # Forward
+ model.train()
+
+ if 'mixup' in augmentation:
+ batch_output_dict = model(batch_data_dict['waveform'],
+ batch_data_dict['mixup_lambda'])
+ """{'clipwise_output': (batch_size, classes_num), ...}"""
+
+ batch_target_dict = {'target': do_mixup(batch_data_dict['target'],
+ batch_data_dict['mixup_lambda'])}
+ """{'target': (batch_size, classes_num)}"""
+ else:
+ batch_output_dict = model(batch_data_dict['waveform'], None)
+ """{'clipwise_output': (batch_size, classes_num), ...}"""
+
+ batch_target_dict = {'target': batch_data_dict['target']}
+ """{'target': (batch_size, classes_num)}"""
+
+ # Loss
+ loss = loss_func(batch_output_dict, batch_target_dict)
+ # Backward
+ loss.backward()
+
+ optimizer.step()
+ optimizer.zero_grad()
+
+ if iteration % 10 == 0:
+ print(iteration, loss)
+ #print('--- Iteration: {}, train time: {:.3f} s / 10 iterations ---'\
+ # .format(iteration, time.time() - time1))
+ #time1 = time.time()
+
+ if iteration % 2000 == 0:
+ scheduler.step(np.mean(test_statistics['average_precision']))
+ print(optimizer.state_dict()['param_groups'][0]['lr'])
+ logging.info(optimizer.state_dict()['param_groups'][0]['lr'])
+
+ # Stop learning
+ if iteration == early_stop:
+ break
+
+ iteration += 1
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='Example of parser. ')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_train = subparsers.add_parser('train')
+ parser_train.add_argument('--workspace', type=str, required=True)
+ parser_train.add_argument('--data_type', type=str, default='full_train', choices=['balanced_train', 'full_train'])
+ parser_train.add_argument('--sample_rate', type=int, default=32000)
+ parser_train.add_argument('--window_size', type=int, default=1024)
+ parser_train.add_argument('--hop_size', type=int, default=320)
+ parser_train.add_argument('--mel_bins', type=int, default=64)
+ parser_train.add_argument('--fmin', type=int, default=50)
+ parser_train.add_argument('--fmax', type=int, default=14000)
+ parser_train.add_argument('--model_type', type=str, required=True)
+ parser_train.add_argument('--loss_type', type=str, default='clip_bce', choices=['clip_bce'])
+ parser_train.add_argument('--balanced', type=str, default='balanced', choices=['none', 'balanced', 'alternate'])
+ parser_train.add_argument('--augmentation', type=str, default='mixup', choices=['none', 'mixup'])
+ parser_train.add_argument('--batch_size', type=int, default=32)
+ parser_train.add_argument('--learning_rate', type=float, default=1e-3)
+ parser_train.add_argument('--resume_iteration', type=int, default=0)
+ parser_train.add_argument('--early_stop', type=int, default=1000000)
+ parser_train.add_argument('--cuda', action='store_true', default=False)
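+
+    # Example invocation (the workspace path is a placeholder; model_type must
+    # be one of the classes imported from models above, e.g. PVT or Cnn14):
+    #   python main.py train --workspace=<workspace_dir> --model_type=PVT \
+    #       --balanced=balanced --augmentation=mixup --batch_size=32 --cuda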
+
+ args = parser.parse_args()
+ args.filename = get_filename(__file__)
+
+ if args.mode == 'train':
+ train(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/pytorch/models.py b/audio_detection/audio_infer/pytorch/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cf5456d1ee9a26a4afe58cea2b11ad78033e01e
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/models.py
@@ -0,0 +1,951 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+from torchlibrosa.augmentation import SpecAugmentation
+
+from audio_infer.pytorch.pytorch_utils import do_mixup, interpolate, pad_framewise_output
+import os
+import sys
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+from torchlibrosa.augmentation import SpecAugmentation
+from audio_infer.pytorch.pytorch_utils import do_mixup
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+import warnings
+from functools import partial
+#from mmdet.models.builder import BACKBONES
+from mmdet.utils import get_root_logger
+from mmcv.runner import load_checkpoint
+os.environ['TORCH_HOME'] = '../pretrained_models'
+from copy import deepcopy
+from timm.models.helpers import load_pretrained
+from torch.cuda.amp import autocast
+from collections import OrderedDict
+import io
+import re
+from mmcv.runner import _load_checkpoint, load_state_dict
+import mmcv.runner
+import copy
+import random
+from einops import rearrange
+from einops.layers.torch import Rearrange, Reduce
+from torch import nn, einsum
+
+
+def load_checkpoint(model,
+ filename,
+ map_location=None,
+ strict=False,
+ logger=None,
+ revise_keys=[(r'^module\.', '')]):
+ """Load checkpoint from a file or URI.
+
+ Args:
+ model (Module): Module to load checkpoint.
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str): Same as :func:`torch.load`.
+ strict (bool): Whether to allow different params for the model and
+ checkpoint.
+ logger (:mod:`logging.Logger` or None): The logger for error message.
+ revise_keys (list): A list of customized keywords to modify the
+ state_dict in checkpoint. Each item is a (pattern, replacement)
+ pair of the regular expression operations. Default: strip
+ the prefix 'module.' by [(r'^module\\.', '')].
+
+ Returns:
+ dict or OrderedDict: The loaded checkpoint.
+ """
+
+ checkpoint = _load_checkpoint(filename, map_location, logger)
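+    # Adapt the ImageNet-pretrained 3-channel patch embedding to the 1-channel
+    # log-mel input by summing the kernel weights over the RGB channel axis.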
+ new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ # OrderedDict is a subclass of dict
+ if not isinstance(checkpoint, dict):
+ raise RuntimeError(
+ f'No state_dict found in checkpoint file {filename}')
+ # get state_dict from checkpoint
+ if 'state_dict' in checkpoint:
+ state_dict = checkpoint['state_dict']
+ else:
+ state_dict = checkpoint
+
+ # strip prefix of state_dict
+ metadata = getattr(state_dict, '_metadata', OrderedDict())
+ for p, r in revise_keys:
+ state_dict = OrderedDict(
+ {re.sub(p, r, k): v
+ for k, v in state_dict.items()})
+ state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()})
+ # Keep metadata in state_dict
+ state_dict._metadata = metadata
+
+ # load state_dict
+ load_state_dict(model, state_dict, strict, logger)
+ return checkpoint
+
+def init_layer(layer):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.xavier_uniform_(layer.weight)
+
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+
+def init_bn(bn):
+ """Initialize a Batchnorm layer. """
+ bn.bias.data.fill_(0.)
+ bn.weight.data.fill_(1.)
+
+
+
+
+class TimeShift(nn.Module):
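+    """Randomly roll the input along the time axis (dims=2) by an integer
+    number of frames drawn from N(mean, std); applied only in training mode."""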
+ def __init__(self, mean, std):
+ super().__init__()
+ self.mean = mean
+ self.std = std
+
+ def forward(self, x):
+ if self.training:
+ shift = torch.empty(1).normal_(self.mean, self.std).int().item()
+ x = torch.roll(x, shift, dims=2)
+ return x
+
+class LinearSoftPool(nn.Module):
+ """LinearSoftPool
+    Linear softmax pooling: takes frame-level probabilities and returns a
+    clip-level probability close to the actual maximum value.
+ Taken from the paper:
+ A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
+ https://arxiv.org/abs/1810.09050
+ """
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, time_decision):
+ return (time_decision**2).sum(self.pooldim) / time_decision.sum(
+ self.pooldim)
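+
+# A minimal sketch of the pooling behaviour (shapes are illustrative): per
+# class it computes sum_t p_t**2 / sum_t p_t over the pooling dimension, so
+# each frame is weighted by its own probability.
+#
+#     pool = LinearSoftPool(pooldim=1)
+#     clip_prob = pool(None, torch.rand(4, 100, 527))  # -> (4, 527)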
+
+class PVT(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ #self.temp_pool = LinearSoftPool()
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
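+        # interpolate_ratio is assumed to match the overall time downsampling
+        # of the PVT backbone, so the frame-wise predictions are upsampled by
+        # the same factor below to realign them with the input frames.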
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ #clipwise_output = torch.mean(framewise_output, dim=1)
+ #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ x = framewise_output.transpose(1, 2).contiguous()
+ x = self.avgpool(x)
+ clipwise_output = torch.flatten(x, 1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+class PVT2(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT2, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ #self.temp_pool = LinearSoftPool()
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ #x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ clipwise_output = torch.mean(framewise_output, dim=1)
+ #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+class PVT_2layer(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT_2layer, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128],
+ depths=[3, 4],
+ num_heads=[1, 2],
+ mlp_ratios=[8, 8],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=2,
+ pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ #self.temp_pool = LinearSoftPool()
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
+ self.fc_audioset = nn.Linear(128, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 8
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ #clipwise_output = torch.mean(framewise_output, dim=1)
+ #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ x = framewise_output.transpose(1, 2).contiguous()
+ x = self.avgpool(x)
+ clipwise_output = torch.flatten(x, 1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+class PVT_lr(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT_lr, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ self.temp_pool = LinearSoftPool()
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ #framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+
+class PVT_nopretrain(nn.Module):
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, classes_num):
+
+ super(PVT_nopretrain, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ self.time_shift = TimeShift(0, 10)
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+ self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
+ fdim=64,
+ patch_size=7,
+ stride=4,
+ in_chans=1,
+ num_classes=classes_num,
+ embed_dims=[64, 128, 320, 512],
+ depths=[3, 4, 6, 3],
+ num_heads=[1, 2, 5, 8],
+ mlp_ratios=[8, 8, 4, 4],
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ drop_path_rate=0.1,
+ sr_ratios=[8, 4, 2, 1],
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ num_stages=4,
+ #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
+ )
+ self.temp_pool = LinearSoftPool()
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ self.init_weights()
+
+ def init_weights(self):
+ init_bn(self.bn0)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input, mixup_lambda=None):
+ """Input: (batch_size, times_steps, freq_bins)"""
+
+ interpolate_ratio = 32
+
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
+ frames_num = x.shape[2]
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ if self.training:
+ x = self.time_shift(x)
+ x = self.spec_augmenter(x)
+
+ # Mixup on spectrogram
+ if self.training and mixup_lambda is not None:
+ x = do_mixup(x, mixup_lambda)
+ #print(x.shape) #torch.Size([10, 1, 1001, 64])
+ x = self.pvt_transformer(x)
+ #print(x.shape) #torch.Size([10, 800, 128])
+ x = torch.mean(x, dim=3)
+
+ x = x.transpose(1, 2).contiguous()
+ framewise_output = torch.sigmoid(self.fc_audioset(x))
+ clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
+ #print(framewise_output.shape) #torch.Size([10, 100, 17])
+ framewise_output = interpolate(framewise_output, interpolate_ratio)
+ framewise_output = framewise_output[:,:1000,:]
+ #framewise_output = pad_framewise_output(framewise_output, frames_num)
+ output_dict = {'framewise_output': framewise_output,
+ 'clipwise_output': clipwise_output}
+
+ return output_dict
+
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.dwconv = DWConv(hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+ self.linear = linear
+ if self.linear:
+ self.relu = nn.ReLU()
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x, H, W):
+ x = self.fc1(x)
+ if self.linear:
+ x = self.relu(x)
+ x = self.dwconv(x, H, W)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False):
+ super().__init__()
+ assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+
+ self.dim = dim
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
+ self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ self.linear = linear
+ self.sr_ratio = sr_ratio
+ if not linear:
+ if sr_ratio > 1:
+ self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+ self.norm = nn.LayerNorm(dim)
+ else:
+ self.pool = nn.AdaptiveAvgPool2d(7)
+ self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
+ self.norm = nn.LayerNorm(dim)
+ self.act = nn.GELU()
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x, H, W):
+ B, N, C = x.shape
+ q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+
+ if not self.linear:
+ if self.sr_ratio > 1:
+ x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+ x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
+ x_ = self.norm(x_)
+ kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ else:
+ kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ else:
+ x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+ x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
+ x_ = self.norm(x_)
+ x_ = self.act(x_)
+ kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ k, v = kv[0], kv[1]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+
+ return x
+
+
+class Pooling(nn.Module):
+ """
+ Implementation of pooling for PoolFormer
+ --pool_size: pooling size
+ """
+ def __init__(self, pool_size=3):
+ super().__init__()
+ self.pool = nn.AvgPool2d(
+ pool_size, stride=1, padding=pool_size//2, count_include_pad=False)
+
+ def forward(self, x):
+ return self.pool(x) - x
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
+ #self.norm3 = norm_layer(dim)
+ #self.token_mixer = Pooling(pool_size=3)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x, H, W):
+ x = x + self.drop_path(self.attn(self.norm1(x), H, W))
+ x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
+ return x
+
+
+class OverlapPatchEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+
+ def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = (tdim, fdim)
+ patch_size = to_2tuple(patch_size)
+
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.H, self.W = img_size[0] // stride, img_size[1] // stride
+ self.num_patches = self.H * self.W
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
+ padding=(patch_size[0] // 3, patch_size[1] // 3))
+ self.norm = nn.LayerNorm(embed_dim)
+
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def forward(self, x):
+ x = self.proj(x)
+ _, _, H, W = x.shape
+ x = x.flatten(2).transpose(1, 2)
+ x = self.norm(x)
+
+ return x, H, W
+
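+# Editorial note: H and W returned by OverlapPatchEmbed.forward follow the
+# standard Conv2d size rule, out = floor((in + 2*padding - kernel) / stride) + 1.
+# For the 1001 x 64 log-mel input used by the audio models above, the first
+# stage (kernel 7, stride 4, padding 7 // 3 = 2) yields roughly 250 x 16.
+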
+
+class PyramidVisionTransformerV2(nn.Module):
+ def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
+ num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
+ attn_drop_rate=0., drop_path_rate=0.1, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3],
+ sr_ratios=[8, 4, 2, 1], num_stages=2, linear=False, pretrained=None):
+ super().__init__()
+ # self.num_classes = num_classes
+ self.depths = depths
+ self.num_stages = num_stages
+ self.linear = linear
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
+ cur = 0
+
+ for i in range(num_stages):
+ patch_embed = OverlapPatchEmbed(tdim=tdim if i == 0 else tdim // (2 ** (i + 1)),
+                                            fdim=fdim if i == 0 else fdim // (2 ** (i + 1)),
+ patch_size=7 if i == 0 else 3,
+ stride=stride if i == 0 else 2,
+ in_chans=in_chans if i == 0 else embed_dims[i - 1],
+ embed_dim=embed_dims[i])
+ block = nn.ModuleList([Block(
+ dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
+ sr_ratio=sr_ratios[i], linear=linear)
+ for j in range(depths[i])])
+ norm = norm_layer(embed_dims[i])
+ cur += depths[i]
+
+ setattr(self, f"patch_embed{i + 1}", patch_embed)
+ setattr(self, f"block{i + 1}", block)
+ setattr(self, f"norm{i + 1}", norm)
+ #self.n = nn.Linear(125, 250, bias=True)
+ # classification head
+ # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
+ self.apply(self._init_weights)
+ self.init_weights(pretrained)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+
+ def init_weights(self, pretrained=None):
+ if isinstance(pretrained, str):
+ logger = get_root_logger()
+ load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
+
+ def freeze_patch_emb(self):
+ self.patch_embed1.requires_grad = False
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ B = x.shape[0]
+
+ for i in range(self.num_stages):
+ patch_embed = getattr(self, f"patch_embed{i + 1}")
+ block = getattr(self, f"block{i + 1}")
+ norm = getattr(self, f"norm{i + 1}")
+ x, H, W = patch_embed(x)
+ #print(x.shape)
+ for blk in block:
+ x = blk(x, H, W)
+ #print(x.shape)
+ x = norm(x)
+ #if i != self.num_stages - 1:
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
+ #print(x.shape)
+ return x
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ # x = self.head(x)
+
+ return x
+
+class DWConv(nn.Module):
+ def __init__(self, dim=768):
+ super(DWConv, self).__init__()
+ self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+
+ def forward(self, x, H, W):
+ B, N, C = x.shape
+ x = x.transpose(1, 2).view(B, C, H, W)
+ x = self.dwconv(x)
+ x = x.flatten(2).transpose(1, 2)
+
+ return x
+
+
+def _conv_filter(state_dict, patch_size=16):
+ """ convert patch embedding weight from manual patchify + linear proj to conv"""
+ out_dict = {}
+ for k, v in state_dict.items():
+ if 'patch_embed.proj.weight' in k:
+ v = v.reshape((v.shape[0], 3, patch_size, patch_size))
+ out_dict[k] = v
+
+ return out_dict
diff --git a/audio_detection/audio_infer/pytorch/pytorch_utils.py b/audio_detection/audio_infer/pytorch/pytorch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a135b336866acc61e834e42e5aa9e9db3f7998ff
--- /dev/null
+++ b/audio_detection/audio_infer/pytorch/pytorch_utils.py
@@ -0,0 +1,251 @@
+import numpy as np
+import time
+import torch
+import torch.nn as nn
+
+
+def move_data_to_device(x, device):
+ if 'float' in str(x.dtype):
+ x = torch.Tensor(x)
+ elif 'int' in str(x.dtype):
+ x = torch.LongTensor(x)
+ else:
+ return x
+
+ return x.to(device)
+
+
+def do_mixup(x, mixup_lambda):
+ """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
+ (1, 3, 5, ...).
+
+ Args:
+ x: (batch_size * 2, ...)
+ mixup_lambda: (batch_size * 2,)
+
+ Returns:
+ out: (batch_size, ...)
+ """
+ out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \
+ x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1)
+ return out
+
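+# Illustrative example (not part of the original code): with a doubled batch of
+# four waveforms and mixup_lambda = [0.6, 0.4, 0.3, 0.7], do_mixup returns two
+# waveforms: 0.6 * x[0] + 0.4 * x[1] and 0.3 * x[2] + 0.7 * x[3].
+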
+
+def append_to_dict(dict, key, value):
+ if key in dict.keys():
+ dict[key].append(value)
+ else:
+ dict[key] = [value]
+
+
+def forward(model, generator, return_input=False,
+ return_target=False):
+ """Forward data to a model.
+
+ Args:
+ model: object
+ generator: object
+ return_input: bool
+ return_target: bool
+
+ Returns:
+ audio_name: (audios_num,)
+ clipwise_output: (audios_num, classes_num)
+      (if exists) segmentwise_output: (audios_num, segments_num, classes_num)
+      (if exists) framewise_output: (audios_num, frames_num, classes_num)
+ (optional) return_input: (audios_num, segment_samples)
+ (optional) return_target: (audios_num, classes_num)
+ """
+ output_dict = {}
+ device = next(model.parameters()).device
+ time1 = time.time()
+
+ # Forward data to a model in mini-batches
+ for n, batch_data_dict in enumerate(generator):
+ print(n)
+ batch_waveform = move_data_to_device(batch_data_dict['waveform'], device)
+
+ with torch.no_grad():
+ model.eval()
+ batch_output = model(batch_waveform)
+
+ append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name'])
+
+ append_to_dict(output_dict, 'clipwise_output',
+ batch_output['clipwise_output'].data.cpu().numpy())
+
+ if 'segmentwise_output' in batch_output.keys():
+ append_to_dict(output_dict, 'segmentwise_output',
+ batch_output['segmentwise_output'].data.cpu().numpy())
+
+ if 'framewise_output' in batch_output.keys():
+ append_to_dict(output_dict, 'framewise_output',
+ batch_output['framewise_output'].data.cpu().numpy())
+
+ if return_input:
+ append_to_dict(output_dict, 'waveform', batch_data_dict['waveform'])
+
+ if return_target:
+ if 'target' in batch_data_dict.keys():
+ append_to_dict(output_dict, 'target', batch_data_dict['target'])
+
+ if n % 10 == 0:
+ print(' --- Inference time: {:.3f} s / 10 iterations ---'.format(
+ time.time() - time1))
+ time1 = time.time()
+
+ for key in output_dict.keys():
+ output_dict[key] = np.concatenate(output_dict[key], axis=0)
+
+ return output_dict
+
+
+def interpolate(x, ratio):
+ """Interpolate data in time domain. This is used to compensate the
+ resolution reduction in downsampling of a CNN.
+
+ Args:
+ x: (batch_size, time_steps, classes_num)
+ ratio: int, ratio to interpolate
+
+ Returns:
+ upsampled: (batch_size, time_steps * ratio, classes_num)
+ """
+ (batch_size, time_steps, classes_num) = x.shape
+ upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
+ upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
+ return upsampled
+
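+# Illustrative example (not part of the original code): for a 10 s clip at
+# 32 kHz with hop size 320 there are about 1000 STFT frames; the two-stage PVT
+# model in this repository emits roughly 125 framewise predictions, so
+# interpolate(x, ratio=8) repeats each prediction 8 times to recover
+# approximately frame-level resolution.
+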
+
+def pad_framewise_output(framewise_output, frames_num):
+ """Pad framewise_output to the same length as input frames. The pad value
+ is the same as the value of the last frame.
+
+ Args:
+ framewise_output: (batch_size, frames_num, classes_num)
+ frames_num: int, number of frames to pad
+
+ Outputs:
+ output: (batch_size, frames_num, classes_num)
+ """
+ pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
+ """tensor for padding"""
+
+ output = torch.cat((framewise_output, pad), dim=1)
+ """(batch_size, frames_num, classes_num)"""
+
+ return output
+
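+# Editorial note: pad_framewise_output is meant to be applied after
+# interpolate() (see the commented-out calls in the models) so that the
+# framewise output is padded from time_steps * ratio up to exactly frames_num.
+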
+
+def count_parameters(model):
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def count_flops(model, audio_length):
+ """Count flops. Code modified from others' implementation.
+ """
+ multiply_adds = True
+ list_conv2d=[]
+ def conv2d_hook(self, input, output):
+ batch_size, input_channels, input_height, input_width = input[0].size()
+ output_channels, output_height, output_width = output[0].size()
+
+ kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
+ bias_ops = 1 if self.bias is not None else 0
+
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_height * output_width
+
+ list_conv2d.append(flops)
+
+ list_conv1d=[]
+ def conv1d_hook(self, input, output):
+ batch_size, input_channels, input_length = input[0].size()
+ output_channels, output_length = output[0].size()
+
+ kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
+ bias_ops = 1 if self.bias is not None else 0
+
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_length
+
+ list_conv1d.append(flops)
+
+ list_linear=[]
+ def linear_hook(self, input, output):
+ batch_size = input[0].size(0) if input[0].dim() == 2 else 1
+
+ weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
+ bias_ops = self.bias.nelement()
+
+ flops = batch_size * (weight_ops + bias_ops)
+ list_linear.append(flops)
+
+ list_bn=[]
+ def bn_hook(self, input, output):
+ list_bn.append(input[0].nelement() * 2)
+
+ list_relu=[]
+ def relu_hook(self, input, output):
+ list_relu.append(input[0].nelement() * 2)
+
+ list_pooling2d=[]
+ def pooling2d_hook(self, input, output):
+ batch_size, input_channels, input_height, input_width = input[0].size()
+ output_channels, output_height, output_width = output[0].size()
+
+ kernel_ops = self.kernel_size * self.kernel_size
+ bias_ops = 0
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_height * output_width
+
+ list_pooling2d.append(flops)
+
+ list_pooling1d=[]
+ def pooling1d_hook(self, input, output):
+ batch_size, input_channels, input_length = input[0].size()
+ output_channels, output_length = output[0].size()
+
+ kernel_ops = self.kernel_size[0]
+ bias_ops = 0
+
+ params = output_channels * (kernel_ops + bias_ops)
+ flops = batch_size * params * output_length
+
+        list_pooling1d.append(flops)
+
+ def foo(net):
+ childrens = list(net.children())
+ if not childrens:
+ if isinstance(net, nn.Conv2d):
+ net.register_forward_hook(conv2d_hook)
+ elif isinstance(net, nn.Conv1d):
+ net.register_forward_hook(conv1d_hook)
+ elif isinstance(net, nn.Linear):
+ net.register_forward_hook(linear_hook)
+ elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d):
+ net.register_forward_hook(bn_hook)
+ elif isinstance(net, nn.ReLU):
+ net.register_forward_hook(relu_hook)
+ elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d):
+ net.register_forward_hook(pooling2d_hook)
+ elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d):
+ net.register_forward_hook(pooling1d_hook)
+ else:
+ print('Warning: flop of module {} is not counted!'.format(net))
+ return
+ for c in childrens:
+ foo(c)
+
+ # Register hook
+ foo(model)
+
+    device = next(model.parameters()).device
+ input = torch.rand(1, audio_length).to(device)
+
+ out = model(input)
+
+ total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \
+ sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d)
+
+ return total_flops
\ No newline at end of file
diff --git a/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c2b5d8cceac7f40a4bdba8bd1a75d590b4382ee
Binary files /dev/null and b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png differ
diff --git a/audio_detection/audio_infer/useful_ckpts/audio_detection.pth b/audio_detection/audio_infer/useful_ckpts/audio_detection.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8bc6c65802de022080d76fc07bb68a563c6d87bf
--- /dev/null
+++ b/audio_detection/audio_infer/useful_ckpts/audio_detection.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f909808f17d424dc29063a21953ff2be103489518a4f60a6c649d2e3e7d3e81
+size 441042195
diff --git a/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc b/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..724d543c0b401c546e16e5db5c7be6d7b1b78c8a
Binary files /dev/null and b/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc differ
diff --git a/audio_detection/audio_infer/utils/config.py b/audio_detection/audio_infer/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..934be1c68f4e1562e5fcef81d2f8db131cb39b9f
--- /dev/null
+++ b/audio_detection/audio_infer/utils/config.py
@@ -0,0 +1,94 @@
+import numpy as np
+import csv
+
+sample_rate = 32000
+clip_samples = sample_rate * 10     # Audio clips are 10 seconds long
+
+# Load label
+with open('./audio_detection/audio_infer/metadata/class_labels_indices.csv', 'r') as f:
+ reader = csv.reader(f, delimiter=',')
+ lines = list(reader)
+
+labels = []
+ids = [] # Each label has a unique id such as "/m/068hy"
+for i1 in range(1, len(lines)):
+ id = lines[i1][1]
+ label = lines[i1][2]
+ ids.append(id)
+ labels.append(label)
+
+classes_num = len(labels)
+
+lb_to_ix = {label : i for i, label in enumerate(labels)}
+ix_to_lb = {i : label for i, label in enumerate(labels)}
+
+id_to_ix = {id : i for i, id in enumerate(ids)}
+ix_to_id = {i : id for i, id in enumerate(ids)}
+
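+# Editorial note: full_samples_per_class appears to list the number of clips
+# per class in the full AudioSet training set; in this repository it is used
+# for analysis and plotting (e.g. utils/plot_for_paper.py sorts classes by it).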
+full_samples_per_class = np.array([
+ 937432, 16344, 7822, 10271, 2043, 14420, 733, 1511,
+ 1258, 424, 1751, 704, 369, 590, 1063, 1375,
+ 5026, 743, 853, 1648, 714, 1497, 1251, 2139,
+ 1093, 133, 224, 39469, 6423, 407, 1559, 4546,
+ 6826, 7464, 2468, 549, 4063, 334, 587, 238,
+ 1766, 691, 114, 2153, 236, 209, 421, 740,
+ 269, 959, 137, 4192, 485, 1515, 655, 274,
+ 69, 157, 1128, 807, 1022, 346, 98, 680,
+ 890, 352, 4169, 2061, 1753, 9883, 1339, 708,
+ 37857, 18504, 12864, 2475, 2182, 757, 3624, 677,
+ 1683, 3583, 444, 1780, 2364, 409, 4060, 3097,
+ 3143, 502, 723, 600, 230, 852, 1498, 1865,
+ 1879, 2429, 5498, 5430, 2139, 1761, 1051, 831,
+ 2401, 2258, 1672, 1711, 987, 646, 794, 25061,
+ 5792, 4256, 96, 8126, 2740, 752, 513, 554,
+ 106, 254, 1592, 556, 331, 615, 2841, 737,
+ 265, 1349, 358, 1731, 1115, 295, 1070, 972,
+ 174, 937780, 112337, 42509, 49200, 11415, 6092, 13851,
+ 2665, 1678, 13344, 2329, 1415, 2244, 1099, 5024,
+ 9872, 10948, 4409, 2732, 1211, 1289, 4807, 5136,
+ 1867, 16134, 14519, 3086, 19261, 6499, 4273, 2790,
+ 8820, 1228, 1575, 4420, 3685, 2019, 664, 324,
+ 513, 411, 436, 2997, 5162, 3806, 1389, 899,
+ 8088, 7004, 1105, 3633, 2621, 9753, 1082, 26854,
+ 3415, 4991, 2129, 5546, 4489, 2850, 1977, 1908,
+ 1719, 1106, 1049, 152, 136, 802, 488, 592,
+ 2081, 2712, 1665, 1128, 250, 544, 789, 2715,
+ 8063, 7056, 2267, 8034, 6092, 3815, 1833, 3277,
+ 8813, 2111, 4662, 2678, 2954, 5227, 1472, 2591,
+ 3714, 1974, 1795, 4680, 3751, 6585, 2109, 36617,
+ 6083, 16264, 17351, 3449, 5034, 3931, 2599, 4134,
+ 3892, 2334, 2211, 4516, 2766, 2862, 3422, 1788,
+ 2544, 2403, 2892, 4042, 3460, 1516, 1972, 1563,
+ 1579, 2776, 1647, 4535, 3921, 1261, 6074, 2922,
+ 3068, 1948, 4407, 712, 1294, 1019, 1572, 3764,
+ 5218, 975, 1539, 6376, 1606, 6091, 1138, 1169,
+ 7925, 3136, 1108, 2677, 2680, 1383, 3144, 2653,
+ 1986, 1800, 1308, 1344, 122231, 12977, 2552, 2678,
+ 7824, 768, 8587, 39503, 3474, 661, 430, 193,
+ 1405, 1442, 3588, 6280, 10515, 785, 710, 305,
+ 206, 4990, 5329, 3398, 1771, 3022, 6907, 1523,
+ 8588, 12203, 666, 2113, 7916, 434, 1636, 5185,
+ 1062, 664, 952, 3490, 2811, 2749, 2848, 15555,
+ 363, 117, 1494, 1647, 5886, 4021, 633, 1013,
+ 5951, 11343, 2324, 243, 372, 943, 734, 242,
+ 3161, 122, 127, 201, 1654, 768, 134, 1467,
+ 642, 1148, 2156, 1368, 1176, 302, 1909, 61,
+ 223, 1812, 287, 422, 311, 228, 748, 230,
+ 1876, 539, 1814, 737, 689, 1140, 591, 943,
+ 353, 289, 198, 490, 7938, 1841, 850, 457,
+ 814, 146, 551, 728, 1627, 620, 648, 1621,
+ 2731, 535, 88, 1736, 736, 328, 293, 3170,
+ 344, 384, 7640, 433, 215, 715, 626, 128,
+ 3059, 1833, 2069, 3732, 1640, 1508, 836, 567,
+ 2837, 1151, 2068, 695, 1494, 3173, 364, 88,
+ 188, 740, 677, 273, 1533, 821, 1091, 293,
+ 647, 318, 1202, 328, 532, 2847, 526, 721,
+ 370, 258, 956, 1269, 1641, 339, 1322, 4485,
+ 286, 1874, 277, 757, 1393, 1330, 380, 146,
+ 377, 394, 318, 339, 1477, 1886, 101, 1435,
+ 284, 1425, 686, 621, 221, 117, 87, 1340,
+ 201, 1243, 1222, 651, 1899, 421, 712, 1016,
+ 1279, 124, 351, 258, 7043, 368, 666, 162,
+ 7664, 137, 70159, 26179, 6321, 32236, 33320, 771,
+ 1169, 269, 1103, 444, 364, 2710, 121, 751,
+ 1609, 855, 1141, 2287, 1940, 3943, 289])
diff --git a/audio_detection/audio_infer/utils/crash.py b/audio_detection/audio_infer/utils/crash.py
new file mode 100644
index 0000000000000000000000000000000000000000..98a06e20bc793687ec259e23c8b9e503887b34f5
--- /dev/null
+++ b/audio_detection/audio_infer/utils/crash.py
@@ -0,0 +1,12 @@
+import sys
+
+class ExceptionHook:
+ instance = None
+ def __call__(self, *args, **kwargs):
+ if self.instance is None:
+ from IPython.core import ultratb
+ self.instance = ultratb.FormattedTB(mode='Plain',
+ color_scheme='Linux', call_pdb=1)
+ return self.instance(*args, **kwargs)
+
+sys.excepthook = ExceptionHook()
diff --git a/audio_detection/audio_infer/utils/create_black_list.py b/audio_detection/audio_infer/utils/create_black_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadbe94599997e3476f37f8c4cdd30ca86a8720e
--- /dev/null
+++ b/audio_detection/audio_infer/utils/create_black_list.py
@@ -0,0 +1,64 @@
+import argparse
+import csv
+import os
+
+from utilities import create_folder
+
+
+def dcase2017task4(args):
+ """Create black list. Black list is a list of audio ids that will be
+ skipped in training.
+ """
+
+    # Arguments & parameters
+ workspace = args.workspace
+
+ # Black list from DCASE 2017 Task 4
+ test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv'
+ evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv'
+
+ black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv')
+ create_folder(os.path.dirname(black_list_csv))
+
+ def get_id_sets(csv_path):
+ with open(csv_path, 'r') as fr:
+ reader = csv.reader(fr, delimiter='\t')
+ lines = list(reader)
+
+ ids_set = []
+
+ for line in lines:
+ """line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']"""
+ ids_set.append(line[0][0 : 11])
+
+ ids_set = list(set(ids_set))
+ return ids_set
+
+ test_ids_set = get_id_sets(test_weak_csv)
+ evaluation_ids_set = get_id_sets(evaluation_weak_csv)
+
+ full_ids_set = test_ids_set + evaluation_ids_set
+
+ # Write black list
+ fw = open(black_list_csv, 'w')
+
+ for id in full_ids_set:
+ fw.write('{}\n'.format(id))
+
+ print('Write black list to {}'.format(black_list_csv))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_dcase2017task4 = subparsers.add_parser('dcase2017task4')
+ parser_dcase2017task4.add_argument('--workspace', type=str, required=True)
+
+ args = parser.parse_args()
+
+ if args.mode == 'dcase2017task4':
+ dcase2017task4(args)
+
+ else:
+ raise Exception('Error argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/create_indexes.py b/audio_detection/audio_infer/utils/create_indexes.py
new file mode 100644
index 0000000000000000000000000000000000000000..78be38cb3c693fa9ef7b44c52c407640e9e32aab
--- /dev/null
+++ b/audio_detection/audio_infer/utils/create_indexes.py
@@ -0,0 +1,126 @@
+import numpy as np
+import argparse
+import csv
+import os
+import glob
+import datetime
+import time
+import logging
+import h5py
+import librosa
+
+from utilities import create_folder, get_sub_filepaths
+import config
+
+
+def create_indexes(args):
+ """Create indexes a for dataloader to read for training. When users have
+ a new task and their own data, they need to create similar indexes. The
+ indexes contain meta information of "where to find the data for training".
+ """
+
+ # Arguments & parameters
+ waveforms_hdf5_path = args.waveforms_hdf5_path
+ indexes_hdf5_path = args.indexes_hdf5_path
+
+ # Paths
+ create_folder(os.path.dirname(indexes_hdf5_path))
+
+ with h5py.File(waveforms_hdf5_path, 'r') as hr:
+ with h5py.File(indexes_hdf5_path, 'w') as hw:
+ audios_num = len(hr['audio_name'])
+ hw.create_dataset('audio_name', data=hr['audio_name'][:], dtype='S20')
+            hw.create_dataset('target', data=hr['target'][:], dtype=bool)
+ hw.create_dataset('hdf5_path', data=[waveforms_hdf5_path.encode()] * audios_num, dtype='S200')
+ hw.create_dataset('index_in_hdf5', data=np.arange(audios_num), dtype=np.int32)
+
+ print('Write to {}'.format(indexes_hdf5_path))
+
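+# Example usage (illustrative placeholder paths, not part of the original code):
+#   python create_indexes.py create_indexes \
+#       --waveforms_hdf5_path=<packed_waveforms.h5> \
+#       --indexes_hdf5_path=<output_indexes.h5>
+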
+
+def combine_full_indexes(args):
+ """Combine all balanced and unbalanced indexes hdf5s to a single hdf5. This
+ combined indexes hdf5 is used for training with full data (~20k balanced
+ audio clips + ~1.9m unbalanced audio clips).
+ """
+
+ # Arguments & parameters
+ indexes_hdf5s_dir = args.indexes_hdf5s_dir
+ full_indexes_hdf5_path = args.full_indexes_hdf5_path
+
+ classes_num = config.classes_num
+
+ # Paths
+ paths = get_sub_filepaths(indexes_hdf5s_dir)
+ paths = [path for path in paths if (
+ 'train' in path and 'full_train' not in path and 'mini' not in path)]
+
+ print('Total {} hdf5 to combine.'.format(len(paths)))
+
+ with h5py.File(full_indexes_hdf5_path, 'w') as full_hf:
+ full_hf.create_dataset(
+ name='audio_name',
+ shape=(0,),
+ maxshape=(None,),
+ dtype='S20')
+
+ full_hf.create_dataset(
+ name='target',
+ shape=(0, classes_num),
+ maxshape=(None, classes_num),
+            dtype=bool)
+
+ full_hf.create_dataset(
+ name='hdf5_path',
+ shape=(0,),
+ maxshape=(None,),
+ dtype='S200')
+
+ full_hf.create_dataset(
+ name='index_in_hdf5',
+ shape=(0,),
+ maxshape=(None,),
+ dtype=np.int32)
+
+ for path in paths:
+ with h5py.File(path, 'r') as part_hf:
+ print(path)
+ n = len(full_hf['audio_name'][:])
+ new_n = n + len(part_hf['audio_name'][:])
+
+ full_hf['audio_name'].resize((new_n,))
+ full_hf['audio_name'][n : new_n] = part_hf['audio_name'][:]
+
+ full_hf['target'].resize((new_n, classes_num))
+ full_hf['target'][n : new_n] = part_hf['target'][:]
+
+ full_hf['hdf5_path'].resize((new_n,))
+ full_hf['hdf5_path'][n : new_n] = part_hf['hdf5_path'][:]
+
+ full_hf['index_in_hdf5'].resize((new_n,))
+ full_hf['index_in_hdf5'][n : new_n] = part_hf['index_in_hdf5'][:]
+
+ print('Write combined full hdf5 to {}'.format(full_indexes_hdf5_path))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_create_indexes = subparsers.add_parser('create_indexes')
+ parser_create_indexes.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path of packed waveforms hdf5.')
+ parser_create_indexes.add_argument('--indexes_hdf5_path', type=str, required=True, help='Path to write out indexes hdf5.')
+
+ parser_combine_full_indexes = subparsers.add_parser('combine_full_indexes')
+ parser_combine_full_indexes.add_argument('--indexes_hdf5s_dir', type=str, required=True, help='Directory containing indexes hdf5s to be combined.')
+ parser_combine_full_indexes.add_argument('--full_indexes_hdf5_path', type=str, required=True, help='Path to write out full indexes hdf5 file.')
+
+ args = parser.parse_args()
+
+ if args.mode == 'create_indexes':
+ create_indexes(args)
+
+ elif args.mode == 'combine_full_indexes':
+ combine_full_indexes(args)
+
+ else:
+ raise Exception('Incorrect arguments!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/data_generator.py b/audio_detection/audio_infer/utils/data_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94b6d990b6726c791cbb4cb660abdb93233f965
--- /dev/null
+++ b/audio_detection/audio_infer/utils/data_generator.py
@@ -0,0 +1,421 @@
+import numpy as np
+import h5py
+import csv
+import time
+import logging
+
+from utilities import int16_to_float32
+
+
+def read_black_list(black_list_csv):
+ """Read audio names from black list.
+ """
+ with open(black_list_csv, 'r') as fr:
+ reader = csv.reader(fr)
+ lines = list(reader)
+
+ black_list_names = ['Y{}.wav'.format(line[0]) for line in lines]
+ return black_list_names
+
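+# Editorial note: the 'Y{}' prefix matches the audio file naming used when
+# packing waveforms (see utils/dataset.py, which writes 'Y<audio_id>.wav').
+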
+
+class AudioSetDataset(object):
+ def __init__(self, sample_rate=32000):
+ """This class takes the meta of an audio clip as input, and return
+ the waveform and target of the audio clip. This class is used by DataLoader.
+ """
+ self.sample_rate = sample_rate
+
+ def __getitem__(self, meta):
+ """Load waveform and target of an audio clip.
+
+ Args:
+ meta: {
+ 'hdf5_path': str,
+ 'index_in_hdf5': int}
+
+ Returns:
+ data_dict: {
+ 'audio_name': str,
+ 'waveform': (clip_samples,),
+ 'target': (classes_num,)}
+ """
+ hdf5_path = meta['hdf5_path']
+ index_in_hdf5 = meta['index_in_hdf5']
+ with h5py.File(hdf5_path, 'r') as hf:
+ audio_name = hf['audio_name'][index_in_hdf5].decode()
+ waveform = int16_to_float32(hf['waveform'][index_in_hdf5])
+ waveform = self.resample(waveform)
+ target = hf['target'][index_in_hdf5].astype(np.float32)
+
+ data_dict = {
+ 'audio_name': audio_name, 'waveform': waveform, 'target': target}
+
+ return data_dict
+
+ def resample(self, waveform):
+ """Resample.
+
+ Args:
+ waveform: (clip_samples,)
+
+ Returns:
+ (resampled_clip_samples,)
+ """
+ if self.sample_rate == 32000:
+ return waveform
+ elif self.sample_rate == 16000:
+ return waveform[0 :: 2]
+ elif self.sample_rate == 8000:
+ return waveform[0 :: 4]
+ else:
+ raise Exception('Incorrect sample rate!')
+
+
+class Base(object):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, random_seed):
+ """Base class of train sampler.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ self.batch_size = batch_size
+ self.random_state = np.random.RandomState(random_seed)
+
+ # Black list
+ if black_list_csv:
+ self.black_list_names = read_black_list(black_list_csv)
+ else:
+ self.black_list_names = []
+
+ logging.info('Black list samples: {}'.format(len(self.black_list_names)))
+
+ # Load target
+ load_time = time.time()
+
+ with h5py.File(indexes_hdf5_path, 'r') as hf:
+ self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]]
+ self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]]
+ self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
+ self.targets = hf['target'][:].astype(np.float32)
+
+ (self.audios_num, self.classes_num) = self.targets.shape
+ logging.info('Training number: {}'.format(self.audios_num))
+ logging.info('Load target time: {:.3f} s'.format(time.time() - load_time))
+
+
+class TrainSampler(Base):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
+ random_seed=1234):
+ """Balanced sampler. Generate batch meta for training.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ super(TrainSampler, self).__init__(indexes_hdf5_path, batch_size,
+ black_list_csv, random_seed)
+
+ self.indexes = np.arange(self.audios_num)
+
+ # Shuffle indexes
+ self.random_state.shuffle(self.indexes)
+
+ self.pointer = 0
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string, 'index_in_hdf5': int},
+ ...]
+ """
+ batch_size = self.batch_size
+
+ while True:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ index = self.indexes[self.pointer]
+ self.pointer += 1
+
+ # Shuffle indexes and reset pointer
+ if self.pointer >= self.audios_num:
+ self.pointer = 0
+ self.random_state.shuffle(self.indexes)
+
+ # If audio in black list then continue
+ if self.audio_names[index] in self.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.hdf5_paths[index],
+ 'index_in_hdf5': self.indexes_in_hdf5[index]})
+ i += 1
+
+ yield batch_meta
+
+ def state_dict(self):
+ state = {
+ 'indexes': self.indexes,
+ 'pointer': self.pointer}
+ return state
+
+ def load_state_dict(self, state):
+ self.indexes = state['indexes']
+ self.pointer = state['pointer']
+
+
+class BalancedTrainSampler(Base):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
+ random_seed=1234):
+ """Balanced sampler. Generate batch meta for training. Data are equally
+ sampled from different sound classes.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ super(BalancedTrainSampler, self).__init__(indexes_hdf5_path,
+ batch_size, black_list_csv, random_seed)
+
+ self.samples_num_per_class = np.sum(self.targets, axis=0)
+ logging.info('samples_num_per_class: {}'.format(
+ self.samples_num_per_class.astype(np.int32)))
+
+ # Training indexes of all sound classes. E.g.:
+ # [[0, 11, 12, ...], [3, 4, 15, 16, ...], [7, 8, ...], ...]
+ self.indexes_per_class = []
+
+ for k in range(self.classes_num):
+ self.indexes_per_class.append(
+ np.where(self.targets[:, k] == 1)[0])
+
+ # Shuffle indexes
+ for k in range(self.classes_num):
+ self.random_state.shuffle(self.indexes_per_class[k])
+
+ self.queue = []
+ self.pointers_of_classes = [0] * self.classes_num
+
+ def expand_queue(self, queue):
+ classes_set = np.arange(self.classes_num).tolist()
+ self.random_state.shuffle(classes_set)
+ queue += classes_set
+ return queue
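+
+    # Editorial note: expand_queue appends one randomly shuffled pass over all
+    # class ids to the queue, so every class is drawn equally often regardless
+    # of how many clips it has.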
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string, 'index_in_hdf5': int},
+ ...]
+ """
+ batch_size = self.batch_size
+
+ while True:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ if len(self.queue) == 0:
+ self.queue = self.expand_queue(self.queue)
+
+ class_id = self.queue.pop(0)
+ pointer = self.pointers_of_classes[class_id]
+ self.pointers_of_classes[class_id] += 1
+ index = self.indexes_per_class[class_id][pointer]
+
+                # When one epoch of a sound class finishes, shuffle its indexes and reset the pointer
+ if self.pointers_of_classes[class_id] >= self.samples_num_per_class[class_id]:
+ self.pointers_of_classes[class_id] = 0
+ self.random_state.shuffle(self.indexes_per_class[class_id])
+
+ # If audio in black list then continue
+ if self.audio_names[index] in self.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.hdf5_paths[index],
+ 'index_in_hdf5': self.indexes_in_hdf5[index]})
+ i += 1
+
+ yield batch_meta
+
+ def state_dict(self):
+ state = {
+ 'indexes_per_class': self.indexes_per_class,
+ 'queue': self.queue,
+ 'pointers_of_classes': self.pointers_of_classes}
+ return state
+
+ def load_state_dict(self, state):
+ self.indexes_per_class = state['indexes_per_class']
+ self.queue = state['queue']
+ self.pointers_of_classes = state['pointers_of_classes']
+
+
+class AlternateTrainSampler(Base):
+ def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
+ random_seed=1234):
+ """AlternateSampler is a combination of Sampler and Balanced Sampler.
+ AlternateSampler alternately sample data from Sampler and Blanced Sampler.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ black_list_csv: string
+ random_seed: int
+ """
+ self.sampler1 = TrainSampler(indexes_hdf5_path, batch_size,
+ black_list_csv, random_seed)
+
+ self.sampler2 = BalancedTrainSampler(indexes_hdf5_path, batch_size,
+ black_list_csv, random_seed)
+
+ self.batch_size = batch_size
+ self.count = 0
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string, 'index_in_hdf5': int},
+ ...]
+ """
+ batch_size = self.batch_size
+
+ while True:
+ self.count += 1
+
+ if self.count % 2 == 0:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ index = self.sampler1.indexes[self.sampler1.pointer]
+ self.sampler1.pointer += 1
+
+ # Shuffle indexes and reset pointer
+ if self.sampler1.pointer >= self.sampler1.audios_num:
+ self.sampler1.pointer = 0
+ self.sampler1.random_state.shuffle(self.sampler1.indexes)
+
+ # If audio in black list then continue
+ if self.sampler1.audio_names[index] in self.sampler1.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.sampler1.hdf5_paths[index],
+ 'index_in_hdf5': self.sampler1.indexes_in_hdf5[index]})
+ i += 1
+
+ elif self.count % 2 == 1:
+ batch_meta = []
+ i = 0
+ while i < batch_size:
+ if len(self.sampler2.queue) == 0:
+ self.sampler2.queue = self.sampler2.expand_queue(self.sampler2.queue)
+
+ class_id = self.sampler2.queue.pop(0)
+ pointer = self.sampler2.pointers_of_classes[class_id]
+ self.sampler2.pointers_of_classes[class_id] += 1
+ index = self.sampler2.indexes_per_class[class_id][pointer]
+
+                    # When one epoch of a sound class finishes, shuffle its indexes and reset the pointer
+ if self.sampler2.pointers_of_classes[class_id] >= self.sampler2.samples_num_per_class[class_id]:
+ self.sampler2.pointers_of_classes[class_id] = 0
+ self.sampler2.random_state.shuffle(self.sampler2.indexes_per_class[class_id])
+
+ # If audio in black list then continue
+ if self.sampler2.audio_names[index] in self.sampler2.black_list_names:
+ continue
+ else:
+ batch_meta.append({
+ 'hdf5_path': self.sampler2.hdf5_paths[index],
+ 'index_in_hdf5': self.sampler2.indexes_in_hdf5[index]})
+ i += 1
+
+ yield batch_meta
+
+ def state_dict(self):
+ state = {
+ 'sampler1': self.sampler1.state_dict(),
+ 'sampler2': self.sampler2.state_dict()}
+ return state
+
+ def load_state_dict(self, state):
+ self.sampler1.load_state_dict(state['sampler1'])
+ self.sampler2.load_state_dict(state['sampler2'])
+
+
+class EvaluateSampler(object):
+ def __init__(self, indexes_hdf5_path, batch_size):
+ """Evaluate sampler. Generate batch meta for evaluation.
+
+ Args:
+ indexes_hdf5_path: string
+ batch_size: int
+ """
+ self.batch_size = batch_size
+
+ with h5py.File(indexes_hdf5_path, 'r') as hf:
+ self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]]
+ self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]]
+ self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
+ self.targets = hf['target'][:].astype(np.float32)
+
+ self.audios_num = len(self.audio_names)
+
+ def __iter__(self):
+ """Generate batch meta for training.
+
+ Returns:
+ batch_meta: e.g.: [
+ {'hdf5_path': string,
+ 'index_in_hdf5': int}
+ ...]
+ """
+ batch_size = self.batch_size
+ pointer = 0
+
+ while pointer < self.audios_num:
+ batch_indexes = np.arange(pointer,
+ min(pointer + batch_size, self.audios_num))
+
+ batch_meta = []
+
+ for index in batch_indexes:
+ batch_meta.append({
+ 'audio_name': self.audio_names[index],
+ 'hdf5_path': self.hdf5_paths[index],
+ 'index_in_hdf5': self.indexes_in_hdf5[index],
+ 'target': self.targets[index]})
+
+ pointer += batch_size
+ yield batch_meta
+
+
+def collate_fn(list_data_dict):
+ """Collate data.
+ Args:
+ list_data_dict, e.g., [{'audio_name': str, 'waveform': (clip_samples,), ...},
+ {'audio_name': str, 'waveform': (clip_samples,), ...},
+ ...]
+ Returns:
+ np_data_dict, dict, e.g.,
+ {'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...}
+ """
+ np_data_dict = {}
+
+ for key in list_data_dict[0].keys():
+ np_data_dict[key] = np.array([data_dict[key] for data_dict in list_data_dict])
+
+ return np_data_dict
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/dataset.py b/audio_detection/audio_infer/utils/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f11755a027de97d236e447e576c4b1ed4e8a36
--- /dev/null
+++ b/audio_detection/audio_infer/utils/dataset.py
@@ -0,0 +1,224 @@
+import numpy as np
+import argparse
+import csv
+import os
+import glob
+import datetime
+import time
+import logging
+import h5py
+import librosa
+
+from utilities import (create_folder, get_filename, create_logging,
+ float32_to_int16, pad_or_truncate, read_metadata)
+import config
+
+
+def split_unbalanced_csv_to_partial_csvs(args):
+ """Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids.
+ """
+
+ unbalanced_csv_path = args.unbalanced_csv
+ unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir
+
+ create_folder(unbalanced_partial_csvs_dir)
+
+ with open(unbalanced_csv_path, 'r') as f:
+ lines = f.readlines()
+
+ lines = lines[3:] # Remove head info
+ audios_num_per_file = 50000
+
+ files_num = int(np.ceil(len(lines) / float(audios_num_per_file)))
+
+ for r in range(files_num):
+ lines_per_file = lines[r * audios_num_per_file :
+ (r + 1) * audios_num_per_file]
+
+ out_csv_path = os.path.join(unbalanced_partial_csvs_dir,
+ 'unbalanced_train_segments_part{:02d}.csv'.format(r))
+
+ with open(out_csv_path, 'w') as f:
+ f.write('empty\n')
+ f.write('empty\n')
+ f.write('empty\n')
+ for line in lines_per_file:
+ f.write(line)
+
+ print('Write out csv to {}'.format(out_csv_path))
+
+
+def download_wavs(args):
+ """Download videos and extract audio in wav format.
+ """
+
+ # Paths
+ csv_path = args.csv_path
+ audios_dir = args.audios_dir
+ mini_data = args.mini_data
+
+ if mini_data:
+        logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path))
+    else:
+        logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path))
+
+ create_folder(audios_dir)
+ create_folder(logs_dir)
+ create_logging(logs_dir, filemode='w')
+ logging.info('Download log is saved to {}'.format(logs_dir))
+
+ # Read csv
+ with open(csv_path, 'r') as f:
+ lines = f.readlines()
+
+ lines = lines[3:] # Remove csv head info
+
+ if mini_data:
+ lines = lines[0 : 10] # Download partial data for debug
+
+ download_time = time.time()
+
+ # Download
+ for (n, line) in enumerate(lines):
+
+ items = line.split(', ')
+ audio_id = items[0]
+ start_time = float(items[1])
+ end_time = float(items[2])
+ duration = end_time - start_time
+
+ logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format(
+ n, audio_id, start_time, end_time))
+
+ # Download full video of whatever format
+ video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id))
+ os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\
+ .format(video_name, audio_id))
+
+ video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*'))
+
+ # If download successful
+ if len(video_paths) > 0:
+ video_path = video_paths[0] # Choose one video
+
+            # Prepend 'Y' because some video ids start with '-',
+            # which can cause problems
+ audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav')
+
+ # Extract audio in wav format
+ os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\
+ .format(video_path,
+ str(datetime.timedelta(seconds=start_time)), duration,
+ audio_path))
+
+ # Remove downloaded video
+ os.system("rm {}".format(video_path))
+
+ logging.info("Download and convert to {}".format(audio_path))
+
+ logging.info('Download finished! Time spent: {:.3f} s'.format(
+ time.time() - download_time))
+
+ logging.info('Logs can be viewed in {}'.format(logs_dir))
+
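+# Example usage (illustrative placeholder paths; assumes youtube-dl and ffmpeg
+# are available on PATH, as invoked via os.system above):
+#   python dataset.py download_wavs --csv_path=<segments_csv> --audios_dir=<out_dir>
+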
+
+def pack_waveforms_to_hdf5(args):
+ """Pack waveform and target of several audio clips to a single hdf5 file.
+ This can speed up loading and training.
+ """
+
+ # Arguments & parameters
+ audios_dir = args.audios_dir
+ csv_path = args.csv_path
+ waveforms_hdf5_path = args.waveforms_hdf5_path
+ mini_data = args.mini_data
+
+ clip_samples = config.clip_samples
+ classes_num = config.classes_num
+ sample_rate = config.sample_rate
+ id_to_ix = config.id_to_ix
+
+ # Paths
+ if mini_data:
+ prefix = 'mini_'
+ waveforms_hdf5_path += '.mini'
+ else:
+ prefix = ''
+
+ create_folder(os.path.dirname(waveforms_hdf5_path))
+
+ logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path))
+ create_folder(logs_dir)
+ create_logging(logs_dir, filemode='w')
+ logging.info('Write logs to {}'.format(logs_dir))
+
+ # Read csv file
+ meta_dict = read_metadata(csv_path, classes_num, id_to_ix)
+
+ if mini_data:
+ mini_num = 10
+ for key in meta_dict.keys():
+ meta_dict[key] = meta_dict[key][0 : mini_num]
+
+ audios_num = len(meta_dict['audio_name'])
+
+ # Pack waveform to hdf5
+ total_time = time.time()
+
+ with h5py.File(waveforms_hdf5_path, 'w') as hf:
+        hf.create_dataset('audio_name', shape=(audios_num,), dtype='S20')
+        hf.create_dataset('waveform', shape=(audios_num, clip_samples), dtype=np.int16)
+        hf.create_dataset('target', shape=(audios_num, classes_num), dtype=bool)
+ hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)
+
+ # Pack waveform & target of several audio clips to a single hdf5 file
+ for n in range(audios_num):
+ audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])
+
+ if os.path.isfile(audio_path):
+ logging.info('{} {}'.format(n, audio_path))
+ (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+ audio = pad_or_truncate(audio, clip_samples)
+
+ hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
+ hf['waveform'][n] = float32_to_int16(audio)
+ hf['target'][n] = meta_dict['target'][n]
+ else:
+ logging.info('{} File does not exist! {}'.format(n, audio_path))
+
+ logging.info('Write to {}'.format(waveforms_hdf5_path))
+ logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs')
+ parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.')
+ parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.')
+
+ parser_download_wavs = subparsers.add_parser('download_wavs')
+ parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
+ parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
+    parser_download_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
+
+ parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5')
+ parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
+ parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
+ parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.')
+ parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
+
+ args = parser.parse_args()
+
+ if args.mode == 'split_unbalanced_csv_to_partial_csvs':
+ split_unbalanced_csv_to_partial_csvs(args)
+
+ elif args.mode == 'download_wavs':
+ download_wavs(args)
+
+ elif args.mode == 'pack_waveforms_to_hdf5':
+ pack_waveforms_to_hdf5(args)
+
+ else:
+ raise Exception('Incorrect arguments!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/plot_for_paper.py b/audio_detection/audio_infer/utils/plot_for_paper.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e799a7e7eea9ffc5bced214a8beb0a558842eb
--- /dev/null
+++ b/audio_detection/audio_infer/utils/plot_for_paper.py
@@ -0,0 +1,565 @@
+import os
+import sys
+import numpy as np
+import argparse
+import h5py
+import time
+import pickle
+import matplotlib.pyplot as plt
+import csv
+from sklearn import metrics
+
+from utilities import (create_folder, get_filename, d_prime)
+import config
+
+
+def load_statistics(statistics_path):
+ statistics_dict = pickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+
+ return bal_map, test_map
+
+
+def crop_label(label):
+ max_len = 16
+ if len(label) <= max_len:
+ return label
+ else:
+ words = label.split(' ')
+ cropped_label = ''
+ for w in words:
+ if len(cropped_label + ' ' + w) > max_len:
+ break
+ else:
+ cropped_label += ' {}'.format(w)
+ return cropped_label
+
+
+def add_comma(integer):
+ """E.g., 1234567 -> 1,234,567
+ """
+    integer = int(integer)
+    return '{:,}'.format(integer)
+
+
+def plot_classwise_iteration_map(args):
+
+ # Paths
+ save_out_path = 'results/classwise_iteration_map.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Load statistics
+ statistics_dict = pickle.load(open('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl', 'rb'))
+
+ mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']])
+ mAP_mat = mAP_mat[0 : 300, :] # 300 * 2000 = 600k iterations
+ sorted_indexes = np.argsort(config.full_samples_per_class)[::-1]
+
+ fig, axs = plt.subplots(1, 3, figsize=(20, 5))
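+    # Plot per-class AP curves for three groups of 10 classes, taken from the 527
+    # AudioSet classes sorted by number of training samples: the most frequent
+    # (ranks 0-9), mid-frequency (250-259) and rarest (517-526) classes.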
+ ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)]
+ axs[0].set_ylabel('AP')
+
+ for col in range(0, 3):
+ axs[col].set_ylim(0, 1.)
+ axs[col].set_xlim(0, 301)
+ axs[col].set_xlabel('Iterations')
+ axs[col].set_ylabel('AP')
+ axs[col].xaxis.set_ticks(np.arange(0, 301, 100))
+ axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k'])
+ lines = []
+ for _ix in ranges[col]:
+ _label = crop_label(config.labels[sorted_indexes[_ix]]) + \
+ ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]]))
+ line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label)
+ lines.append(line)
+ box = axs[col].get_position()
+ axs[col].set_position([box.x0, box.y0, box.width * 1., box.height])
+ axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.))
+ axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+ plt.tight_layout(pad=4, w_pad=1, h_pad=1)
+ plt.savefig(save_out_path)
+ print(save_out_path)
+
+
+def plot_six_figures(args):
+
+ # Arguments & parameters
+ classes_num = config.classes_num
+ labels = config.labels
+ max_plot_iteration = 540000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ # Paths
+ class_labels_indices_path = os.path.join('metadata', 'class_labels_indices.csv')
+ save_out_path = 'results/six_figures.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Plot
+ fig, ax = plt.subplots(2, 3, figsize=(14, 7))
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ linewidth = 1.
+
+ # (a) Comparison of architectures
+ if True:
+ lines = []
+
+ # Wavegram-Logmel-CNN
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl')
+ line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # MobileNetV1
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_MobileNetV1_balanced_mixup_bs32.pkl')
+ line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 0].legend(handles=lines, loc=2)
+ ax[0, 0].set_title('(a) Comparison of architectures')
+
+    # (b) Comparison of training data and augmentation
+ if True:
+ lines = []
+
+ # Full data + balanced sampler + mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Full data + balanced sampler + mixup in time domain
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_timedomain_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Full data + balanced sampler + no mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_nomixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Full data + uniform sampler + no mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_nobalanced_nomixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Balanced data + balanced sampler + mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Balanced data + balanced sampler + no mixup
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_nomixup_bs32.pkl')
+ line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 1].legend(handles=lines, loc=2, fontsize=8)
+ ax[0, 1].set_title('(b) Comparison of training data and augmentation')
+
+ # (c) Comparison of embedding size
+ if True:
+ lines = []
+
+ # Embedding size 2048
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Embedding size 128
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb128_balanced_mixup_bs32.pkl')
+ line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Embedding size 32
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb32_balanced_mixup_bs32.pkl')
+ line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 2].legend(handles=lines, loc=2)
+ ax[0, 2].set_title('(c) Comparison of embedding size')
+
+ # (d) Comparison of amount of training data
+ if True:
+ lines = []
+
+ # 100% of full training data
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # 80% of full training data
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.8full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # 50% of full training data
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.5full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+        line, = ax[1, 0].plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 0].legend(handles=lines, loc=2)
+ ax[1, 0].set_title('(d) Comparison of amount of training data')
+
+ # (e) Comparison of sampling rate
+ if True:
+ lines = []
+
+ # Cnn14 + 32 kHz
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 16 kHz
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_16k_balanced_mixup_bs32.pkl')
+ line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 8 kHz
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_8k_balanced_mixup_bs32.pkl')
+ line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 1].legend(handles=lines, loc=2)
+ ax[1, 1].set_title('(e) Comparison of sampling rate')
+
+ # (f) Comparison of mel bins number
+ if True:
+ lines = []
+
+ # Cnn14 + 128 mel bins
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel128_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+        line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 64 mel bins
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+ line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # Cnn14 + 32 mel bins
+ (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel32_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
+        line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 2].legend(handles=lines, loc=2)
+ ax[1, 2].set_title('(f) Comparison of mel bins number')
+
+ for i in range(2):
+ for j in range(3):
+ ax[i, j].set_ylim(0, 0.8)
+ ax[i, j].set_xlim(0, len(iterations))
+ ax[i, j].set_xlabel('Iterations')
+ ax[i, j].set_ylabel('mAP')
+ ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50))
+ ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
+ ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05))
+ ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3',
+ '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
+ ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+    plt.tight_layout(pad=0, h_pad=1, w_pad=0)
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def plot_complexity_map(args):
+
+ # Paths
+ save_out_path = 'results/complexity_mAP.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
+
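+    # Hard-coded per-architecture complexity (multi-adds) and mAP numbers; these
+    # appear to be the evaluation results reported for the PANNs models.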
+ model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
+ 'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
+ 'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
+ flops = np.array([21.986, 28.166, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
+ 30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
+ mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
+ 0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
+
+    ax.scatter(flops, mAPs)
+
+ shift = [[-5.5, -0.004], [1, -0.004], [-1, -0.014], [-2, 0.006], [-7, 0.006],
+ [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
+ [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
+
+ for i, model_type in enumerate(model_types):
+ ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
+
+ ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
+ ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
+ ax.plot(flops[[6, 7]], mAPs[[6, 7]])
+ ax.plot(flops[[9, 10]], mAPs[[9, 10]])
+ ax.plot(flops[[11, 12]], mAPs[[11, 12]])
+ ax.plot(flops[[13, 14]], mAPs[[13, 14]])
+
+ ax.set_xlim(0, 70)
+ ax.set_ylim(0.2, 0.5)
+    ax.set_xlabel('Multi-adds (million)', fontsize=15)
+ ax.set_ylabel('mAP', fontsize=15)
+ ax.tick_params(axis='x', labelsize=12)
+ ax.tick_params(axis='y', labelsize=12)
+
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+
+ plt.savefig(save_out_path)
+ print('Write out figure to {}'.format(save_out_path))
+
+
+def plot_long_fig(args):
+
+ # Paths
+ stats = pickle.load(open('paper_statistics/stats_for_long_fig.pkl', 'rb'))
+
+ save_out_path = 'results/long_fig.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Load meta
+ N = len(config.labels)
+ sorted_indexes = stats['sorted_indexes_for_plot']
+ sorted_labels = np.array(config.labels)[sorted_indexes]
+ audio_clips_per_class = stats['official_balanced_training_samples'] + stats['official_unbalanced_training_samples']
+ audio_clips_per_class = audio_clips_per_class[sorted_indexes]
+
+ # Prepare axes for plot
+ (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
+
+ # plot the number of training samples
+ ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+
+ # Load mAP of different systems
+ """Average instance system of [1] with an mAP of 0.317.
+ [1] Kong, Qiuqiang, Changsong Yu, Yong Xu, Turab Iqbal, Wenwu Wang, and
+ Mark D. Plumbley. "Weakly labelled audioset tagging with attention neural
+ networks." IEEE/ACM Transactions on Audio, Speech, and Language Processing
+ 27, no. 11 (2019): 1791-1802."""
+ maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
+ maps_avg_instances = maps_avg_instances[sorted_indexes]
+
+ # PANNs Cnn14
+ maps_panns_cnn14 = stats['panns_cnn14']['eval']['average_precision']
+ maps_panns_cnn14 = maps_panns_cnn14[sorted_indexes]
+
+ # PANNs MobileNetV1
+ maps_panns_mobilenetv1 = stats['panns_mobilenetv1']['eval']['average_precision']
+ maps_panns_mobilenetv1 = maps_panns_mobilenetv1[sorted_indexes]
+
+ # PANNs Wavegram-Logmel-Cnn14
+ maps_panns_wavegram_logmel_cnn14 = stats['panns_wavegram_logmel_cnn14']['eval']['average_precision']
+ maps_panns_wavegram_logmel_cnn14 = maps_panns_wavegram_logmel_cnn14[sorted_indexes]
+
+ # Plot mAPs
+ _scatter_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='g')
+ _scatter_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='r')
+ _scatter_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b')
+ _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k')
+
+ linewidth = 0.7
+ line0te = _plot_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b,
+ c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN')
+ line1te = _plot_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, c='r',
+ linewidth=linewidth, label='AP with CNN14')
+ line2te = _plot_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b',
+ linewidth=linewidth, label='AP with MobileNetV1')
+ line3te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k',
+ linewidth=linewidth, label='AP with averaging instances (baseline)')
+
+ # Plot label quality
+ label_quality = stats['label_quality']
+ sorted_label_quality = np.array(label_quality)[sorted_indexes]
+ for k in range(len(sorted_label_quality)):
+ if sorted_label_quality[k] and sorted_label_quality[k] == 1:
+ sorted_label_quality[k] = 0.99
+
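+    # Classes with a known label quality are drawn as '+' markers; classes whose
+    # quality is unknown (None) are drawn as '_' markers at a fixed height of 0.5.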
+ ax1b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax2b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax3b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
+ line_label_quality = ax4b.scatter(np.arange(N)[sorted_label_quality != None],
+ sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality')
+ ax1b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax2b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax3b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax4b.scatter(np.arange(N)[sorted_label_quality == None],
+ 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
+
+ plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1)
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+ plt.savefig(save_out_path)
+ print('Save fig to {}'.format(save_out_path))
+
+
+def prepare_plot_long_4_rows(sorted_lbs):
+ N = len(sorted_lbs)
+
+ f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1, sharey=False, facecolor='w', figsize=(10, 10.5))
+
+ fontsize = 5
+
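+    # The 527 class labels are spread over four stacked panels: the first three
+    # show K = 132 classes each and the last shows the remaining 131.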
+ K = 132
+ ax1a.set_xlim(0, K)
+ ax2a.set_xlim(K, 2 * K)
+ ax3a.set_xlim(2 * K, 3 * K)
+ ax4a.set_xlim(3 * K, N)
+
+ truncated_sorted_lbs = []
+ for lb in sorted_lbs:
+ lb = lb[0 : 25]
+ words = lb.split(' ')
+ if len(words[-1]) < 3:
+ lb = ' '.join(words[0:-1])
+ truncated_sorted_lbs.append(lb)
+
+ ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+
+ ax1a.set_yscale('log')
+ ax2a.set_yscale('log')
+ ax3a.set_yscale('log')
+ ax4a.set_yscale('log')
+
+ ax1b = ax1a.twinx()
+ ax2b = ax2a.twinx()
+ ax3b = ax3a.twinx()
+ ax4b = ax4a.twinx()
+ ax1b.set_ylim(0., 1.)
+ ax2b.set_ylim(0., 1.)
+ ax3b.set_ylim(0., 1.)
+ ax4b.set_ylim(0., 1.)
+ ax1b.set_ylabel('Average precision')
+ ax2b.set_ylabel('Average precision')
+ ax3b.set_ylabel('Average precision')
+ ax4b.set_ylabel('Average precision')
+
+ ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+
+ ax1a.xaxis.set_ticks(np.arange(K))
+ ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize)
+ ax1a.xaxis.tick_bottom()
+ ax1a.set_ylabel("Number of audio clips")
+
+ ax2a.xaxis.set_ticks(np.arange(K, 2*K))
+ ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize)
+ ax2a.xaxis.tick_bottom()
+ ax2a.set_ylabel("Number of audio clips")
+
+ ax3a.xaxis.set_ticks(np.arange(2*K, 3*K))
+ ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize)
+ ax3a.xaxis.tick_bottom()
+ ax3a.set_ylabel("Number of audio clips")
+
+ ax4a.xaxis.set_ticks(np.arange(3*K, N))
+ ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize)
+ ax4a.xaxis.tick_bottom()
+ ax4a.set_ylabel("Number of audio clips")
+
+ ax1a.spines['right'].set_visible(False)
+ ax1b.spines['right'].set_visible(False)
+ ax2a.spines['left'].set_visible(False)
+ ax2b.spines['left'].set_visible(False)
+ ax2a.spines['right'].set_visible(False)
+ ax2b.spines['right'].set_visible(False)
+ ax3a.spines['left'].set_visible(False)
+ ax3b.spines['left'].set_visible(False)
+ ax3a.spines['right'].set_visible(False)
+ ax3b.spines['right'].set_visible(False)
+ ax4a.spines['left'].set_visible(False)
+ ax4b.spines['left'].set_visible(False)
+
+    plt.subplots_adjust(hspace=0.8)
+
+ return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b
+
+
+def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
+ N = len(x)
+ ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+
+
+def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""):
+ N = len(x)
+ ax.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label)
+ return line
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_classwise_iteration_map = subparsers.add_parser('plot_classwise_iteration_map')
+ parser_six_figures = subparsers.add_parser('plot_six_figures')
+ parser_complexity_map = subparsers.add_parser('plot_complexity_map')
+ parser_long_fig = subparsers.add_parser('plot_long_fig')
+
+ args = parser.parse_args()
+
+ if args.mode == 'plot_classwise_iteration_map':
+ plot_classwise_iteration_map(args)
+
+ elif args.mode == 'plot_six_figures':
+ plot_six_figures(args)
+
+ elif args.mode == 'plot_complexity_map':
+ plot_complexity_map(args)
+
+ elif args.mode == 'plot_long_fig':
+ plot_long_fig(args)
+
+ else:
+ raise Exception('Incorrect argument!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/plot_statistics.py b/audio_detection/audio_infer/utils/plot_statistics.py
new file mode 100644
index 0000000000000000000000000000000000000000..bebb28af3e3468e8422c6901e1aba9600270ef89
--- /dev/null
+++ b/audio_detection/audio_infer/utils/plot_statistics.py
@@ -0,0 +1,2034 @@
+import os
+import sys
+import numpy as np
+import argparse
+import h5py
+import time
+import _pickle as cPickle
+import matplotlib.pyplot as plt
+import csv
+from sklearn import metrics
+
+from utilities import (create_folder, get_filename, d_prime)
+import config
+
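+# The module-level _load_metrics* helpers below read statistics from hard-coded
+# workspace paths, whereas the nested _load_metrics helpers defined inside
+# plot() and plot_for_paper() read from the --workspace passed on the command
+# line.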
+
+def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+
+def _load_metrics0_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ return statistics_dict['test'][300]['average_precision']
+
+
+def _load_metrics0_classwise2(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ k = 270
+ mAP = np.mean(statistics_dict['test'][k]['average_precision'])
+ mAUC = np.mean(statistics_dict['test'][k]['auc'])
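+    # d_prime() is imported from utilities; it is assumed to implement the usual
+    # definition d' = sqrt(2) * Phi^{-1}(AUC), i.e. roughly:
+    #     from scipy import stats
+    #     def d_prime(auc):
+    #         return np.sqrt(2.0) * stats.norm.ppf(auc)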
+ dprime = d_prime(mAUC)
+ return mAP, mAUC, dprime
+
+
+def _load_metrics_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace = '/mnt/cephfs_new_wj/speechsv/kongqiuqiang/workspaces/cvssp/pub_audioset_tagging_cnn'
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ k = 300
+ mAP = np.mean(statistics_dict['test'][k]['average_precision'])
+ mAUC = np.mean(statistics_dict['test'][k]['auc'])
+ dprime = d_prime(mAUC)
+ return mAP, mAUC, dprime
+
+
+def plot(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+ select = args.select
+
+ classes_num = config.classes_num
+ max_plot_iteration = 1000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ class_labels_indices_path = os.path.join(dataset_dir, 'metadata',
+ 'class_labels_indices.csv')
+
+ save_out_path = 'results/{}.pdf'.format(select)
+ create_folder(os.path.dirname(save_out_path))
+
+ # Read labels
+ labels = config.labels
+
+ # Plot
+ fig, ax = plt.subplots(1, 1, figsize=(15, 8))
+ lines = []
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ lines = []
+
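+    # Each value of 'select' below produces one comparison figure: every branch
+    # loads (bal_map, test_map) mAP curves for a set of models and overlays them,
+    # with faint lines for the balanced subset and solid lines for the evaluation
+    # set.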
+ if select == '1_cnn13':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_no_dropout', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_no_dropout', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_no_specaug', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_no_mixup', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_mixup_in_wave', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_mixup_in_wave', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_pooling':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_gwrp', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_gmpgapgwrp', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_att', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_gmpgapatt', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_resnet':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='ResNet18', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='resnet34', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='resnet50', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_densenet':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'DenseNet121', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='densenet121', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'DenseNet201', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='densenet201', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_cnn9':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn5', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn9', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_hop':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_hop500', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_hop640', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_hop1000', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_emb':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_emb32', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_emb128', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13_emb512', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_mobilenet':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='mobilenetv1', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='mobilenetv2', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_waveform':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_LeeNet', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_LeeNet18', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_DaiNet', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_ResNet50', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_waveform_cnn2d':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_decision_level':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_DecisionLevelMax', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_DecisionLevelAvg', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_DecisionLevelAtt', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_transformer':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_Transformer1', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer3', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_Transformer3', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer6', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_Transformer6', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_aug':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_bal_train_aug':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_sr':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_16k', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_8k', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_time_domain':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_time_domain', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_partial_full':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.8', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,partial_0.5', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_window':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 2048,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_win2048', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_melbins':
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_mel32', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_mel128', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '1_alternate':
+ max_plot_iteration = 2000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'alternate', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14_alternate', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '2_all':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='MobileNetV1', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='ResNet34', color='grey', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='m', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='orange', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '2_emb':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='Cnn13_emb32', color='r', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_emb128', color='k', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+        line, = ax.plot(test_map, label='Cnn13_emb512', color='g', alpha=test_alpha)
+ lines.append(line)
+
+ elif select == '2_aug':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='c', alpha=bal_alpha)
+ line, = ax.plot(test_map, label='cnn14,none,none', color='c', alpha=test_alpha)
+ lines.append(line)
+
+ ax.set_ylim(0, 1.)
+ ax.set_xlim(0, len(iterations))
+ ax.xaxis.set_ticks(np.arange(0, len(iterations), 25))
+ ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000))
+ ax.yaxis.set_ticks(np.arange(0, 1.01, 0.05))
+ ax.yaxis.set_ticklabels(np.around(np.arange(0, 1.01, 0.05), decimals=2))
+ ax.grid(color='b', linestyle='solid', linewidth=0.3)
+ plt.legend(handles=lines, loc=2)
+ # box = ax.get_position()
+ # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0))
+
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def plot_for_paper(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+ select = args.select
+
+ classes_num = config.classes_num
+ max_plot_iteration = 1000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ class_labels_indices_path = os.path.join(dataset_dir, 'metadata',
+ 'class_labels_indices.csv')
+
+ save_out_path = 'results/paper_{}.pdf'.format(select)
+ create_folder(os.path.dirname(save_out_path))
+
+ # Read labels
+ labels = config.labels
+
+ # Plot
+ fig, ax = plt.subplots(1, 1, figsize=(6, 4))
+ lines = []
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
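+        # Load the mAP curves (on the balanced training subset and on the eval
+        # set) logged during training for one model / training configuration.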
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ lines = []
+ linewidth = 1.
+
+ max_plot_iteration = 540000
+
+ if select == '2_all':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_emb':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='g', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='Cnn13_512', color='g', alpha=test_alpha)
+ # lines.append(line)
+
+ elif select == '2_bal':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_sr':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_partial':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+        line, = ax.plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ elif select == '2_melbins':
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+        line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax.plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax.set_ylim(0, 0.8)
+ ax.set_xlim(0, len(iterations))
+ ax.set_xlabel('Iterations')
+ ax.set_ylabel('mAP')
+ ax.xaxis.set_ticks(np.arange(0, len(iterations), 50))
+ # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000))
+ ax.xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
+ ax.yaxis.set_ticks(np.arange(0, 0.81, 0.05))
+ ax.yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
+ # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2))
+ ax.yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ ax.xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ plt.legend(handles=lines, loc=2)
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+ # box = ax.get_position()
+ # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0))
+
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def plot_for_paper2(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+
+ classes_num = config.classes_num
+ max_plot_iteration = 1000000
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ class_labels_indices_path = os.path.join(dataset_dir, 'metadata',
+ 'class_labels_indices.csv')
+
+ save_out_path = 'results/paper2.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Read labels
+ labels = config.labels
+
+ # Plot
+ fig, ax = plt.subplots(2, 3, figsize=(14, 7))
+ lines = []
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size):
+ workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer'
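+        # NOTE: unlike _load_metrics above, this variant reads statistics from a
+        # hard-coded legacy workspace rather than from the --workspace argument.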
+ statistics_path = os.path.join(workspace0, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
+ bal_map = np.mean(bal_map, axis=-1)
+ test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
+ test_map = np.mean(test_map, axis=-1)
+ legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size)
+
+ # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend}
+ return bal_map, test_map, legend
+
+ bal_alpha = 0.3
+ test_alpha = 1.0
+ lines = []
+ linewidth = 1.
+
+ max_plot_iteration = 540000
+
+ if True:
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='b', alpha=bal_alpha)
+ # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax[0, 0].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax[0, 0].plot(test_map, label='ResNet38', color='k', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32)
+ # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ # lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 0].legend(handles=lines, loc=2)
+ ax[0, 0].set_title('(a) Comparison of architectures')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+ line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+ line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 1].legend(handles=lines, loc=2, fontsize=8)
+
+ ax[0, 1].set_title('(b) Comparison of training data and augmentation')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[0, 2].legend(handles=lines, loc=2)
+ ax[0, 2].set_title('(c) Comparison of embedding size')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+        line, = ax[1, 0].plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 0].legend(handles=lines, loc=2)
+ ax[1, 0].set_title('(d) Comparison of amount of training data')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 1].legend(handles=lines, loc=2)
+ ax[1, 1].set_title('(e) Comparison of sampling rate')
+
+ if True:
+ lines = []
+ iterations = np.arange(0, max_plot_iteration, 2000)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
+ line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha)
+ line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024,
+ 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+ line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha)
+ line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
+ lines.append(line)
+
+ ax[1, 2].legend(handles=lines, loc=2)
+ ax[1, 2].set_title('(f) Comparison of mel bins number')
+
+ for i in range(2):
+ for j in range(3):
+ ax[i, j].set_ylim(0, 0.8)
+ ax[i, j].set_xlim(0, len(iterations))
+ ax[i, j].set_xlabel('Iterations')
+ ax[i, j].set_ylabel('mAP')
+ ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50))
+ # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000))
+ ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
+ ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05))
+ ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
+ # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2))
+ ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+ ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+    plt.tight_layout(pad=0, h_pad=1, w_pad=0)
+ # box = ax.get_position()
+ # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0))
+
+ plt.savefig(save_out_path)
+ print('Save figure to {}'.format(save_out_path))
+
+
+def table_values(args):
+
+ # Arguments & parameters
+ dataset_dir = args.dataset_dir
+ workspace = args.workspace
+ select = args.select
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
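+        # Evaluation statistics are logged every 2000 iterations, so the index of
+        # the requested checkpoint is iteration // 2000.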
+ idx = iteration // 2000
+ mAP = np.mean(statistics_dict['test'][idx]['average_precision'])
+ mAUC = np.mean(statistics_dict['test'][idx]['auc'])
+ dprime = d_prime(mAUC)
+
+ print('mAP: {:.3f}'.format(mAP))
+ print('mAUC: {:.3f}'.format(mAUC))
+ print('dprime: {:.3f}'.format(dprime))
+
+
+ if select == 'cnn13':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn5':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn9':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_decisionlevelmax':
+ iteration = 400000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_decisionlevelavg':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_decisionlevelatt':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_emb32':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_emb128':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_emb512':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_hop500':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_hop640':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'cnn13_hop1000':
+ iteration = 540000
+ _load_metrics('main', 32000, 1024,
+ 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'mobilenetv1':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'mobilenetv2':
+ iteration = 560000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet18':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet34':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet50':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'dainet':
+ iteration = 600000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'leenet':
+ iteration = 540000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'leenet18':
+ iteration = 440000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet34_1d':
+ iteration = 500000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'resnet50_1d':
+ iteration = 500000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'waveform_cnn2d':
+ iteration = 660000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ elif select == 'waveform_spandwav':
+ iteration = 700000
+ _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+
+def crop_label(label):
+ max_len = 16
+ if len(label) <= max_len:
+ return label
+ else:
+ words = label.split(' ')
+ cropped_label = ''
+ for w in words:
+ if len(cropped_label + ' ' + w) > max_len:
+ break
+ else:
+ cropped_label += ' {}'.format(w)
+        return cropped_label.lstrip()
+
+def add_comma(integer):
+    # Format an integer with a thousands separator, e.g. 12345 -> '12,345'.
+    return '{:,}'.format(int(integer))
+
+
+def plot_class_iteration(args):
+
+ # Arguments & parameters
+ workspace = args.workspace
+ select = args.select
+
+ save_out_path = 'results_map/class_iteration_map.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration):
+ statistics_path = os.path.join(workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+ return statistics_dict
+
+ iteration = 600000
+ statistics_dict = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']])
+ mAP_mat = mAP_mat[0 : 300, :]
+ sorted_indexes = np.argsort(config.full_samples_per_class)[::-1]
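+    # Keep the first 300 evaluation points (~600k iterations); classes are
+    # ordered by number of training samples, descending.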
+
+
+ fig, axs = plt.subplots(1, 3, figsize=(20, 5))
+ ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)]
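+    # The three panels show the 10 most frequent, 10 mid-frequency and 10 least
+    # frequent classes, respectively.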
+ axs[0].set_ylabel('AP')
+
+ for col in range(0, 3):
+ axs[col].set_ylim(0, 1.)
+ axs[col].set_xlim(0, 301)
+ axs[col].set_xlabel('Iterations')
+ axs[col].set_ylabel('AP')
+ axs[col].xaxis.set_ticks(np.arange(0, 301, 100))
+ axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k'])
+ lines = []
+ for _ix in ranges[col]:
+ _label = crop_label(config.labels[sorted_indexes[_ix]]) + \
+ ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]]))
+ line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label)
+ lines.append(line)
+ box = axs[col].get_position()
+ axs[col].set_position([box.x0, box.y0, box.width * 1., box.height])
+ axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.))
+ axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
+
+ plt.tight_layout(pad=4, w_pad=1, h_pad=1)
+ plt.savefig(save_out_path)
+ print(save_out_path)
+
+
+def _load_old_metrics(workspace, filename, iteration, data_type):
+
+ assert data_type in ['train', 'test']
+
+ stat_name = "stat_{}_iters.p".format(iteration)
+
+ # Load stats
+ stat_path = os.path.join(workspace, "stats", filename, data_type, stat_name)
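+    # Older statistics were pickled under Python 2; fall back to latin1 decoding
+    # when loading them under Python 3.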
+ try:
+ stats = cPickle.load(open(stat_path, 'rb'))
+ except:
+ stats = cPickle.load(open(stat_path, 'rb'), encoding='latin1')
+
+ precisions = [stat['precisions'] for stat in stats]
+ recalls = [stat['recalls'] for stat in stats]
+ maps = np.array([stat['AP'] for stat in stats])
+ aucs = np.array([stat['auc'] for stat in stats])
+
+ return {'average_precision': maps, 'AUC': aucs}
+
+def _sort(ys):
+ sorted_idxes = np.argsort(ys)
+ sorted_idxes = sorted_idxes[::-1]
+ sorted_ys = ys[sorted_idxes]
+ sorted_lbs = [config.labels[e] for e in sorted_idxes]
+ return sorted_ys, sorted_idxes, sorted_lbs
+
+def load_data(hdf5_path):
+ with h5py.File(hdf5_path, 'r') as hf:
+ x = hf['x'][:]
+ y = hf['y'][:]
+ video_id_list = list(hf['video_id_list'][:])
+ return x, y, video_id_list
+
+def get_avg_stats(workspace, bgn_iter, fin_iter, interval_iter, filename, data_type):
+
+ assert data_type in ['train', 'test']
+ bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5"
+ eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5"
+ unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5"
+
+ t1 = time.time()
+ if data_type == 'test':
+ (te_x, te_y, te_id_list) = load_data(eval_hdf5)
+ elif data_type == 'train':
+ (te_x, te_y, te_id_list) = load_data(bal_train_hdf5)
+ y = te_y
+
+ prob_dir = os.path.join(workspace, "probs", filename, data_type)
+ names = os.listdir(prob_dir)
+
+ probs = []
+ iters = range(bgn_iter, fin_iter, interval_iter)
+ for iter in iters:
+ pickle_path = os.path.join(prob_dir, "prob_%d_iters.p" % iter)
+ try:
+ prob = cPickle.load(open(pickle_path, 'rb'))
+ except:
+ prob = cPickle.load(open(pickle_path, 'rb'), encoding='latin1')
+ probs.append(prob)
+
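+    # Ensemble by averaging the predicted probabilities over the selected
+    # checkpoints, then compute class-wise metrics on the averaged predictions.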
+ avg_prob = np.mean(np.array(probs), axis=0)
+
+ n_out = y.shape[1]
+ stats = []
+ for k in range(n_out): # around 7 seconds
+ (precisions, recalls, thresholds) = metrics.precision_recall_curve(y[:, k], avg_prob[:, k])
+ avg_precision = metrics.average_precision_score(y[:, k], avg_prob[:, k], average=None)
+ (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], avg_prob[:, k])
+ auc = metrics.roc_auc_score(y[:, k], avg_prob[:, k], average=None)
+ # eer = pp_data.eer(avg_prob[:, k], y[:, k])
+
+ skip = 1000
+ dict = {'precisions': precisions[0::skip], 'recalls': recalls[0::skip], 'AP': avg_precision,
+ 'fpr': fpr[0::skip], 'fnr': 1. - tpr[0::skip], 'auc': auc}
+
+ stats.append(dict)
+
+ mAPs = np.array([e['AP'] for e in stats])
+ aucs = np.array([e['auc'] for e in stats])
+
+ print("Get avg time: {}".format(time.time() - t1))
+
+ return {'average_precision': mAPs, 'auc': aucs}
+
+
+def _samples_num_per_class():
+ bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5"
+ eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5"
+ unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5"
+
+ (x, y, id_list) = load_data(eval_hdf5)
+ eval_num = np.sum(y, axis=0)
+
+ (x, y, id_list) = load_data(bal_train_hdf5)
+ bal_num = np.sum(y, axis=0)
+
+ (x, y, id_list) = load_data(unbal_train_hdf5)
+ unbal_num = np.sum(y, axis=0)
+
+ return bal_num, unbal_num, eval_num
+
+
+def get_label_quality():
+
+ rate_csv = '/vol/vssp/msos/qk/workspaces/pub_audioset_tagging_cnn_transfer/metadata/qa_true_counts.csv'
+
+ with open(rate_csv, 'r') as f:
+ reader = csv.reader(f, delimiter=',')
+ lis = list(reader)
+
+ rates = []
+
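+    # Each row is assumed to hold (label, num_rated_clips, num_true_ratings);
+    # label quality is the fraction of ratings marked true, or None if unrated.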
+ for n in range(1, len(lis)):
+ li = lis[n]
+ if float(li[1]) == 0:
+ rate = None
+ else:
+ rate = float(li[2]) / float(li[1])
+ rates.append(rate)
+
+ return rates
+
+
+def summary_stats(args):
+ # Arguments & parameters
+ workspace = args.workspace
+
+ out_stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl')
+ create_folder(os.path.dirname(out_stat_path))
+
+ # Old workspace
+ old_workspace = '/vol/vssp/msos/qk/workspaces/audioset_classification'
+
+ # bal_train_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'train')
+ # eval_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'test')
+
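+    # Baseline: average the predicted probabilities of 9 checkpoints (iterations
+    # 10k to 50k, every 5k) from the previous instance-averaging system.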
+ bal_train_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, interval_iter=5000, filename='tmp127_re', data_type='train')
+ eval_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, interval_iter=5000, filename='tmp127_re', data_type='test')
+
+ maps0te = eval_metrics['average_precision']
+ (maps0te, sorted_idxes, sorted_lbs) = _sort(maps0te)
+
+ bal_num, unbal_num, eval_num = _samples_num_per_class()
+
+ output_dict = {
+ 'labels': config.labels,
+ 'label_quality': get_label_quality(),
+ 'sorted_indexes_for_plot': sorted_idxes,
+ 'official_balanced_trainig_samples': bal_num,
+ 'official_unbalanced_training_samples': unbal_num,
+ 'official_eval_samples': eval_num,
+ 'downloaded_full_training_samples': config.full_samples_per_class,
+ 'averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations':
+ {'bal_train': bal_train_metrics, 'eval': eval_metrics}
+ }
+
+ def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin,
+ fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration):
+ _workspace = '/vol/vssp/msos/qk/bytedance/workspaces_important/pub_audioset_tagging_cnn_transfer'
+ statistics_path = os.path.join(_workspace, 'statistics', filename,
+ 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
+ 'data_type={}'.format(data_type), model_type,
+ 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
+ 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
+ 'statistics.pkl')
+
+ statistics_dict = cPickle.load(open(statistics_path, 'rb'))
+
+ _idx = iteration // 2000
+ _dict = {'bal_train': {'average_precision': statistics_dict['bal'][_idx]['average_precision'],
+ 'auc': statistics_dict['bal'][_idx]['auc']},
+ 'eval': {'average_precision': statistics_dict['test'][_idx]['average_precision'],
+ 'auc': statistics_dict['test'][_idx]['auc']}}
+ return _dict
+
+ iteration = 600000
+ output_dict['cnn13_system_iteration60k'] = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ iteration = 560000
+ output_dict['mobilenetv1_system_iteration56k'] = _load_metrics('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration)
+
+ cPickle.dump(output_dict, open(out_stat_path, 'wb'))
+ print('Write stats for paper to {}'.format(out_stat_path))
+
+
+def prepare_plot_long_4_rows(sorted_lbs):
+ N = len(sorted_lbs)
+
+ f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1,sharey=False, facecolor='w', figsize=(10, 12))
+
+ fontsize = 5
+
+ K = 132
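+    # Split the sound classes across the four rows, roughly 132 classes per row.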
+ ax1a.set_xlim(0, K)
+ ax2a.set_xlim(K, 2 * K)
+ ax3a.set_xlim(2 * K, 3 * K)
+ ax4a.set_xlim(3 * K, N)
+
+ truncated_sorted_lbs = []
+ for lb in sorted_lbs:
+ lb = lb[0 : 25]
+ words = lb.split(' ')
+ if len(words[-1]) < 3:
+ lb = ' '.join(words[0:-1])
+ truncated_sorted_lbs.append(lb)
+
+ ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+ ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
+
+ ax1a.set_yscale('log')
+ ax2a.set_yscale('log')
+ ax3a.set_yscale('log')
+ ax4a.set_yscale('log')
+
+ ax1b = ax1a.twinx()
+ ax2b = ax2a.twinx()
+ ax3b = ax3a.twinx()
+ ax4b = ax4a.twinx()
+ ax1b.set_ylim(0., 1.)
+ ax2b.set_ylim(0., 1.)
+ ax3b.set_ylim(0., 1.)
+ ax4b.set_ylim(0., 1.)
+ ax1b.set_ylabel('Average precision')
+ ax2b.set_ylabel('Average precision')
+ ax3b.set_ylabel('Average precision')
+ ax4b.set_ylabel('Average precision')
+
+ ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+ ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
+
+ ax1a.xaxis.set_ticks(np.arange(K))
+ ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize)
+ ax1a.xaxis.tick_bottom()
+ ax1a.set_ylabel("Number of audio clips")
+
+ ax2a.xaxis.set_ticks(np.arange(K, 2*K))
+ ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize)
+ ax2a.xaxis.tick_bottom()
+ # ax2a.tick_params(left='off', which='both')
+ ax2a.set_ylabel("Number of audio clips")
+
+ ax3a.xaxis.set_ticks(np.arange(2*K, 3*K))
+ ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize)
+ ax3a.xaxis.tick_bottom()
+ ax3a.set_ylabel("Number of audio clips")
+
+ ax4a.xaxis.set_ticks(np.arange(3*K, N))
+ ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize)
+ ax4a.xaxis.tick_bottom()
+ # ax4a.tick_params(left='off', which='both')
+ ax4a.set_ylabel("Number of audio clips")
+
+ ax1a.spines['right'].set_visible(False)
+ ax1b.spines['right'].set_visible(False)
+ ax2a.spines['left'].set_visible(False)
+ ax2b.spines['left'].set_visible(False)
+ ax2a.spines['right'].set_visible(False)
+ ax2b.spines['right'].set_visible(False)
+ ax3a.spines['left'].set_visible(False)
+ ax3b.spines['left'].set_visible(False)
+ ax3a.spines['right'].set_visible(False)
+ ax3b.spines['right'].set_visible(False)
+ ax4a.spines['left'].set_visible(False)
+ ax4b.spines['left'].set_visible(False)
+
+ plt.subplots_adjust(hspace = 0.8)
+
+ return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b
+
+def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
+ N = len(x)
+ ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+ ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
+
+def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""):
+ N = len(x)
+ ax.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha)
+ line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label)
+ return line
+
+def plot_long_fig(args):
+ # Arguments & parameters
+ workspace = args.workspace
+
+ # Paths
+ stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl')
+ save_out_path = 'results/long_fig.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+ # Stats
+ stats = cPickle.load(open(stat_path, 'rb'))
+
+ N = len(config.labels)
+ sorted_indexes = stats['sorted_indexes_for_plot']
+ sorted_labels = np.array(config.labels)[sorted_indexes]
+ audio_clips_per_class = stats['official_balanced_trainig_samples'] + stats['official_unbalanced_training_samples']
+ audio_clips_per_class = audio_clips_per_class[sorted_indexes]
+
+ (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
+
+    # Plot the per-class number of audio clips on all four row axes
+ ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+ ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+
+ maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
+ maps_avg_instances = maps_avg_instances[sorted_indexes]
+
+ maps_cnn13 = stats['cnn13_system_iteration60k']['eval']['average_precision']
+ maps_cnn13 = maps_cnn13[sorted_indexes]
+
+ maps_mobilenetv1 = stats['mobilenetv1_system_iteration56k']['eval']['average_precision']
+ maps_mobilenetv1 = maps_mobilenetv1[sorted_indexes]
+
+ maps_logmel_wavegram_cnn = _load_metrics0_classwise('main', 32000, 1024,
+ 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32)
+ maps_logmel_wavegram_cnn = maps_logmel_wavegram_cnn[sorted_indexes]
+
+ _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k')
+ _scatter_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, s=5, c='r')
+ _scatter_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b')
+ _scatter_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, s=5, c='g')
+
+ linewidth = 0.7
+ line0te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k', linewidth=linewidth, label='AP with averaging instances (baseline)')
+ line1te = _plot_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, c='r', linewidth=linewidth, label='AP with CNN14')
+ line2te = _plot_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b', linewidth=linewidth, label='AP with MobileNetV1')
+ line3te = _plot_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN')
+
+ label_quality = stats['label_quality']
+ sorted_rate = np.array(label_quality)[sorted_indexes]
+ for k in range(len(sorted_rate)):
+ if sorted_rate[k] and sorted_rate[k] == 1:
+ sorted_rate[k] = 0.99
+
+ ax1b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax2b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+')
+ ax3b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+')
+ line_label_quality = ax4b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality')
+ ax1b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax2b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax3b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+ ax4b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_')
+
+ plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1)
+
+ plt.savefig(save_out_path)
+ print('Save fig to {}'.format(save_out_path))
+
+def plot_flops(args):
+
+ # Arguments & parameters
+ workspace = args.workspace
+
+ # Paths
+ save_out_path = 'results_map/flops.pdf'
+ create_folder(os.path.dirname(save_out_path))
+
+    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
+
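+    # Hard-coded multi-adds and mAP values for each model, used for the scatter
+    # plot and its annotations below.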
+ model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
+ 'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
+ 'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
+ flops = np.array([21.986, 21.986, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
+ 30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
+ mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
+ 0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
+
+    ax.scatter(flops, mAPs)
+
+ shift = [[1, 0.002], [1, -0.006], [-1, -0.014], [-2, 0.006], [-7, 0.006],
+ [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
+ [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
+
+ for i, model_type in enumerate(model_types):
+ ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
+
+ ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
+ ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
+ ax.plot(flops[[6, 7]], mAPs[[6, 7]])
+ ax.plot(flops[[9, 10]], mAPs[[9, 10]])
+ ax.plot(flops[[11, 12]], mAPs[[11, 12]])
+ ax.plot(flops[[13, 14]], mAPs[[13, 14]])
+
+ ax.set_xlim(0, 70)
+ ax.set_ylim(0.2, 0.5)
+ ax.set_xlabel('Multi-adds (million)')
+ ax.set_ylabel('mAP')
+
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+
+ plt.savefig(save_out_path)
+ print('Write out figure to {}'.format(save_out_path))
+
+
+def spearman(args):
+
+ # Arguments & parameters
+ workspace = args.workspace
+
+ # Paths
+ stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl')
+
+ # Stats
+ stats = cPickle.load(open(stat_path, 'rb'))
+
+ label_quality = np.array([qu if qu else 0.5 for qu in stats['label_quality']])
+ training_samples = np.array(stats['official_balanced_trainig_samples']) + \
+ np.array(stats['official_unbalanced_training_samples'])
+ mAP = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
+
+ import scipy
+ samples_spearman = scipy.stats.spearmanr(training_samples, mAP)[0]
+ quality_spearman = scipy.stats.spearmanr(label_quality, mAP)[0]
+
+ print('Training samples spearman: {:.3f}'.format(samples_spearman))
+ print('Quality spearman: {:.3f}'.format(quality_spearman))
+
+
+def print_results(args):
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ #
+ (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32)
+
+ # partial
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)
+
+ # Sample rate
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32)
+
+ # Mel bins
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32)
+
+ (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32)
+
+
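+# Example invocations (script name and paths are illustrative):
+#   python3 plot_statistics.py plot_for_paper --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --select=2_all
+#   python3 plot_statistics.py summary_stats --workspace=$WORKSPACE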
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='')
+ subparsers = parser.add_subparsers(dest='mode')
+
+ parser_plot = subparsers.add_parser('plot')
+ parser_plot.add_argument('--dataset_dir', type=str, required=True)
+ parser_plot.add_argument('--workspace', type=str, required=True)
+ parser_plot.add_argument('--select', type=str, required=True)
+
+ parser_plot = subparsers.add_parser('plot_for_paper')
+ parser_plot.add_argument('--dataset_dir', type=str, required=True)
+ parser_plot.add_argument('--workspace', type=str, required=True)
+ parser_plot.add_argument('--select', type=str, required=True)
+
+ parser_plot = subparsers.add_parser('plot_for_paper2')
+ parser_plot.add_argument('--dataset_dir', type=str, required=True)
+    parser_plot.add_argument('--workspace', type=str, required=True)
+
+    parser_values = subparsers.add_parser('table_values')
+    parser_values.add_argument('--dataset_dir', type=str, required=True)
+    parser_values.add_argument('--workspace', type=str, required=True)
+    parser_values.add_argument('--select', type=str, required=True)
+
+ parser_values = subparsers.add_parser('plot_class_iteration')
+ parser_values.add_argument('--workspace', type=str, required=True)
+ parser_values.add_argument('--select', type=str, required=True)
+
+ parser_summary_stats = subparsers.add_parser('summary_stats')
+ parser_summary_stats.add_argument('--workspace', type=str, required=True)
+
+ parser_plot_long = subparsers.add_parser('plot_long_fig')
+ parser_plot_long.add_argument('--workspace', type=str, required=True)
+
+ parser_plot_flops = subparsers.add_parser('plot_flops')
+ parser_plot_flops.add_argument('--workspace', type=str, required=True)
+
+ parser_spearman = subparsers.add_parser('spearman')
+ parser_spearman.add_argument('--workspace', type=str, required=True)
+
+ parser_print = subparsers.add_parser('print')
+ parser_print.add_argument('--workspace', type=str, required=True)
+
+ args = parser.parse_args()
+
+ if args.mode == 'plot':
+ plot(args)
+
+ elif args.mode == 'plot_for_paper':
+ plot_for_paper(args)
+
+ elif args.mode == 'plot_for_paper2':
+ plot_for_paper2(args)
+
+ elif args.mode == 'table_values':
+ table_values(args)
+
+ elif args.mode == 'plot_class_iteration':
+ plot_class_iteration(args)
+
+ elif args.mode == 'summary_stats':
+ summary_stats(args)
+
+ elif args.mode == 'plot_long_fig':
+ plot_long_fig(args)
+
+ elif args.mode == 'plot_flops':
+ plot_flops(args)
+
+ elif args.mode == 'spearman':
+ spearman(args)
+
+ elif args.mode == 'print':
+ print_results(args)
+
+ else:
+        raise Exception('Incorrect argument: {}'.format(args.mode))
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/utilities.py b/audio_detection/audio_infer/utils/utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d1604579b88e7e1e79f6350376f89d9c1c85f44
--- /dev/null
+++ b/audio_detection/audio_infer/utils/utilities.py
@@ -0,0 +1,172 @@
+import os
+import logging
+import h5py
+import soundfile
+import librosa
+import numpy as np
+import pandas as pd
+from scipy import stats
+import datetime
+import pickle
+
+
+def create_folder(fd):
+ if not os.path.exists(fd):
+ os.makedirs(fd)
+
+
+def get_filename(path):
+ path = os.path.realpath(path)
+ na_ext = path.split('/')[-1]
+ na = os.path.splitext(na_ext)[0]
+ return na
+
+
+def get_sub_filepaths(folder):
+ paths = []
+ for root, dirs, files in os.walk(folder):
+ for name in files:
+ path = os.path.join(root, name)
+ paths.append(path)
+ return paths
+
+
+def create_logging(log_dir, filemode):
+ create_folder(log_dir)
+ i1 = 0
+
+ while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
+ i1 += 1
+
+ log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
+ logging.basicConfig(
+ level=logging.DEBUG,
+ format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
+ datefmt='%a, %d %b %Y %H:%M:%S',
+ filename=log_path,
+ filemode=filemode)
+
+ # Print to console
+ console = logging.StreamHandler()
+ console.setLevel(logging.INFO)
+ formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
+ console.setFormatter(formatter)
+ logging.getLogger('').addHandler(console)
+
+ return logging
+
+
+def read_metadata(csv_path, classes_num, id_to_ix):
+ """Read metadata of AudioSet from a csv file.
+
+ Args:
+ csv_path: str
+
+ Returns:
+ meta_dict: {'audio_name': (audios_num,), 'target': (audios_num, classes_num)}
+ """
+
+ with open(csv_path, 'r') as fr:
+ lines = fr.readlines()
+ lines = lines[3:] # Remove heads
+
+ audios_num = len(lines)
+    targets = np.zeros((audios_num, classes_num), dtype=bool)  # np.bool was removed from NumPy
+ audio_names = []
+
+ for n, line in enumerate(lines):
+ items = line.split(', ')
+ """items: ['--4gqARaEJE', '0.000', '10.000', '"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"\n']"""
+
+        audio_name = 'Y{}.wav'.format(items[0])  # downloaded audio files are prefixed with an extra 'Y'
+ label_ids = items[3].split('"')[1].split(',')
+
+ audio_names.append(audio_name)
+
+ # Target
+ for id in label_ids:
+ ix = id_to_ix[id]
+ targets[n, ix] = 1
+
+ meta_dict = {'audio_name': np.array(audio_names), 'target': targets}
+ return meta_dict
+
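+# Illustrative call (hypothetical paths; id_to_ix maps AudioSet label ids to indices):
+#   id_to_ix = {label_id: ix for ix, label_id in enumerate(all_label_ids)}
+#   meta = read_metadata('balanced_train_segments.csv', classes_num=527, id_to_ix=id_to_ix)
+#   meta['target'].shape   # (audios_num, 527) boolean multi-hot matrix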
+
+def float32_to_int16(x):
+ assert np.max(np.abs(x)) <= 1.2
+ x = np.clip(x, -1, 1)
+ return (x * 32767.).astype(np.int16)
+
+def int16_to_float32(x):
+ return (x / 32767.).astype(np.float32)
+
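+# Illustrative round trip (assumes the waveform is already normalised to [-1, 1]):
+#   int16_to_float32(float32_to_int16(x)) recovers x up to ~1/32767 quantisation error.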
+
+def pad_or_truncate(x, audio_length):
+ """Pad all audio to specific length."""
+ if len(x) <= audio_length:
+ return np.concatenate((x, np.zeros(audio_length - len(x))), axis=0)
+ else:
+ return x[0 : audio_length]
+
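+# Illustrative behaviour: pad_or_truncate always returns exactly `audio_length` samples, e.g.
+#   pad_or_truncate(np.ones(3), 5) -> array([1., 1., 1., 0., 0.])
+#   pad_or_truncate(np.ones(8), 5) -> array([1., 1., 1., 1., 1.])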
+
+def d_prime(auc):
+ d_prime = stats.norm().ppf(auc) * np.sqrt(2.0)
+ return d_prime
+
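+# Quick sanity check: d' = sqrt(2) * Phi^{-1}(AUC), so d_prime(0.5) == 0.0 and
+# d_prime(0.9) is roughly 1.81.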
+
+class Mixup(object):
+ def __init__(self, mixup_alpha, random_seed=1234):
+ """Mixup coefficient generator.
+ """
+ self.mixup_alpha = mixup_alpha
+ self.random_state = np.random.RandomState(random_seed)
+
+ def get_lambda(self, batch_size):
+ """Get mixup random coefficients.
+ Args:
+ batch_size: int
+ Returns:
+ mixup_lambdas: (batch_size,)
+ """
+ mixup_lambdas = []
+ for n in range(0, batch_size, 2):
+ lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
+ mixup_lambdas.append(lam)
+ mixup_lambdas.append(1. - lam)
+
+ return np.array(mixup_lambdas)
+
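+# Minimal usage sketch (illustrative only; assumes a NumPy batch whose consecutive
+# rows form the pairs to be mixed):
+#   mixup = Mixup(mixup_alpha=1.0)
+#   lam = mixup.get_lambda(batch_size=32)   # shape (32,), each consecutive pair sums to 1
+#   mixed = lam[0::2, None] * x[0::2] + lam[1::2, None] * x[1::2]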
+
+class StatisticsContainer(object):
+ def __init__(self, statistics_path):
+ """Contain statistics of different training iterations.
+ """
+ self.statistics_path = statistics_path
+
+ self.backup_statistics_path = '{}_{}.pkl'.format(
+ os.path.splitext(self.statistics_path)[0],
+ datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+
+ self.statistics_dict = {'bal': [], 'test': []}
+
+ def append(self, iteration, statistics, data_type):
+ statistics['iteration'] = iteration
+ self.statistics_dict[data_type].append(statistics)
+
+ def dump(self):
+ pickle.dump(self.statistics_dict, open(self.statistics_path, 'wb'))
+ pickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb'))
+ logging.info(' Dump statistics to {}'.format(self.statistics_path))
+ logging.info(' Dump statistics to {}'.format(self.backup_statistics_path))
+
+ def load_state_dict(self, resume_iteration):
+ self.statistics_dict = pickle.load(open(self.statistics_path, 'rb'))
+
+ resume_statistics_dict = {'bal': [], 'test': []}
+
+ for key in self.statistics_dict.keys():
+ for statistics in self.statistics_dict[key]:
+ if statistics['iteration'] <= resume_iteration:
+ resume_statistics_dict[key].append(statistics)
+
+ self.statistics_dict = resume_statistics_dict
\ No newline at end of file
diff --git a/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc b/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0982846c06b554669d8f290a24eb2fdb172893a
Binary files /dev/null and b/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc differ
diff --git a/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc b/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c90bde06c05575070a8743337c5b2bc4e139be3b
Binary files /dev/null and b/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc differ
diff --git a/audio_detection/target_sound_detection/src/models.py b/audio_detection/target_sound_detection/src/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..3016b9274aeb86091d30d980803c7106f15ddd54
--- /dev/null
+++ b/audio_detection/target_sound_detection/src/models.py
@@ -0,0 +1,1288 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2021/3/9 16:33
+# @Author : dongchao yang
+# @File : train.py
+from itertools import zip_longest
+import numpy as np
+from scipy import ndimage
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import time
+from torchlibrosa.augmentation import SpecAugmentation
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+import math
+from sklearn.cluster import KMeans
+import os
+import time
+from functools import partial
+# import timm
+# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+import warnings
+from functools import partial
+# from timm.models.registry import register_model
+# from timm.models.vision_transformer import _cfg
+# from mmdet.utils import get_root_logger
+# from mmcv.runner import load_checkpoint
+# from mmcv.runner import _load_checkpoint, load_state_dict
+# import mmcv.runner
+import copy
+from collections import OrderedDict
+import io
+import re
+DEBUG=0
+event_labels = ['Alarm', 'Alarm_clock', 'Animal', 'Applause', 'Arrow', 'Artillery_fire',
+ 'Babbling', 'Baby_laughter', 'Bark', 'Basketball_bounce', 'Battle_cry',
+ 'Bell', 'Bird', 'Bleat', 'Bouncing', 'Breathing', 'Buzz', 'Camera',
+ 'Cap_gun', 'Car', 'Car_alarm', 'Cat', 'Caw', 'Cheering', 'Child_singing',
+ 'Choir', 'Chop', 'Chopping_(food)', 'Clapping', 'Clickety-clack', 'Clicking',
+ 'Clip-clop', 'Cluck', 'Coin_(dropping)', 'Computer_keyboard', 'Conversation',
+ 'Coo', 'Cough', 'Cowbell', 'Creak', 'Cricket', 'Croak', 'Crow', 'Crowd', 'DTMF',
+ 'Dog', 'Door', 'Drill', 'Drip', 'Engine', 'Engine_starting', 'Explosion', 'Fart',
+ 'Female_singing', 'Filing_(rasp)', 'Finger_snapping', 'Fire', 'Fire_alarm', 'Firecracker',
+ 'Fireworks', 'Frog', 'Gasp', 'Gears', 'Giggle', 'Glass', 'Glass_shatter', 'Gobble', 'Groan',
+ 'Growling', 'Hammer', 'Hands', 'Hiccup', 'Honk', 'Hoot', 'Howl', 'Human_sounds', 'Human_voice',
+ 'Insect', 'Laughter', 'Liquid', 'Machine_gun', 'Male_singing', 'Mechanisms', 'Meow', 'Moo',
+ 'Motorcycle', 'Mouse', 'Music', 'Oink', 'Owl', 'Pant', 'Pant_(dog)', 'Patter', 'Pig', 'Plop',
+ 'Pour', 'Power_tool', 'Purr', 'Quack', 'Radio', 'Rain_on_surface', 'Rapping', 'Rattle',
+ 'Reversing_beeps', 'Ringtone', 'Roar', 'Run', 'Rustle', 'Scissors', 'Scrape', 'Scratch',
+ 'Screaming', 'Sewing_machine', 'Shout', 'Shuffle', 'Shuffling_cards', 'Singing',
+ 'Single-lens_reflex_camera', 'Siren', 'Skateboard', 'Sniff', 'Snoring', 'Speech',
+ 'Speech_synthesizer', 'Spray', 'Squeak', 'Squeal', 'Steam', 'Stir', 'Surface_contact',
+ 'Tap', 'Tap_dance', 'Telephone_bell_ringing', 'Television', 'Tick', 'Tick-tock', 'Tools',
+ 'Train', 'Train_horn', 'Train_wheels_squealing', 'Truck', 'Turkey', 'Typewriter', 'Typing',
+ 'Vehicle', 'Video_game_sound', 'Water', 'Whimper_(dog)', 'Whip', 'Whispering', 'Whistle',
+ 'Whistling', 'Whoop', 'Wind', 'Writing', 'Yip', 'and_pans', 'bird_song', 'bleep', 'clink',
+ 'cock-a-doodle-doo', 'crinkling', 'dove', 'dribble', 'eructation', 'faucet', 'flapping_wings',
+ 'footsteps', 'gunfire', 'heartbeat', 'infant_cry', 'kid_speaking', 'man_speaking', 'mastication',
+ 'mice', 'river', 'rooster', 'silverware', 'skidding', 'smack', 'sobbing', 'speedboat', 'splatter',
+ 'surf', 'thud', 'thwack', 'toot', 'truck_horn', 'tweet', 'vroom', 'waterfowl', 'woman_speaking']
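+# Note (added for clarity): load_checkpoint below relies on mmcv's _load_checkpoint
+# and load_state_dict helpers, whose imports are commented out near the top of this
+# file; install mmcv and re-enable those imports before calling it.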
+def load_checkpoint(model,
+ filename,
+ map_location=None,
+ strict=False,
+ logger=None,
+ revise_keys=[(r'^module\.', '')]):
+ """Load checkpoint from a file or URI.
+ Args:
+ model (Module): Module to load checkpoint.
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str): Same as :func:`torch.load`.
+ strict (bool): Whether to allow different params for the model and
+ checkpoint.
+ logger (:mod:`logging.Logger` or None): The logger for error message.
+ revise_keys (list): A list of customized keywords to modify the
+ state_dict in checkpoint. Each item is a (pattern, replacement)
+ pair of the regular expression operations. Default: strip
+ the prefix 'module.' by [(r'^module\\.', '')].
+ Returns:
+ dict or OrderedDict: The loaded checkpoint.
+ """
+
+ checkpoint = _load_checkpoint(filename, map_location, logger)
+ '''
+ new_proj = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=2).unsqueeze(2).repeat(1,1,3,1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=3).unsqueeze(3).repeat(1,1,1,3))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ '''
+ new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
+ new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
+ checkpoint['patch_embed1.proj.weight'] = new_proj.weight
+ # OrderedDict is a subclass of dict
+ if not isinstance(checkpoint, dict):
+ raise RuntimeError(
+ f'No state_dict found in checkpoint file {filename}')
+ # get state_dict from checkpoint
+ if 'state_dict' in checkpoint:
+ state_dict = checkpoint['state_dict']
+ else:
+ state_dict = checkpoint
+
+ # strip prefix of state_dict
+ metadata = getattr(state_dict, '_metadata', OrderedDict())
+ for p, r in revise_keys:
+ state_dict = OrderedDict(
+ {re.sub(p, r, k): v
+ for k, v in state_dict.items()})
+ state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()})
+ # Keep metadata in state_dict
+ state_dict._metadata = metadata
+
+ # load state_dict
+ load_state_dict(model, state_dict, strict, logger)
+ return checkpoint
+
+def init_weights(m):
+ if isinstance(m, (nn.Conv2d, nn.Conv1d)):
+ nn.init.kaiming_normal_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ if isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+def init_layer(layer):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.xavier_uniform_(layer.weight)
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+
+def init_bn(bn):
+ """Initialize a Batchnorm layer. """
+ bn.bias.data.fill_(0.)
+ bn.weight.data.fill_(1.)
+
+class MaxPool(nn.Module):
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ return torch.max(decision, dim=self.pooldim)[0]
+
+
+class LinearSoftPool(nn.Module):
+ """LinearSoftPool
+ Linear softmax, takes logits and returns a probability, near to the actual maximum value.
+ Taken from the paper:
+ A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
+ https://arxiv.org/abs/1810.09050
+ """
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, time_decision):
+ return (time_decision**2).sum(self.pooldim) / (time_decision.sum(
+ self.pooldim)+1e-7)
+
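+# Illustrative note (added): for frame-wise probabilities p_t, LinearSoftPool returns
+# sum(p_t^2) / sum(p_t), a self-weighted mean that lies between the mean and the max,
+# e.g. p = [0.1, 0.9] pools to 0.82.
+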
+class ConvBlock(nn.Module):
+ def __init__(self, in_channels, out_channels):
+
+ super(ConvBlock, self).__init__()
+
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.conv2 = nn.Conv2d(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.bn1 = nn.BatchNorm2d(out_channels)
+ self.bn2 = nn.BatchNorm2d(out_channels)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.conv1)
+ init_layer(self.conv2)
+ init_bn(self.bn1)
+ init_bn(self.bn2)
+
+
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+
+ x = input
+ x = F.relu_(self.bn1(self.conv1(x)))
+ x = F.relu_(self.bn2(self.conv2(x)))
+ if pool_type == 'max':
+ x = F.max_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg':
+ x = F.avg_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg+max':
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
+ x = x1 + x2
+ else:
+ raise Exception('Incorrect argument!')
+
+ return x
+
+class ConvBlock_GLU(nn.Module):
+ def __init__(self, in_channels, out_channels,kernel_size=(3,3)):
+ super(ConvBlock_GLU, self).__init__()
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size, stride=(1, 1),
+ padding=(1, 1), bias=False)
+ self.bn1 = nn.BatchNorm2d(out_channels)
+ self.sigmoid = nn.Sigmoid()
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.conv1)
+ init_bn(self.bn1)
+
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+ x = input
+ x = self.bn1(self.conv1(x))
+ cnn1 = self.sigmoid(x[:, :x.shape[1]//2, :, :])
+ cnn2 = x[:,x.shape[1]//2:,:,:]
+ x = cnn1*cnn2
+ if pool_type == 'max':
+ x = F.max_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg':
+ x = F.avg_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg+max':
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
+ x = x1 + x2
+ elif pool_type == 'None':
+ pass
+ elif pool_type == 'LP':
+ pass
+ #nn.LPPool2d(4, pool_size)
+ else:
+ raise Exception('Incorrect argument!')
+ return x
+
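+# Note (added for clarity): ConvBlock_GLU is a gated linear unit -- the first half of
+# the conv output channels is passed through a sigmoid and gates the second half, so
+# out_channels must be even and the block's effective output width is out_channels // 2.
+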
+class Mul_scale_GLU(nn.Module):
+ def __init__(self):
+ super(Mul_scale_GLU,self).__init__()
+ self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) # 1*1
+ self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) # 3*3
+ self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) # 5*5
+ self.conv_block2 = ConvBlock_GLU(in_channels=96, out_channels=128*2)
+ # self.conv_block3 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock_GLU(in_channels=128, out_channels=128*2)
+ self.conv_block4 = ConvBlock_GLU(in_channels=128, out_channels=256*2)
+ self.conv_block5 = ConvBlock_GLU(in_channels=256, out_channels=256*2)
+ self.conv_block6 = ConvBlock_GLU(in_channels=256, out_channels=512*2)
+ self.conv_block7 = ConvBlock_GLU(in_channels=512, out_channels=512*2)
+ self.padding = nn.ReplicationPad2d((0,1,0,1))
+
+ def forward(self, input, fi=None):
+ """
+ Input: (batch_size, data_length)"""
+ x1 = self.conv_block1_1(input, pool_size=(2, 2), pool_type='avg')
+ x1 = x1[:,:,:500,:32]
+ #print('x1 ',x1.shape)
+ x2 = self.conv_block1_2(input,pool_size=(2,2),pool_type='avg')
+ #print('x2 ',x2.shape)
+ x3 = self.conv_block1_3(input,pool_size=(2,2),pool_type='avg')
+ x3 = self.padding(x3)
+ #print('x3 ',x3.shape)
+ # assert 1==2
+ x = torch.cat([x1,x2],dim=1)
+ x = torch.cat([x,x3],dim=1)
+ #print('x ',x.shape)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='None')
+ x = self.conv_block3(x,pool_size=(2,2),pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training) #
+ #print('x2,3 ',x.shape)
+ x = self.conv_block4(x, pool_size=(2, 4), pool_type='None')
+ x = self.conv_block5(x,pool_size=(2,4),pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ #print('x4,5 ',x.shape)
+
+ x = self.conv_block6(x, pool_size=(1, 4), pool_type='None')
+ x = self.conv_block7(x, pool_size=(1, 4), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ # print('x6,7 ',x.shape)
+ # assert 1==2
+ return x
+
+class Cnn14(nn.Module):
+ def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, mel_bins=64, fmin=50,
+ fmax=14000, classes_num=527):
+
+ super(Cnn14, self).__init__()
+
+ window = 'hann'
+ center = True
+ pad_mode = 'reflect'
+ ref = 1.0
+ amin = 1e-10
+ top_db = None
+
+ # Spectrogram extractor
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+ freeze_parameters=True)
+
+ # Logmel feature extractor
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+ freeze_parameters=True)
+
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+ freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+ self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+ self.fc1 = nn.Linear(2048, 128, bias=True)
+ self.fc_audioset = nn.Linear(128, classes_num, bias=True)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.fc1)
+ init_layer(self.fc_audioset)
+
+ def forward(self, input_, mixup_lambda=None):
+ """
+        Input: (batch_size, time_steps, mel_bins)"""
+ input_ = input_.unsqueeze(1)
+ x = self.conv_block1(input_, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block5(x, pool_size=(1, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block6(x, pool_size=(1, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ # print(x.shape)
+ # x = torch.mean(x, dim=3)
+ x = x.transpose(1, 2).contiguous().flatten(-2)
+ x = self.fc1(x)
+ # print(x.shape)
+ # assert 1==2
+ # (x1,_) = torch.max(x, dim=2)
+ # x2 = torch.mean(x, dim=2)
+ # x = x1 + x2
+ # x = F.dropout(x, p=0.5, training=self.training)
+ # x = F.relu_(self.fc1(x))
+ # embedding = F.dropout(x, p=0.5, training=self.training)
+ return x
+
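+# Note (added for clarity): in this file Cnn14.forward returns a sequence of frame-level
+# 128-dim embeddings (fc_audioset is initialised but never applied), so the class serves
+# as the reference-audio encoder rather than a clip-level AudioSet classifier.
+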
+class Cnn10_fi(nn.Module):
+ def __init__(self):
+ super(Cnn10_fi, self).__init__()
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+
+ # self.fc1 = nn.Linear(512, 512, bias=True)
+ # self.fc_audioset = nn.Linear(512, classes_num, bias=True)
+
+ # self.init_weight()
+
+ def forward(self, input, fi=None):
+ """
+ Input: (batch_size, data_length)"""
+
+ x = self.conv_block1(input, pool_size=(2, 2), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=(2, 4), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=(1, 4), pool_type='avg')
+        if fi is not None:
+ gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
+ x = (gamma)*x + beta
+ x = F.dropout(x, p=0.2, training=self.training)
+ return x
+
+class Cnn10_mul_scale(nn.Module):
+ def __init__(self,scale=8):
+ super(Cnn10_mul_scale, self).__init__()
+ self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1))
+ self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3))
+ self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5))
+ self.conv_block2 = ConvBlock(in_channels=96, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.scale = scale
+ self.padding = nn.ReplicationPad2d((0,1,0,1))
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+ """
+ Input: (batch_size, data_length)"""
+ if self.scale == 8:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (2,4)
+ pool_size4 = (1,4)
+ elif self.scale == 4:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ elif self.scale == 2:
+ pool_size1 = (2,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ else:
+ pool_size1 = (1,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ # print('input ',input.shape)
+ x1 = self.conv_block1_1(input, pool_size=pool_size1, pool_type='avg')
+ x1 = x1[:,:,:500,:32]
+ #print('x1 ',x1.shape)
+ x2 = self.conv_block1_2(input, pool_size=pool_size1, pool_type='avg')
+ #print('x2 ',x2.shape)
+ x3 = self.conv_block1_3(input, pool_size=pool_size1, pool_type='avg')
+ x3 = self.padding(x3)
+ #print('x3 ',x3.shape)
+ # assert 1==2
+ m_i = min(x3.shape[2],min(x1.shape[2],x2.shape[2]))
+ #print('m_i ', m_i)
+ x = torch.cat([x1[:,:,:m_i,:],x2[:,:, :m_i,:],x3[:,:, :m_i,:]],dim=1)
+ # x = torch.cat([x,x3],dim=1)
+
+ # x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ return x
+
+
+class Cnn10(nn.Module):
+ def __init__(self,scale=8):
+ super(Cnn10, self).__init__()
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.scale = scale
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+ """
+ Input: (batch_size, data_length)"""
+ if self.scale == 8:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (2,4)
+ pool_size4 = (1,4)
+ elif self.scale == 4:
+ pool_size1 = (2,2)
+ pool_size2 = (2,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ elif self.scale == 2:
+ pool_size1 = (2,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ else:
+ pool_size1 = (1,2)
+ pool_size2 = (1,2)
+ pool_size3 = (1,4)
+ pool_size4 = (1,4)
+ x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ return x
+
+class MeanPool(nn.Module):
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ return torch.mean(decision, dim=self.pooldim)
+
+class ResPool(nn.Module):
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+ self.linPool = LinearSoftPool(pooldim=1)
+
+class AutoExpPool(nn.Module):
+ def __init__(self, outputdim=10, pooldim=1):
+ super().__init__()
+ self.outputdim = outputdim
+        self.alpha = nn.Parameter(torch.full((outputdim,), 1.0))  # float fill value so the parameter is trainable
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ scaled = self.alpha * decision # \alpha * P(Y|x) in the paper
+ return (logits * torch.exp(scaled)).sum(
+ self.pooldim) / torch.exp(scaled).sum(self.pooldim)
+
+
+class SoftPool(nn.Module):
+ def __init__(self, T=1, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+ self.T = T
+
+ def forward(self, logits, decision):
+ w = torch.softmax(decision / self.T, dim=self.pooldim)
+ return torch.sum(decision * w, dim=self.pooldim)
+
+
+class AutoPool(nn.Module):
+    """AutoPool: softmax pooling over time with a learnable per-class scaling (alpha)."""
+ def __init__(self, outputdim=10, pooldim=1):
+ super().__init__()
+ self.outputdim = outputdim
+ self.alpha = nn.Parameter(torch.ones(outputdim))
+ self.dim = pooldim
+
+ def forward(self, logits, decision):
+ scaled = self.alpha * decision # \alpha * P(Y|x) in the paper
+ weight = torch.softmax(scaled, dim=self.dim)
+ return torch.sum(decision * weight, dim=self.dim) # B x C
+
+
+class ExtAttentionPool(nn.Module):
+ def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
+ super().__init__()
+ self.inputdim = inputdim
+ self.outputdim = outputdim
+ self.pooldim = pooldim
+ self.attention = nn.Linear(inputdim, outputdim)
+ nn.init.zeros_(self.attention.weight)
+ nn.init.zeros_(self.attention.bias)
+ self.activ = nn.Softmax(dim=self.pooldim)
+
+ def forward(self, logits, decision):
+ # Logits of shape (B, T, D), decision of shape (B, T, C)
+ w_x = self.activ(self.attention(logits) / self.outputdim)
+ h = (logits.permute(0, 2, 1).contiguous().unsqueeze(-2) *
+ w_x.unsqueeze(-1)).flatten(-2).contiguous()
+ return torch.sum(h, self.pooldim)
+
+
+class AttentionPool(nn.Module):
+    """AttentionPool: pools frame decisions with attention weights learned from the features."""
+ def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
+ super().__init__()
+ self.inputdim = inputdim
+ self.outputdim = outputdim
+ self.pooldim = pooldim
+ self.transform = nn.Linear(inputdim, outputdim)
+ self.activ = nn.Softmax(dim=self.pooldim)
+ self.eps = 1e-7
+
+ def forward(self, logits, decision):
+ # Input is (B, T, D)
+ # B, T , D
+ w = self.activ(torch.clamp(self.transform(logits), -15, 15))
+ detect = (decision * w).sum(
+ self.pooldim) / (w.sum(self.pooldim) + self.eps)
+ # B, T, D
+ return detect
+
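+# Note (added for clarity): clamping the pre-softmax attention scores to [-15, 15]
+# above keeps the softmax weights numerically stable before the weighted average.
+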
+class Block2D(nn.Module):
+ def __init__(self, cin, cout, kernel_size=3, padding=1):
+ super().__init__()
+ self.block = nn.Sequential(
+ nn.BatchNorm2d(cin),
+ nn.Conv2d(cin,
+ cout,
+ kernel_size=kernel_size,
+ padding=padding,
+ bias=False),
+ nn.LeakyReLU(inplace=True, negative_slope=0.1))
+
+ def forward(self, x):
+ return self.block(x)
+
+class AudioCNN(nn.Module):
+ def __init__(self, classes_num):
+ super(AudioCNN, self).__init__()
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.fc1 = nn.Linear(512,128,bias=True)
+ self.fc = nn.Linear(128, classes_num, bias=True)
+ self.init_weights()
+
+ def init_weights(self):
+ init_layer(self.fc)
+
+ def forward(self, input):
+ '''
+ Input: (batch_size, times_steps, freq_bins)'''
+ # [128, 801, 168] --> [128,1,801,168]
+ x = input[:, None, :, :]
+ '''(batch_size, 1, times_steps, freq_bins)'''
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') # 128,64,400,84
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') # 128,128,200,42
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') # 128,256,100,21
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') # 128,512,50,10
+ '''(batch_size, feature_maps, time_steps, freq_bins)'''
+ x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) # 128,512,50
+ (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 128,512
+ x = self.fc1(x) # 128,128
+ output = self.fc(x) # 128,10
+ return x,output
+
+ def extract(self,input):
+ '''Input: (batch_size, times_steps, freq_bins)'''
+ x = input[:, None, :, :]
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+ '''(batch_size, feature_maps, time_steps, freq_bins)'''
+ x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes)
+ (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps)
+ x = self.fc1(x) # 128,128
+ return x
+
+def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
+ """parse_poolingfunction
+    A helper function to parse any temporal pooling function.
+ Pooling is done on dimension 1
+ :param poolingfunction_name:
+ :param **kwargs:
+ """
+ poolingfunction_name = poolingfunction_name.lower()
+ if poolingfunction_name == 'mean':
+ return MeanPool(pooldim=1)
+ elif poolingfunction_name == 'max':
+ return MaxPool(pooldim=1)
+ elif poolingfunction_name == 'linear':
+ return LinearSoftPool(pooldim=1)
+ elif poolingfunction_name == 'expalpha':
+ return AutoExpPool(outputdim=kwargs['outputdim'], pooldim=1)
+
+ elif poolingfunction_name == 'soft':
+ return SoftPool(pooldim=1)
+ elif poolingfunction_name == 'auto':
+ return AutoPool(outputdim=kwargs['outputdim'])
+    elif poolingfunction_name == 'attention':
+        return AttentionPool(inputdim=kwargs['inputdim'],
+                             outputdim=kwargs['outputdim'])
+    else:
+        raise ValueError('Unknown pooling function: {}'.format(poolingfunction_name))
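+
+# Illustrative usage (hypothetical dimensions): parse_poolingfunction('attention',
+# inputdim=256, outputdim=10) builds an AttentionPool whose forward pools
+# (logits: B x T x 256, decision: B x T x 10) over the time axis (dim 1).
+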
+class conv1d(nn.Module):
+ def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID', dilation=1):
+ super(conv1d, self).__init__()
+ if padding == 'VALID':
+ dconv_pad = 0
+ elif padding == 'SAME':
+ dconv_pad = dilation * ((kernel_size - 1) // 2)
+ else:
+ raise ValueError("Padding Mode Error!")
+ self.conv = nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride, padding=dconv_pad)
+ self.act = nn.ReLU()
+ self.init_layer(self.conv)
+
+ def init_layer(self, layer, nonlinearity='relu'):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity)
+ nn.init.constant_(layer.bias, 0.1)
+
+ def forward(self, x):
+ out = self.act(self.conv(x))
+ return out
+
+class Atten_1(nn.Module):
+ def __init__(self, input_dim, context=2, dropout_rate=0.2):
+ super(Atten_1, self).__init__()
+ self._matrix_k = nn.Linear(input_dim, input_dim // 4)
+ self._matrix_q = nn.Linear(input_dim, input_dim // 4)
+ self.relu = nn.ReLU()
+ self.context = context
+ self._dropout_layer = nn.Dropout(dropout_rate)
+ self.init_layer(self._matrix_k)
+ self.init_layer(self._matrix_q)
+
+ def init_layer(self, layer, nonlinearity='leaky_relu'):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity)
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+ def forward(self, input_x):
+ k_x = input_x
+ k_x = self.relu(self._matrix_k(k_x))
+ k_x = self._dropout_layer(k_x)
+ # print('k_x ',k_x.shape)
+ q_x = input_x[:, self.context, :]
+ # print('q_x ',q_x.shape)
+ q_x = q_x[:, None, :]
+ # print('q_x1 ',q_x.shape)
+ q_x = self.relu(self._matrix_q(q_x))
+ q_x = self._dropout_layer(q_x)
+ # print('q_x2 ',q_x.shape)
+ x_ = torch.matmul(k_x, q_x.transpose(-2, -1) / math.sqrt(k_x.size(-1)))
+ # print('x_ ',x_.shape)
+ x_ = x_.squeeze(2)
+ alpha = F.softmax(x_, dim=-1)
+ att_ = alpha
+ # print('alpha ',alpha)
+ alpha = alpha.unsqueeze(2).repeat(1,1,input_x.shape[2])
+ # print('alpha ',alpha)
+ # alpha = alpha.view(alpha.size(0), alpha.size(1), alpha.size(2), 1)
+ out = alpha * input_x
+ # print('out ', out.shape)
+ # out = out.mean(2)
+ out = out.mean(1)
+ # print('out ',out.shape)
+ # assert 1==2
+ #y = alpha * input_x
+ #return y, att_
+ out = input_x[:, self.context, :] + out
+ return out
+
+class Fusion(nn.Module):
+ def __init__(self, inputdim, inputdim2, n_fac):
+ super().__init__()
+ self.fuse_layer1 = conv1d(inputdim, inputdim2*n_fac,1)
+ self.fuse_layer2 = conv1d(inputdim2, inputdim2*n_fac,1)
+        self.avg_pool = nn.AvgPool1d(n_fac, stride=n_fac)  # average-pool along the last (feature) dimension
+
+ def forward(self,embedding,mix_embed):
+ embedding = embedding.permute(0,2,1)
+        fuse1_out = self.fuse_layer1(embedding)  # e.g. [2, 501, 2560] (512 * 5): 1-D conv that expands the speaker embedding
+        fuse1_out = fuse1_out.permute(0,2,1)
+
+        mix_embed = mix_embed.permute(0,2,1)
+        fuse2_out = self.fuse_layer2(mix_embed)  # e.g. [2, 501, 2560] (512 * 5): 1-D conv that expands the mixture features
+        fuse2_out = fuse2_out.permute(0,2,1)
+        as_embs = torch.mul(fuse1_out, fuse2_out)  # element-wise product, [2, 501, 2560]
+        # (10, 501, 512)
+        as_embs = self.avg_pool(as_embs)  # [2, 501, 512], i.e. 2560 // 5
+ return as_embs
+
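+# Minimal shape sketch (hypothetical sizes): with Fusion(128, 512, n_fac=2), an
+# embedding of shape (B, T, 128) and mixture features of shape (B, T, 512) are each
+# projected to (B, T, 1024) by 1x1 convolutions, multiplied element-wise, and
+# average-pooled back down to (B, T, 512).
+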
+class CDur_fusion(nn.Module):
+ def __init__(self, inputdim, outputdim, **kwargs):
+ super().__init__()
+ self.features = nn.Sequential(
+ Block2D(1, 32),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(32, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(128, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (1, 4)),
+ nn.Dropout(0.3),
+ )
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+ self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True)
+        self.fusion = Fusion(128, 128, 2)  # assumed dims: original code passed Fusion(128, 2), which does not match Fusion(inputdim, inputdim2, n_fac)
+ self.fc = nn.Linear(256,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = self.fusion(embedding,x)
+ #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur(nn.Module):
+ def __init__(self, inputdim, outputdim,time_resolution, **kwargs):
+ super().__init__()
+ self.features = nn.Sequential(
+ Block2D(1, 32),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(32, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(128, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ nn.Dropout(0.3),
+ )
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+ self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True)
+ self.fc = nn.Linear(512,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_big(nn.Module):
+ def __init__(self, inputdim, outputdim, **kwargs):
+ super().__init__()
+ self.features = nn.Sequential(
+ Block2D(1, 64),
+ Block2D(64, 64),
+ nn.LPPool2d(4, (2, 2)),
+ Block2D(64, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 2)),
+ Block2D(128, 256),
+ Block2D(256, 256),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(256, 512),
+ Block2D(512, 512),
+ nn.LPPool2d(4, (1, 4)),
+ nn.Dropout(0.3),)
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_GLU(nn.Module):
+ def __init__(self, inputdim, outputdim, **kwargs):
+ super().__init__()
+ self.features = Mul_scale_GLU()
+ # with torch.no_grad():
+ # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ self.gru = nn.GRU(640, 512,1, bidirectional=True, batch_first=True) # previous is 640
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None): #
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_CNN14(nn.Module):
+ def __init__(self, inputdim, outputdim,time_resolution,**kwargs):
+ super().__init__()
+ if time_resolution==125:
+ self.features = Cnn10(8)
+ elif time_resolution == 250:
+ #print('time_resolution ',time_resolution)
+ self.features = Cnn10(4)
+ elif time_resolution == 500:
+ self.features = Cnn10(2)
+ else:
+ self.features = Cnn10(0)
+ with torch.no_grad():
+ rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ # self.features = Cnn10()
+ self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None):
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_CNN_mul_scale(nn.Module):
+ def __init__(self, inputdim, outputdim,time_resolution,**kwargs):
+ super().__init__()
+ if time_resolution==125:
+ self.features = Cnn10_mul_scale(8)
+ elif time_resolution == 250:
+ #print('time_resolution ',time_resolution)
+ self.features = Cnn10_mul_scale(4)
+ elif time_resolution == 500:
+ self.features = Cnn10_mul_scale(2)
+ else:
+ self.features = Cnn10_mul_scale(0)
+ # with torch.no_grad():
+ # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ # self.features = Cnn10()
+ self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None):
+ # print('x ',x.shape)
+ # assert 1==2
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+class CDur_CNN_mul_scale_fusion(nn.Module):
+ def __init__(self, inputdim, outputdim, time_resolution,**kwargs):
+ super().__init__()
+ if time_resolution==125:
+ self.features = Cnn10_mul_scale(8)
+ elif time_resolution == 250:
+ #print('time_resolution ',time_resolution)
+ self.features = Cnn10_mul_scale(4)
+ elif time_resolution == 500:
+ self.features = Cnn10_mul_scale(2)
+ else:
+ self.features = Cnn10_mul_scale(0)
+ # with torch.no_grad():
+ # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape
+ # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+ # self.features = Cnn10()
+ self.gru = nn.GRU(512, 512, bidirectional=True, batch_first=True)
+ # self.gru = LSTMModel(640, 512,1)
+ self.fc = nn.Linear(1024,256)
+ self.fusion = Fusion(128,512,2)
+ self.outputlayer = nn.Linear(256, outputdim)
+ # self.features.apply(init_weights)
+ self.outputlayer.apply(init_weights)
+
+ def forward(self, x, embedding,one_hot=None):
+ # print('x ',x.shape)
+ # assert 1==2
+ batch, time, dim = x.shape
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 512)
+ # print('x ',x.shape)
+ # assert 1==2
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ x = self.fusion(embedding, x)
+ #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.gru.flatten_parameters()
+ x, _ = self.gru(x) # x torch.Size([16, 125, 256])
+ # x = self.gru(x) # x torch.Size([16, 125, 256])
+ x = self.fc(x)
+ decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0],decision_up
+
+
+class RaDur_fusion(nn.Module):
+ def __init__(self, model_config, inputdim, outputdim, time_resolution, **kwargs):
+ super().__init__()
+ self.encoder = Cnn14()
+ self.detection = CDur_CNN_mul_scale_fusion(inputdim, outputdim, time_resolution)
+ self.softmax = nn.Softmax(dim=2)
+ #self.temperature = 5
+ # if model_config['pre_train']:
+ # self.encoder.load_state_dict(torch.load(model_config['encoder_path'])['model'])
+ # self.detection.load_state_dict(torch.load(model_config['CDur_path']))
+
+ self.q = nn.Linear(128,128)
+ self.k = nn.Linear(128,128)
+ self.q_ee = nn.Linear(128, 128)
+ self.k_ee = nn.Linear(128, 128)
+ self.temperature = 11.3 # sqrt(128)
+ self.att_pool = model_config['att_pool']
+ self.enhancement = model_config['enhancement']
+ self.tao = model_config['tao']
+ self.top = model_config['top']
+ self.bn = nn.BatchNorm1d(128)
+ self.EE_fusion = Fusion(128, 128, 4)
+
+ def get_w(self,q,k):
+ q = self.q(q)
+ k = self.k(k)
+ q = q.unsqueeze(1)
+ attn = torch.bmm(q, k.transpose(1, 2))
+ attn = attn/self.temperature
+ attn = self.softmax(attn)
+ return attn
+
+ def get_w_ee(self,q,k):
+ q = self.q_ee(q)
+ k = self.k_ee(k)
+ q = q.unsqueeze(1)
+ attn = torch.bmm(q, k.transpose(1, 2))
+ attn = attn/self.temperature
+ attn = self.softmax(attn)
+ return attn
+
+ def attention_pooling(self, embeddings, mean_embedding):
+ att_pool_w = self.get_w(mean_embedding,embeddings)
+ embedding = torch.bmm(att_pool_w, embeddings).squeeze(1)
+ # print(embedding.shape)
+ # print(att_pool_w.shape)
+ # print(att_pool_w[0])
+ # assert 1==2
+ return embedding
+
+ def select_topk_embeddings(self, scores, embeddings, k):
+        _, idx_DESC = scores.sort(descending=True, dim=1)  # sort frames by score
+ top_k = _[:,:k]
+ # print('top_k ', top_k)
+ # top_k = top_k.mean(1)
+        idx_topk = idx_DESC[:, :k]  # indices of the top-k frames
+ # print('index ', idx_topk)
+ idx_topk = idx_topk.unsqueeze(2).expand([-1, -1, embeddings.shape[2]])
+ selected_embeddings = torch.gather(embeddings, 1, idx_topk)
+ return selected_embeddings,top_k
+
+ def sum_with_attention(self, embedding, top_k, selected_embeddings):
+ # print('embedding ',embedding)
+ # print('selected_embeddings ',selected_embeddings.shape)
+ att_1 = self.get_w_ee(embedding, selected_embeddings)
+ att_1 = att_1.squeeze(1)
+ #print('att_1 ',att_1.shape)
+ larger = top_k > self.tao
+ # print('larger ',larger)
+ top_k = top_k*larger
+ # print('top_k ',top_k.shape)
+ # print('top_k ',top_k)
+ att_1 = att_1*top_k
+ #print('att_1 ',att_1.shape)
+ # assert 1==2
+ att_2 = att_1.unsqueeze(2).repeat(1,1,128)
+ Es = selected_embeddings*att_2
+ return Es
+
+ def orcal_EE(self, x, embedding, label):
+ batch, time, dim = x.shape
+
+ mixture_embedding = self.encoder(x) # 8, 125, 128
+ mixture_embedding = mixture_embedding.transpose(1,2)
+ mixture_embedding = self.bn(mixture_embedding)
+ mixture_embedding = mixture_embedding.transpose(1,2)
+
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.detection.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding_pre = embedding.unsqueeze(1)
+ embedding_pre = embedding_pre.repeat(1, x.shape[1], 1)
+ f = self.detection.fusion(embedding_pre, x) # the first stage results
+ #f = torch.cat((x, embedding_pre), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.detection.gru.flatten_parameters()
+ f, _ = self.detection.gru(f) # x torch.Size([16, 125, 256])
+ f = self.detection.fc(f)
+ decision_time = torch.softmax(self.detection.outputlayer(f),dim=2) # x torch.Size([16, 125, 2])
+
+ selected_embeddings, top_k = self.select_topk_embeddings(decision_time[:,:,0], mixture_embedding, self.top)
+
+ selected_embeddings = self.sum_with_attention(embedding, top_k, selected_embeddings) # add the weight
+
+ mix_embedding = selected_embeddings.mean(1).unsqueeze(1) #
+ mix_embedding = mix_embedding.repeat(1, x.shape[1], 1)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+        mix_embedding = self.EE_fusion(mix_embedding, embedding)  # fuse the two embeddings with the learned Fusion module
+        # mix_embedding2 = selected_embeddings2.mean(1)
+        # mix_embedding = embedding + mix_embedding  # simple additive fusion (disabled)
+ # new detection results
+ # embedding_now = mix_embedding.unsqueeze(1)
+ # embedding_now = embedding_now.repeat(1, x.shape[1], 1)
+ f_now = self.detection.fusion(mix_embedding, x)
+ #f_now = torch.cat((x, embedding_now), dim=2) #
+ f_now, _ = self.detection.gru(f_now) # x torch.Size([16, 125, 256])
+ f_now = self.detection.fc(f_now)
+ decision_time_now = torch.softmax(self.detection.outputlayer(f_now), dim=2) # x torch.Size([16, 125, 2])
+
+        top_k = top_k.mean(1)  # average score; higher-scoring clips get more weight
+ larger = top_k > self.tao
+ top_k = top_k * larger
+ top_k = top_k/2.0
+ # print('top_k ',top_k)
+ # assert 1==2
+ # print('tok_k[ ',top_k.shape)
+ # print('decision_time ',decision_time.shape)
+ # print('decision_time_now ',decision_time_now.shape)
+ neg_w = top_k.unsqueeze(1).unsqueeze(2)
+ neg_w = neg_w.repeat(1, decision_time_now.shape[1], decision_time_now.shape[2])
+ # print('neg_w ',neg_w.shape)
+ #print('neg_w ',neg_w[:,0:10,0])
+ pos_w = 1-neg_w
+ #print('pos_w ',pos_w[:,0:10,0])
+ decision_time_final = decision_time*pos_w + neg_w*decision_time_now
+ #print('decision_time_final ',decision_time_final[0,0:10,0])
+ # print(decision_time_final[0,:,:])
+ #assert 1==2
+ return decision_time_final
+
+ def forward(self, x, ref, label=None):
+ batch, time, dim = x.shape
+ logit = torch.zeros(1).cuda()
+ embeddings = self.encoder(ref)
+ mean_embedding = embeddings.mean(1)
+        if self.att_pool:
+ mean_embedding = self.bn(mean_embedding)
+ embeddings = embeddings.transpose(1,2)
+ embeddings = self.bn(embeddings)
+ embeddings = embeddings.transpose(1,2)
+ embedding = self.attention_pooling(embeddings, mean_embedding)
+ else:
+ embedding = mean_embedding
+        if self.enhancement:
+ decision_time = self.orcal_EE(x, embedding, label)
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2), # [16, 2, 125]
+ time, # 501
+ mode='linear',
+                align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0], decision_up, logit
+
+ x = x.unsqueeze(1) # (b,1,t,d)
+ x = self.detection.features(x) #
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the last two dims  # (b, 125, 128)
+ embedding = embedding.unsqueeze(1)
+ embedding = embedding.repeat(1, x.shape[1], 1)
+ # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ x = self.detection.fusion(embedding, x)
+ # embedding = embedding.unsqueeze(1)
+ # embedding = embedding.repeat(1, x.shape[1], 1)
+ # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim]
+ if not hasattr(self, '_flattened'):
+ self.detection.gru.flatten_parameters()
+ x, _ = self.detection.gru(x) # x torch.Size([16, 125, 256])
+ x = self.detection.fc(x)
+ decision_time = torch.softmax(self.detection.outputlayer(x),dim=2) # x torch.Size([16, 125, 2])
+ decision_up = torch.nn.functional.interpolate(
+ decision_time.transpose(1, 2),
+ time, # 501
+ mode='linear',
+            align_corners=False).transpose(1, 2)  # upsample from 125 frames back to the input length (e.g. 501) --> (16, 501, 2)
+ return decision_time[:,:,0], decision_up, logit
\ No newline at end of file
diff --git a/audio_detection/target_sound_detection/src/utils.py b/audio_detection/target_sound_detection/src/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf1deeaef4e51fcc7cc42f4f3e2d9a34296371f9
--- /dev/null
+++ b/audio_detection/target_sound_detection/src/utils.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2021/3/9 16:33
+# @Author : dongchao yang
+# @File : train.py
+
+import collections.abc
+import sys
+from loguru import logger
+from pprint import pformat
+
+import numpy as np
+import pandas as pd
+import scipy
+import six
+import sklearn.preprocessing as pre
+import torch
+import tqdm
+import yaml
+
+from scipy.interpolate import interp1d
+
+def parse_config_or_kwargs(config_file, **kwargs):
+ """parse_config_or_kwargs
+ :param config_file: Config file that has parameters, yaml format
+ :param **kwargs: Other alternative parameters or overwrites for config
+ """
+ with open(config_file) as con_read:
+ yaml_config = yaml.load(con_read, Loader=yaml.FullLoader)
+ arguments = dict(yaml_config, **kwargs)
+ return arguments
+
+
+def find_contiguous_regions(activity_array):  # an equivalent O(n) loop also works if the vectorised version below is unclear
+ """Find contiguous regions from bool valued numpy.array.
+ Copy of https://dcase-repo.github.io/dcase_util/_modules/dcase_util/data/decisions.html#DecisionEncoder
+ Reason is:
+ 1. This does not belong to a class necessarily
+ 2. Import DecisionEncoder requires sndfile over some other imports..which causes some problems on clusters
+ """
+ change_indices = np.logical_xor(activity_array[1:], activity_array[:-1]).nonzero()[0]
+ change_indices += 1
+ if activity_array[0]:
+ # If the first element of activity_array is True add 0 at the beginning
+ change_indices = np.r_[0, change_indices]
+
+ if activity_array[-1]:
+ # If the last element of activity_array is True, add the length of the array
+ change_indices = np.r_[change_indices, activity_array.size]
+ # print(change_indices.reshape((-1, 2)))
+ # Reshape the result into two columns
+ return change_indices.reshape((-1, 2))
+
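+# Example (illustrative):
+#   act = np.array([0, 1, 1, 0, 0, 1], dtype=bool)
+#   find_contiguous_regions(act) -> array([[1, 3], [5, 6]])
+# i.e. half-open [onset, offset) index pairs of the active regions.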
+
+def split_train_cv(
+ data_frame: pd.DataFrame,
+ frac: float = 0.9,
+ y=None, # Only for stratified, computes necessary split
+ **kwargs):
+ """split_train_cv
+
+ :param data_frame:
+ :type data_frame: pd.DataFrame
+ :param frac:
+ :type frac: float
+ """
+ if kwargs.get('mode',
+ None) == 'urbansed': # Filenames are DATA_-1 DATA_-2 etc
+ data_frame.loc[:, 'id'] = data_frame.groupby(
+ data_frame['filename'].str.split('_').apply(
+ lambda x: '_'.join(x[:-1]))).ngroup()
+ sampler = np.random.permutation(data_frame['id'].nunique())
+ num_train = int(frac * len(sampler))
+ train_indexes = sampler[:num_train]
+ cv_indexes = sampler[num_train:]
+ train_data = data_frame[data_frame['id'].isin(train_indexes)]
+ cv_data = data_frame[data_frame['id'].isin(cv_indexes)]
+ del train_data['id']
+ del cv_data['id']
+ elif kwargs.get('mode', None) == 'stratified':
+ # Use stratified sampling
+ from skmultilearn.model_selection import iterative_train_test_split
+ index_train, _, index_cv, _ = iterative_train_test_split(
+ data_frame.index.values.reshape(-1, 1), y, test_size=1. - frac)
+ train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
+ cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())] # cv --> cross validation
+ else:
+ # Simply split train_test
+ train_data = data_frame.sample(frac=frac, random_state=10)
+ cv_data = data_frame[~data_frame.index.isin(train_data.index)]
+ return train_data, cv_data
+
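+# Usage sketch (illustrative, label_df is a placeholder DataFrame with a 'filename' column):
+#   train_df, cv_df = split_train_cv(label_df, frac=0.9)                   # random 90/10 split
+#   train_df, cv_df = split_train_cv(label_df, frac=0.9, mode='urbansed')  # split by base file id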
+
+
+def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'): # print yaml file
+ """pprint_dict
+ :param outputfun: function to use, defaults to sys.stdout
+ :param in_dict: dict to print
+ """
+ if formatter == 'yaml':
+ format_fun = yaml.dump
+ elif formatter == 'pretty':
+ format_fun = pformat
+ for line in format_fun(in_dict).split('\n'):
+ outputfun(line)
+
+
+def getfile_outlogger(outputfile):
+ log_format = "[{time:YYYY-MM-DD HH:mm:ss}] {message}"
+ logger.configure(handlers=[{"sink": sys.stderr, "format": log_format}])
+ if outputfile:
+ logger.add(outputfile, enqueue=True, format=log_format)
+ return logger
+
+# build the label encoder from the given labels
+def train_labelencoder(labels: pd.Series, sparse=True):
+ """encode_labels
+
+ Encodes labels
+
+ :param labels: pd.Series representing the raw labels e.g., Speech, Water
+ :param encoder (optional): Encoder already fitted
+ returns encoded labels (many hot) and the encoder
+ """
+ assert isinstance(labels, pd.Series), "Labels need to be series"
+ if isinstance(labels[0], six.string_types):
+ # In case of using non processed strings, e.g., Vacuum, Speech
+ label_array = labels.str.split(',').values.tolist() # split label according to ','
+ elif isinstance(labels[0], np.ndarray):
+ # Encoder does not like to see numpy array
+ label_array = [lab.tolist() for lab in labels]
+ elif isinstance(labels[0], collections.abc.Iterable):
+ label_array = labels
+ encoder = pre.MultiLabelBinarizer(sparse_output=sparse)
+ encoder.fit(label_array)
+ return encoder
+
+
+def encode_labels(labels: pd.Series, encoder=None, sparse=True):
+ """encode_labels
+
+ Encodes labels
+
+ :param labels: pd.Series representing the raw labels e.g., Speech, Water
+ :param encoder (optional): Encoder already fitted
+ returns encoded labels (many hot) and the encoder
+ """
+ assert isinstance(labels, pd.Series), "Labels need to be series"
+ instance = labels.iloc[0]
+ if isinstance(instance, six.string_types):
+ # In case of using non processed strings, e.g., Vacuum, Speech
+ label_array = labels.str.split(',').values.tolist()
+ elif isinstance(instance, np.ndarray):
+ # Encoder does not like to see numpy array
+ label_array = [lab.tolist() for lab in labels]
+ elif isinstance(instance, collections.abc.Iterable):
+ label_array = labels
+ # label_array is now a list of label lists (string labels)
+ if not encoder:
+ encoder = pre.MultiLabelBinarizer(sparse_output=sparse) # if no encoder is given, fit a new one first
+ encoder.fit(label_array)
+ labels_encoded = encoder.transform(label_array) # transform string labels to multi-hot vectors
+ return labels_encoded, encoder
+
+ # return pd.arrays.SparseArray(
+ # [row.toarray().ravel() for row in labels_encoded]), encoder
+
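+# Example (illustrative):
+#   labels = pd.Series(["Speech,Dog", "Water"])
+#   y, enc = encode_labels(labels, sparse=False)
+#   # y -> [[1, 1, 0], [0, 0, 1]] with enc.classes_ == ['Dog', 'Speech', 'Water']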
+
+def decode_with_timestamps(events, labels: np.array):
+ """decode_with_timestamps
+ Decodes the predicted label array (2d) into a list of
+ [(Labelname, onset, offset), ...]
+
+ :param events: event name(s) the predictions correspond to
+ :param labels: n-dim binary prediction array
+ :type labels: np.array
+ """
+ # print('events ',events)
+ # print('labels ',labels.shape)
+ #assert 1==2
+ if labels.ndim == 2:
+ #print('...')
+ return [_decode_with_timestamps(events[i],labels[i]) for i in range(labels.shape[0])]
+ else:
+ return _decode_with_timestamps(events,labels)
+
+
+def median_filter(x, window_size, threshold=0.5):
+ """median_filter
+ :param x: input prediction array of shape (B, T, C) or (B, T).
+ Input is a sequence of probabilities 0 <= x <= 1
+ :param window_size: An integer to use
+ :param threshold: Binary thresholding threshold
+ """
+ x = binarize(x, threshold=threshold) # convert to 0 or 1
+ if x.ndim == 3:
+ size = (1, window_size, 1)
+ elif x.ndim == 2 and x.shape[0] == 1:
+ # Assume input is class-specific median filtering
+ # E.g, Batch x Time [1, 501]
+ size = (1, window_size)
+ elif x.ndim == 2 and x.shape[0] > 1:
+ # Assume input is standard median pooling, class-independent
+ # E.g., Time x Class [501, 10]
+ size = (window_size, 1)
+ return scipy.ndimage.median_filter(x, size=size)
+
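+# Usage sketch (illustrative): for frame-level probabilities of shape (batch, time, class),
+#   smoothed = median_filter(probs, window_size=5, threshold=0.5)
+# first binarizes the probabilities and then median-filters each class track along time.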
+
+def _decode_with_timestamps(events,labels):
+ result_labels = []
+ # print('.......')
+ # print('labels ',labels.shape)
+ # print(labels)
+ change_indices = find_contiguous_regions(labels)
+ # print(change_indices)
+ # assert 1==2
+ for row in change_indices:
+ result_labels.append((events,row[0], row[1]))
+ return result_labels
+
+def inverse_transform_labels(encoder, pred):
+ if pred.ndim == 3:
+ return [encoder.inverse_transform(x) for x in pred]
+ else:
+ return encoder.inverse_transform(pred)
+
+
+def binarize(pred, threshold=0.5):
+ # Batch_wise
+ if pred.ndim == 3:
+ return np.array(
+ [pre.binarize(sub, threshold=threshold) for sub in pred])
+ else:
+ return pre.binarize(pred, threshold=threshold)
+
+
+def double_threshold(x, high_thres, low_thres, n_connect=1):
+ """double_threshold
+ Helper function to calculate double threshold for n-dim arrays
+
+ :param x: input array
+ :param high_thres: high threshold value
+ :param low_thres: Low threshold value
+ :param n_connect: Distance of <= n clusters will be merged
+ """
+ assert x.ndim <= 3, "Whoops something went wrong with the input ({}), check if its <= 3 dims".format(
+ x.shape)
+ if x.ndim == 3:
+ apply_dim = 1
+ elif x.ndim < 3:
+ apply_dim = 0
+ # x is assumed to be 3d: (batch, time, dim)
+ # Assumed to be 2d : (time, dim)
+ # Assumed to be 1d : (time)
+ # time axis is therefore at 1 for 3d and 0 for 2d (
+ return np.apply_along_axis(lambda x: _double_threshold(
+ x, high_thres, low_thres, n_connect=n_connect),
+ axis=apply_dim,
+ arr=x)
+
+
+def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True): # in essence, double thresholding refines event boundaries around confident frames
+ """_double_threshold
+ Computes a double threshold over the input array
+
+ :param x: input array, needs to be 1d
+ :param high_thres: High threshold over the array
+ :param low_thres: Low threshold over the array
+ :param n_connect: Postprocessing, maximal distance between clusters to connect
+ :param return_arr: If return_arr = True (the default) an array of the same size as x filled with ones and zeros is returned, otherwise the filtered index pairs are returned.
+ """
+ assert x.ndim == 1, "Input needs to be 1d"
+ high_locations = np.where(x > high_thres)[0] # indices where the value exceeds high_thres
+ locations = x > low_thres # boolean mask of values above low_thres
+ encoded_pairs = find_contiguous_regions(locations)
+ # print('encoded_pairs ',encoded_pairs)
+ filtered_list = list(
+ filter(
+ lambda pair:
+ ((pair[0] <= high_locations) & (high_locations <= pair[1])).any(),
+ encoded_pairs)) # keep only the pairs that contain at least one high-threshold frame
+ #print('filtered_list ',filtered_list)
+ filtered_list = connect_(filtered_list, n_connect) # merge pairs whose gap is <= n_connect
+ if return_arr:
+ zero_one_arr = np.zeros_like(x, dtype=int)
+ for sl in filtered_list:
+ zero_one_arr[sl[0]:sl[1]] = 1
+ return zero_one_arr
+ return filtered_list
+
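+# Worked example (illustrative): with x = [0.1, 0.2, 0.3, 0.8, 0.4, 0.1, 0.3, 0.9, 0.4],
+# high_thres=0.7, low_thres=0.2 and n_connect=1, the low-threshold regions [2, 5) and
+# [6, 9) each contain a high-threshold frame and are merged (gap of 1 <= n_connect),
+# so _double_threshold returns array([0, 0, 1, 1, 1, 1, 1, 1, 1]).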
+
+def connect_clusters(x, n=1):
+ if x.ndim == 1:
+ return connect_clusters_(x, n)
+ if x.ndim >= 2:
+ return np.apply_along_axis(lambda a: connect_clusters_(a, n=n), -2, x)
+
+
+def connect_clusters_(x, n=1):
+ """connect_clusters_
+ Connects clustered predictions (0,1) in x with range n
+
+ :param x: Input array. zero-one format
+ :param n: Number of frames to skip until connection can be made
+ """
+ assert x.ndim == 1, "input needs to be 1d"
+ reg = find_contiguous_regions(x)
+ start_end = connect_(reg, n=n)
+ zero_one_arr = np.zeros_like(x, dtype=int)
+ for sl in start_end:
+ zero_one_arr[sl[0]:sl[1]] = 1
+ return zero_one_arr
+
+
+def connect_(pairs, n=1):
+ """connect_
+ Connects two adjacent clusters if their distance is <= n
+
+ :param pairs: Clusters of iterateables e.g., [(1,5),(7,10)]
+ :param n: distance between two clusters
+ """
+ if len(pairs) == 0:
+ return []
+ start_, end_ = pairs[0]
+ new_pairs = []
+ for i, (next_item, cur_item) in enumerate(zip(pairs[1:], pairs[0:])):
+ end_ = next_item[1]
+ if next_item[0] - cur_item[1] <= n:
+ pass
+ else:
+ new_pairs.append((start_, cur_item[1]))
+ start_ = next_item[0]
+ new_pairs.append((start_, end_))
+ return new_pairs
+
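+# Example (illustrative): connect_([(1, 5), (7, 10)], n=1) keeps the clusters apart
+# (gap of 2 frames), while connect_([(1, 5), (7, 10)], n=2) merges them into [(1, 10)].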
+
+def predictions_to_time(df, ratio):
+ df.onset = df.onset * ratio
+ df.offset = df.offset * ratio
+ return df
+
+def upgrade_resolution(arr, scale):
+ # print('arr ', arr.shape)
+ x = np.arange(0, arr.shape[0])
+ f = interp1d(x, arr, kind='linear', axis=0, fill_value='extrapolate')
+ scale_x = np.arange(0, arr.shape[0], 1 / scale)
+ up_scale = f(scale_x)
+ return up_scale
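+# Example (illustrative): upgrade_resolution(np.array([0.0, 1.0, 0.0]), scale=2)
+# -> array([0. , 0.5, 1. , 0.5, 0. , -0.5]), i.e. linear interpolation (and
+# extrapolation for the last point) onto a grid that is `scale` times denser.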
+# a = [0.1,0.2,0.3,0.8,0.4,0.1,0.3,0.9,0.4]
+# a = np.array(a)
+# b = a>0.2
+# _double_threshold(a,0.7,0.2)
\ No newline at end of file
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth
new file mode 100644
index 0000000000000000000000000000000000000000..30ee4a84d0ad9ada87a5ec32dc40ec789e559e82
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e4525ad12621117c3a0fcfe974fd55e51583cd219106bf510438f4bec4edc18
+size 140604911
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth
new file mode 100644
index 0000000000000000000000000000000000000000..23719b4c8deee6c6bcac7d7704f6ced56fa289e1
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1331dab1e4c3ac2bc5850156f2000a95fe333bdf06d08ce9b490550726548ab0
+size 2479
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3bae9021caa4dd01659303bc05d2227436e7a64d
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9b44e30c4800462c177806bbd7009953d70d531c873e3791ca9aa85375d524d
+size 343538489
diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth
new file mode 100644
index 0000000000000000000000000000000000000000..80e1bacdfbba7071092e562b4ddfb1d8fbee6e83
--- /dev/null
+++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de482358747778181e4dc530ec61ae94f53ae0b202ac92e99491fe4ceb3cbb1c
+size 255398
diff --git a/audio_to_text/__init__.py b/audio_to_text/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_to_text/__pycache__/__init__.cpython-38.pyc b/audio_to_text/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd50a50ca70fb2a85f608f8dddd11a6abb7b807d
Binary files /dev/null and b/audio_to_text/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc b/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbe230a79bcf0de51c959381e83483d5a9f322b8
Binary files /dev/null and b/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc differ
diff --git a/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dcdbfafa6487b60aeb8e60f7ad80da2cd1150308
--- /dev/null
+++ b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml
@@ -0,0 +1,23 @@
+model:
+ encoder:
+ type: Cnn14RnnEncoder
+ args:
+ sample_rate: 32000
+ pretrained: ./audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
+ freeze_cnn: True
+ freeze_cnn_bn: True
+ bidirectional: True
+ dropout: 0.5
+ hidden_size: 256
+ num_layers: 3
+ decoder:
+ type: TransformerDecoder
+ args:
+ attn_emb_dim: 512
+ dropout: 0.2
+ emb_dim: 256
+ fc_emb_dim: 512
+ nlayers: 2
+ type: TransformerModel
+ args: {}
+
diff --git a/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
new file mode 100644
index 0000000000000000000000000000000000000000..916026e45ca268db286047dacb1161a6a91a9613
--- /dev/null
+++ b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d22099e1025baae0f32ce09ec02c3d5fea001e295512fbf8754b5c66db21b0ec
+size 43027289
diff --git a/audio_to_text/captioning/__init__.py b/audio_to_text/captioning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6c2da4d396315c560620b86eb2737a07e067ee9
Binary files /dev/null and b/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__init__.py b/audio_to_text/captioning/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7259d671aaa8a7278b5aaa12069dc25caaad3cd8
--- /dev/null
+++ b/audio_to_text/captioning/models/__init__.py
@@ -0,0 +1,3 @@
+from .base_model import *
+from .transformer_model import *
+
diff --git a/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c08c874fac4d909a82f27f959e743a4aba5436a8
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c38109625aa375a8953c1adb9e8493ba1c592dcb
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8610e38e506ce60292444561bc0a7652bf2d718f
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9848a5dd6fd832d108179372880bed510ebc7da
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a00468fcad9293ac03d90700e32320a3fa9e474
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c9b1b8f984d37e0daed0fc541737be2f24a5e94
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06caef1f8b20de29821f255f2bf3263b5aa65211
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d383220b793d5a36727995d722ff8bbb7affbab
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..597cea7bac6d491e52c98c1f4e9f5f0ee9659e24
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab5c1da309ac502cbce9dffb00956d5c668b63b
Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/models/base_model.py b/audio_to_text/captioning/models/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd014e9b9e68fc80f44179ccbbe066791ecdd7c0
--- /dev/null
+++ b/audio_to_text/captioning/models/base_model.py
@@ -0,0 +1,500 @@
+# -*- coding: utf-8 -*-
+
+from typing import Dict
+
+import torch
+import torch.nn as nn
+
+from .utils import mean_with_lens, repeat_tensor
+
+
+class CaptionModel(nn.Module):
+ """
+ Encoder-decoder captioning model.
+ """
+
+ pad_idx = 0
+ start_idx = 1
+ end_idx = 2
+ max_length = 20
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ super().__init__()
+ self.encoder = encoder
+ self.decoder = decoder
+ self.vocab_size = decoder.vocab_size
+ self.train_forward_keys = ["cap", "cap_len", "ss_ratio"]
+ self.inference_forward_keys = ["sample_method", "max_length", "temp"]
+ freeze_encoder = kwargs.get("freeze_encoder", False)
+ if freeze_encoder:
+ for param in self.encoder.parameters():
+ param.requires_grad = False
+ self.check_decoder_compatibility()
+
+ def check_decoder_compatibility(self):
+ compatible_decoders = [x.__class__.__name__ for x in self.compatible_decoders]
+ assert isinstance(self.decoder, self.compatible_decoders), \
+ f"{self.decoder.__class__.__name__} is incompatible with " \
+ f"{self.__class__.__name__}, please use decoder in {compatible_decoders} "
+
+ @classmethod
+ def set_index(cls, start_idx, end_idx):
+ cls.start_idx = start_idx
+ cls.end_idx = end_idx
+
+ def forward(self, input_dict: Dict):
+ """
+ input_dict: {
+ (required)
+ mode: train/inference,
+ spec,
+ spec_len,
+ fc,
+ attn,
+ attn_len,
+ [sample_method: greedy],
+ [temp: 1.0] (in case of no teacher forcing)
+
+ (optional, mode=train)
+ cap,
+ cap_len,
+ ss_ratio,
+
+ (optional, mode=inference)
+ sample_method: greedy/beam,
+ max_length,
+ temp,
+ beam_size (optional, sample_method=beam),
+ n_best (optional, sample_method=beam),
+ }
+ """
+ # encoder_input_keys = ["spec", "spec_len", "fc", "attn", "attn_len"]
+ # encoder_input = { key: input_dict[key] for key in encoder_input_keys }
+ encoder_output_dict = self.encoder(input_dict)
+ if input_dict["mode"] == "train":
+ forward_dict = {
+ "mode": "train", "sample_method": "greedy", "temp": 1.0
+ }
+ for key in self.train_forward_keys:
+ forward_dict[key] = input_dict[key]
+ forward_dict.update(encoder_output_dict)
+ output = self.train_forward(forward_dict)
+ elif input_dict["mode"] == "inference":
+ forward_dict = {"mode": "inference"}
+ default_args = { "sample_method": "greedy", "max_length": self.max_length, "temp": 1.0 }
+ for key in self.inference_forward_keys:
+ if key in input_dict:
+ forward_dict[key] = input_dict[key]
+ else:
+ forward_dict[key] = default_args[key]
+
+ if forward_dict["sample_method"] == "beam":
+ forward_dict["beam_size"] = input_dict.get("beam_size", 3)
+ forward_dict["n_best"] = input_dict.get("n_best", False)
+ forward_dict["n_best_size"] = input_dict.get("n_best_size", forward_dict["beam_size"])
+ elif forward_dict["sample_method"] == "dbs":
+ forward_dict["beam_size"] = input_dict.get("beam_size", 6)
+ forward_dict["group_size"] = input_dict.get("group_size", 3)
+ forward_dict["diversity_lambda"] = input_dict.get("diversity_lambda", 0.5)
+ forward_dict["group_nbest"] = input_dict.get("group_nbest", True)
+
+ forward_dict.update(encoder_output_dict)
+ output = self.inference_forward(forward_dict)
+ else:
+ raise Exception("mode should be either 'train' or 'inference'")
+
+ return output
+
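+ # Usage sketch (illustrative; keys follow the docstring above, feature tensors
+ # are placeholders):
+ #   output = model({
+ #       "mode": "inference",
+ #       "spec": spec, "spec_len": spec_len,
+ #       "fc": fc_feat, "attn": attn_feat, "attn_len": attn_len,
+ #       "sample_method": "beam", "beam_size": 3,
+ #   })
+ #   output["seq"]  # [N, max_length] decoded word indices
+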
+ def prepare_output(self, input_dict):
+ output = {}
+ batch_size = input_dict["fc_emb"].size(0)
+ if input_dict["mode"] == "train":
+ max_length = input_dict["cap"].size(1) - 1
+ elif input_dict["mode"] == "inference":
+ max_length = input_dict["max_length"]
+ else:
+ raise Exception("mode should be either 'train' or 'inference'")
+ device = input_dict["fc_emb"].device
+ output["seq"] = torch.full((batch_size, max_length), self.end_idx,
+ dtype=torch.long)
+ output["logit"] = torch.empty(batch_size, max_length,
+ self.vocab_size).to(device)
+ output["sampled_logprob"] = torch.zeros(batch_size, max_length)
+ output["embed"] = torch.empty(batch_size, max_length,
+ self.decoder.d_model).to(device)
+ return output
+
+ def train_forward(self, input_dict):
+ if input_dict["ss_ratio"] != 1: # scheduled sampling training
+ input_dict["mode"] = "train"
+ return self.stepwise_forward(input_dict)
+ output = self.seq_forward(input_dict)
+ self.train_process(output, input_dict)
+ return output
+
+ def seq_forward(self, input_dict):
+ raise NotImplementedError
+
+ def train_process(self, output, input_dict):
+ pass
+
+ def inference_forward(self, input_dict):
+ if input_dict["sample_method"] == "beam":
+ return self.beam_search(input_dict)
+ elif input_dict["sample_method"] == "dbs":
+ return self.diverse_beam_search(input_dict)
+ return self.stepwise_forward(input_dict)
+
+ def stepwise_forward(self, input_dict):
+ """Step-by-step decoding"""
+ output = self.prepare_output(input_dict)
+ max_length = output["seq"].size(1)
+ # start sampling
+ for t in range(max_length):
+ input_dict["t"] = t
+ self.decode_step(input_dict, output)
+ if input_dict["mode"] == "inference": # decide whether to stop when sampling
+ unfinished_t = output["seq"][:, t] != self.end_idx
+ if t == 0:
+ unfinished = unfinished_t
+ else:
+ unfinished *= unfinished_t
+ output["seq"][:, t][~unfinished] = self.end_idx
+ if unfinished.sum() == 0:
+ break
+ self.stepwise_process(output)
+ return output
+
+ def decode_step(self, input_dict, output):
+ """Decoding operation of timestep t"""
+ decoder_input = self.prepare_decoder_input(input_dict, output)
+ # feed to the decoder to get logit
+ output_t = self.decoder(decoder_input)
+ logit_t = output_t["logit"]
+ # assert logit_t.ndim == 3
+ if logit_t.size(1) == 1:
+ logit_t = logit_t.squeeze(1)
+ embed_t = output_t["embed"].squeeze(1)
+ elif logit_t.size(1) > 1:
+ logit_t = logit_t[:, -1, :]
+ embed_t = output_t["embed"][:, -1, :]
+ else:
+ raise Exception("no logit output")
+ # sample the next input word and get the corresponding logit
+ sampled = self.sample_next_word(logit_t,
+ method=input_dict["sample_method"],
+ temp=input_dict["temp"])
+
+ output_t.update(sampled)
+ output_t["t"] = input_dict["t"]
+ output_t["logit"] = logit_t
+ output_t["embed"] = embed_t
+ self.stepwise_process_step(output, output_t)
+
+ def prepare_decoder_input(self, input_dict, output):
+ """Prepare the inp ut dict for the decoder"""
+ raise NotImplementedError
+
+ def stepwise_process_step(self, output, output_t):
+ """Postprocessing (save output values) after each timestep t"""
+ t = output_t["t"]
+ output["logit"][:, t, :] = output_t["logit"]
+ output["seq"][:, t] = output_t["word"]
+ output["sampled_logprob"][:, t] = output_t["probs"]
+ output["embed"][:, t, :] = output_t["embed"]
+
+ def stepwise_process(self, output):
+ """Postprocessing after the whole step-by-step autoregressive decoding"""
+ pass
+
+ def sample_next_word(self, logit, method, temp):
+ """Sample the next word, given probs output by the decoder"""
+ logprob = torch.log_softmax(logit, dim=1)
+ if method == "greedy":
+ sampled_logprob, word = torch.max(logprob.detach(), 1)
+ elif method == "gumbel":
+ def sample_gumbel(shape, eps=1e-20):
+ U = torch.rand(shape).to(logprob.device)
+ return -torch.log(-torch.log(U + eps) + eps)
+ def gumbel_softmax_sample(logit, temperature):
+ y = logit + sample_gumbel(logit.size())
+ return torch.log_softmax(y / temperature, dim=-1)
+ _logprob = gumbel_softmax_sample(logprob, temp)
+ _, word = torch.max(_logprob.data, 1)
+ sampled_logprob = logprob.gather(1, word.unsqueeze(-1))
+ else:
+ logprob = logprob / temp
+ if method.startswith("top"):
+ top_num = float(method[3:])
+ if 0 < top_num < 1: # top-p sampling
+ probs = torch.softmax(logit, dim=1)
+ sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=1)
+ _cumsum = sorted_probs.cumsum(1)
+ mask = _cumsum < top_num
+ mask = torch.cat([torch.ones_like(mask[:,:1]), mask[:,:-1]], 1)
+ sorted_probs = sorted_probs * mask.to(sorted_probs)
+ sorted_probs = sorted_probs / sorted_probs.sum(1, keepdim=True)
+ logprob.scatter_(1, sorted_indices, sorted_probs.log())
+ else: # top-k sampling
+ k = int(top_num)
+ tmp = torch.empty_like(logprob).fill_(float('-inf'))
+ topk, indices = torch.topk(logprob, k, dim=1)
+ tmp = tmp.scatter(1, indices, topk)
+ logprob = tmp
+ word = torch.distributions.Categorical(logits=logprob.detach()).sample()
+ sampled_logprob = logprob.gather(1, word.unsqueeze(-1)).squeeze(1)
+ word = word.detach().long()
+ # sampled_logprob: [N,], word: [N,]
+ return {"word": word, "probs": sampled_logprob}
+
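+ # Note (illustrative): sample_method="top0.9" triggers nucleus (top-p) sampling
+ # with p=0.9, while sample_method="top5" keeps only the 5 most likely words
+ # (top-k) before sampling from the renormalised distribution; "greedy" and
+ # "gumbel" are handled explicitly above.
+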
+ def beam_search(self, input_dict):
+ output = self.prepare_output(input_dict)
+ max_length = input_dict["max_length"]
+ beam_size = input_dict["beam_size"]
+ if input_dict["n_best"]:
+ n_best_size = input_dict["n_best_size"]
+ batch_size, max_length = output["seq"].size()
+ output["seq"] = torch.full((batch_size, n_best_size, max_length),
+ self.end_idx, dtype=torch.long)
+
+ temp = input_dict["temp"]
+ # instance by instance beam search
+ for i in range(output["seq"].size(0)):
+ output_i = self.prepare_beamsearch_output(input_dict)
+ input_dict["sample_idx"] = i
+ for t in range(max_length):
+ input_dict["t"] = t
+ output_t = self.beamsearch_step(input_dict, output_i)
+ #######################################
+ # merge with previous beam and select the current max prob beam
+ #######################################
+ logit_t = output_t["logit"]
+ if logit_t.size(1) == 1:
+ logit_t = logit_t.squeeze(1)
+ elif logit_t.size(1) > 1:
+ logit_t = logit_t[:, -1, :]
+ else:
+ raise Exception("no logit output")
+ logprob_t = torch.log_softmax(logit_t, dim=1)
+ logprob_t = torch.log_softmax(logprob_t / temp, dim=1)
+ logprob_t = output_i["topk_logprob"].unsqueeze(1) + logprob_t
+ if t == 0: # for the first step, all k seq will have the same probs
+ topk_logprob, topk_words = logprob_t[0].topk(
+ beam_size, 0, True, True)
+ else: # unroll and find top logprob, and their unrolled indices
+ topk_logprob, topk_words = logprob_t.view(-1).topk(
+ beam_size, 0, True, True)
+ topk_words = topk_words.cpu()
+ output_i["topk_logprob"] = topk_logprob
+ # output_i["prev_words_beam"] = topk_words // self.vocab_size # [beam_size,]
+ output_i["prev_words_beam"] = torch.div(topk_words, self.vocab_size,
+ rounding_mode='trunc')
+ output_i["next_word"] = topk_words % self.vocab_size # [beam_size,]
+ if t == 0:
+ output_i["seq"] = output_i["next_word"].unsqueeze(1)
+ else:
+ output_i["seq"] = torch.cat([
+ output_i["seq"][output_i["prev_words_beam"]],
+ output_i["next_word"].unsqueeze(1)], dim=1)
+
+ # add finished beams to results
+ is_end = output_i["next_word"] == self.end_idx
+ if t == max_length - 1:
+ is_end.fill_(1)
+
+ for beam_idx in range(beam_size):
+ if is_end[beam_idx]:
+ final_beam = {
+ "seq": output_i["seq"][beam_idx].clone(),
+ "score": output_i["topk_logprob"][beam_idx].item()
+ }
+ final_beam["score"] = final_beam["score"] / (t + 1)
+ output_i["done_beams"].append(final_beam)
+ output_i["topk_logprob"][is_end] -= 1000
+
+ self.beamsearch_process_step(output_i, output_t)
+
+ self.beamsearch_process(output, output_i, input_dict)
+ return output
+
+ def prepare_beamsearch_output(self, input_dict):
+ beam_size = input_dict["beam_size"]
+ device = input_dict["fc_emb"].device
+ output = {
+ "topk_logprob": torch.zeros(beam_size).to(device),
+ "seq": None,
+ "prev_words_beam": None,
+ "next_word": None,
+ "done_beams": [],
+ }
+ return output
+
+ def beamsearch_step(self, input_dict, output_i):
+ decoder_input = self.prepare_beamsearch_decoder_input(input_dict, output_i)
+ output_t = self.decoder(decoder_input)
+ output_t["t"] = input_dict["t"]
+ return output_t
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ raise NotImplementedError
+
+ def beamsearch_process_step(self, output_i, output_t):
+ pass
+
+ def beamsearch_process(self, output, output_i, input_dict):
+ i = input_dict["sample_idx"]
+ done_beams = sorted(output_i["done_beams"], key=lambda x: -x["score"])
+ if input_dict["n_best"]:
+ done_beams = done_beams[:input_dict["n_best_size"]]
+ for out_idx, done_beam in enumerate(done_beams):
+ seq = done_beam["seq"]
+ output["seq"][i][out_idx, :len(seq)] = seq
+ else:
+ seq = done_beams[0]["seq"]
+ output["seq"][i][:len(seq)] = seq
+
+ def diverse_beam_search(self, input_dict):
+
+ def add_diversity(seq_table, logprob, t, divm, diversity_lambda, bdash):
+ local_time = t - divm
+ unaug_logprob = logprob.clone()
+
+ if divm > 0:
+ change = torch.zeros(logprob.size(-1))
+ for prev_choice in range(divm):
+ prev_decisions = seq_table[prev_choice][..., local_time]
+ for prev_labels in range(bdash):
+ change.scatter_add_(0, prev_decisions[prev_labels], change.new_ones(1))
+
+ change = change.to(logprob.device)
+ logprob = logprob - repeat_tensor(change, bdash) * diversity_lambda
+
+ return logprob, unaug_logprob
+
+ output = self.prepare_output(input_dict)
+ group_size = input_dict["group_size"]
+ batch_size = output["seq"].size(0)
+ beam_size = input_dict["beam_size"]
+ bdash = beam_size // group_size
+ input_dict["bdash"] = bdash
+ diversity_lambda = input_dict["diversity_lambda"]
+ device = input_dict["fc_emb"].device
+ max_length = input_dict["max_length"]
+ temp = input_dict["temp"]
+ group_nbest = input_dict["group_nbest"]
+ batch_size, max_length = output["seq"].size()
+ if group_nbest:
+ output["seq"] = torch.full((batch_size, beam_size, max_length),
+ self.end_idx, dtype=torch.long)
+ else:
+ output["seq"] = torch.full((batch_size, group_size, max_length),
+ self.end_idx, dtype=torch.long)
+
+
+ for i in range(batch_size):
+ input_dict["sample_idx"] = i
+ seq_table = [torch.LongTensor(bdash, 0) for _ in range(group_size)] # group_size x [bdash, 0]
+ logprob_table = [torch.zeros(bdash).to(device) for _ in range(group_size)]
+ done_beams_table = [[] for _ in range(group_size)]
+
+ output_i = {
+ "prev_words_beam": [None for _ in range(group_size)],
+ "next_word": [None for _ in range(group_size)],
+ "state": [None for _ in range(group_size)]
+ }
+
+ for t in range(max_length + group_size - 1):
+ input_dict["t"] = t
+ for divm in range(group_size):
+ input_dict["divm"] = divm
+ if t >= divm and t <= max_length + divm - 1:
+ local_time = t - divm
+ decoder_input = self.prepare_dbs_decoder_input(input_dict, output_i)
+ output_t = self.decoder(decoder_input)
+ output_t["divm"] = divm
+ logit_t = output_t["logit"]
+ if logit_t.size(1) == 1:
+ logit_t = logit_t.squeeze(1)
+ elif logit_t.size(1) > 1:
+ logit_t = logit_t[:, -1, :]
+ else:
+ raise Exception("no logit output")
+ logprob_t = torch.log_softmax(logit_t, dim=1)
+ logprob_t = torch.log_softmax(logprob_t / temp, dim=1)
+ logprob_t, unaug_logprob_t = add_diversity(seq_table, logprob_t, t, divm, diversity_lambda, bdash)
+ logprob_t = logprob_table[divm].unsqueeze(-1) + logprob_t
+ if local_time == 0: # for the first step, all k seq will have the same probs
+ topk_logprob, topk_words = logprob_t[0].topk(
+ bdash, 0, True, True)
+ else: # unroll and find top logprob, and their unrolled indices
+ topk_logprob, topk_words = logprob_t.view(-1).topk(
+ bdash, 0, True, True)
+ topk_words = topk_words.cpu()
+ logprob_table[divm] = topk_logprob
+ output_i["prev_words_beam"][divm] = topk_words // self.vocab_size # [bdash,]
+ output_i["next_word"][divm] = topk_words % self.vocab_size # [bdash,]
+ if local_time > 0:
+ seq_table[divm] = seq_table[divm][output_i["prev_words_beam"][divm]]
+ seq_table[divm] = torch.cat([
+ seq_table[divm],
+ output_i["next_word"][divm].unsqueeze(-1)], -1)
+
+ is_end = seq_table[divm][:, t-divm] == self.end_idx
+ assert seq_table[divm].shape[-1] == t - divm + 1
+ if t == max_length + divm - 1:
+ is_end.fill_(1)
+ for beam_idx in range(bdash):
+ if is_end[beam_idx]:
+ final_beam = {
+ "seq": seq_table[divm][beam_idx].clone(),
+ "score": logprob_table[divm][beam_idx].item()
+ }
+ final_beam["score"] = final_beam["score"] / (t - divm + 1)
+ done_beams_table[divm].append(final_beam)
+ logprob_table[divm][is_end] -= 1000
+ self.dbs_process_step(output_i, output_t)
+ done_beams_table = [sorted(done_beams_table[divm], key=lambda x: -x["score"])[:bdash] for divm in range(group_size)]
+ if group_nbest:
+ done_beams = sum(done_beams_table, [])
+ else:
+ done_beams = [group_beam[0] for group_beam in done_beams_table]
+ for _, done_beam in enumerate(done_beams):
+ output["seq"][i, _, :len(done_beam["seq"])] = done_beam["seq"]
+
+ return output
+
+ def prepare_dbs_decoder_input(self, input_dict, output_i):
+ raise NotImplementedError
+
+ def dbs_process_step(self, output_i, output_t):
+ pass
+
+
+class CaptionSequenceModel(nn.Module):
+
+ def __init__(self, model, seq_output_size):
+ super().__init__()
+ self.model = model
+ if model.decoder.d_model != seq_output_size:
+ self.output_transform = nn.Linear(model.decoder.d_model, seq_output_size)
+ else:
+ self.output_transform = lambda x: x
+
+ def forward(self, input_dict):
+ output = self.model(input_dict)
+
+ if input_dict["mode"] == "train":
+ lens = input_dict["cap_len"] - 1
+ # seq_outputs: [N, d_model]
+ elif input_dict["mode"] == "inference":
+ if "sample_method" in input_dict and input_dict["sample_method"] == "beam":
+ return output
+ seq = output["seq"]
+ lens = torch.where(seq == self.model.end_idx, torch.zeros_like(seq), torch.ones_like(seq)).sum(dim=1)
+ else:
+ raise Exception("mode should be either 'train' or 'inference'")
+ seq_output = mean_with_lens(output["embed"], lens)
+ seq_output = self.output_transform(seq_output)
+ output["seq_output"] = seq_output
+ return output
+
diff --git a/audio_to_text/captioning/models/decoder.py b/audio_to_text/captioning/models/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..869eac11349f2321993e84be148aaa651892607f
--- /dev/null
+++ b/audio_to_text/captioning/models/decoder.py
@@ -0,0 +1,746 @@
+# -*- coding: utf-8 -*-
+
+import math
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from .utils import generate_length_mask, init, PositionalEncoding
+
+
+class BaseDecoder(nn.Module):
+ """
+ Take word/audio embeddings and output the next word probs
+ Base decoder, cannot be called directly
+ All decoders should inherit from this class
+ """
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim,
+ attn_emb_dim, dropout=0.2):
+ super().__init__()
+ self.emb_dim = emb_dim
+ self.vocab_size = vocab_size
+ self.fc_emb_dim = fc_emb_dim
+ self.attn_emb_dim = attn_emb_dim
+ self.word_embedding = nn.Embedding(vocab_size, emb_dim)
+ self.in_dropout = nn.Dropout(dropout)
+
+ def forward(self, x):
+ raise NotImplementedError
+
+ def load_word_embedding(self, weight, freeze=True):
+ embedding = np.load(weight)
+ assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
+ assert embedding.shape[1] == self.emb_dim, "embed size mismatch"
+
+ # embeddings = torch.as_tensor(embeddings).float()
+ # self.word_embeddings.weight = nn.Parameter(embeddings)
+ # for para in self.word_embeddings.parameters():
+ # para.requires_grad = tune
+ self.word_embedding = nn.Embedding.from_pretrained(embedding,
+ freeze=freeze)
+
+
+class RnnDecoder(BaseDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout,)
+ self.d_model = d_model
+ self.num_layers = kwargs.get('num_layers', 1)
+ self.bidirectional = kwargs.get('bidirectional', False)
+ self.rnn_type = kwargs.get('rnn_type', "GRU")
+ self.classifier = nn.Linear(
+ self.d_model * (self.bidirectional + 1), vocab_size)
+
+ def forward(self, x):
+ raise NotImplementedError
+
+ def init_hidden(self, bs, device):
+ num_dire = self.bidirectional + 1
+ n_layer = self.num_layers
+ hid_dim = self.d_model
+ if self.rnn_type == "LSTM":
+ return (torch.zeros(num_dire * n_layer, bs, hid_dim).to(device),
+ torch.zeros(num_dire * n_layer, bs, hid_dim).to(device))
+ else:
+ return torch.zeros(num_dire * n_layer, bs, hid_dim).to(device)
+
+
+class RnnFcDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, d_model, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, d_model, **kwargs)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 2,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None)
+ fc_emb = input_dict["fc_emb"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ p_fc_emb = self.fc_proj(fc_emb)
+ # embed: [N, T, embed_size]
+ embed = torch.cat((embed, p_fc_emb), dim=-1)
+
+ out, state = self.model(embed, state)
+ # out: [N, T, hs], states: [num_layers * num_dire, N, hs]
+ logits = self.classifier(out)
+ output = {
+ "state": state,
+ "embeds": out,
+ "logits": logits
+ }
+
+ return output
+
+
+class Seq2SeqAttention(nn.Module):
+
+ def __init__(self, hs_enc, hs_dec, attn_size):
+ """
+ Args:
+ hs_enc: encoder hidden size
+ hs_dec: decoder hidden size
+ attn_size: attention vector size
+ """
+ super(Seq2SeqAttention, self).__init__()
+ self.h2attn = nn.Linear(hs_enc + hs_dec, attn_size)
+ self.v = nn.Parameter(torch.randn(attn_size))
+ self.apply(init)
+
+ def forward(self, h_dec, h_enc, src_lens):
+ """
+ Args:
+ h_dec: decoder hidden (query), [N, hs_dec]
+ h_enc: encoder memory (key/value), [N, src_max_len, hs_enc]
+ src_lens: source (encoder memory) lengths, [N, ]
+ """
+ N = h_enc.size(0)
+ src_max_len = h_enc.size(1)
+ h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec]
+
+ attn_input = torch.cat((h_dec, h_enc), dim=-1)
+ attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size]
+
+ v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size]
+ score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len]
+
+ idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len)
+ mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device)
+
+ score = score.masked_fill(mask == 0, -1e10)
+ weights = torch.softmax(score, dim=-1) # [N, src_max_len]
+ ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc]
+
+ return ctx, weights
+
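+# Usage sketch (illustrative, sizes are hypothetical):
+#   attn = Seq2SeqAttention(hs_enc=512, hs_dec=256, attn_size=256)
+#   ctx, weights = attn(h_dec, h_enc, src_lens)
+#   # h_dec: [N, 256], h_enc: [N, T, 512] -> ctx: [N, 512], weights: [N, T]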
+
+class AttentionProj(nn.Module):
+
+ def __init__(self, hs_enc, hs_dec, embed_dim, attn_size):
+ super().__init__()
+ self.q_proj = nn.Linear(hs_dec, embed_dim)
+ self.kv_proj = nn.Linear(hs_enc, embed_dim)
+ self.h2attn = nn.Linear(embed_dim * 2, attn_size)
+ self.v = nn.Parameter(torch.randn(attn_size))
+ self.apply(init)
+
+ def init(self, m):
+ if isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+ def forward(self, h_dec, h_enc, src_lens):
+ """
+ Args:
+ h_dec: decoder hidden (query), [N, hs_dec]
+ h_enc: encoder memory (key/value), [N, src_max_len, hs_enc]
+ src_lens: source (encoder memory) lengths, [N, ]
+ """
+ h_enc = self.kv_proj(h_enc) # [N, src_max_len, embed_dim]
+ h_dec = self.q_proj(h_dec) # [N, embed_dim]
+ N = h_enc.size(0)
+ src_max_len = h_enc.size(1)
+ h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec]
+
+ attn_input = torch.cat((h_dec, h_enc), dim=-1)
+ attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size]
+
+ v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size]
+ score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len]
+
+ idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len)
+ mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device)
+
+ score = score.masked_fill(mask == 0, -1e10)
+ weights = torch.softmax(score, dim=-1) # [N, src_max_len]
+ ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc]
+
+ return ctx, weights
+
+
+class BahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_fc_emb = self.fc_proj(fc_emb)
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), p_fc_emb.unsqueeze(1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class BahAttnDecoder2(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ add fc, attn, word together to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
+ self.attn_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.apply(partial(init, method="xavier"))
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+ p_attn_emb = self.attn_proj(attn_emb)
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, p_attn_emb, attn_emb_len)
+
+ p_fc_emb = self.fc_proj(fc_emb)
+ rnn_input = embed + c.unsqueeze(1) + p_fc_emb.unsqueeze(1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class ConditionalBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.condition_embedding = nn.Embedding(2, emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ condition = input_dict["condition"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ condition = torch.as_tensor([[1 - c, c] for c in condition]).to(fc_emb.device)
+ condition_emb = torch.matmul(condition, self.condition_embedding.weight)
+ # condition_embs: [N, emb_dim]
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), condition_emb.unsqueeze(1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class StructBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, struct_vocab_size,
+ attn_emb_dim, dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.struct_embedding = nn.Embedding(struct_vocab_size, emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ structure = input_dict["structure"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ struct_emb = self.struct_embedding(structure)
+ # struct_embs: [N, emb_dim]
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), struct_emb.unsqueeze(1)), dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class StyleBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim * 3,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ style = input_dict["style"]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), style.unsqueeze(1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class BahAttnDecoder3(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim + attn_emb_dim,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = lambda x: x
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+
+ if word.size(-1) == self.fc_emb_dim: # fc_emb
+ embed = word.unsqueeze(1)
+ elif word.size(-1) == 1: # word
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+ else:
+ raise Exception(f"problem with word input size {word.size()}")
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1)), dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class SpecificityBahAttnDecoder(RnnDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs):
+ """
+ concatenate fc, attn, word to feed to the rnn
+ """
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, d_model, **kwargs)
+ attn_size = kwargs.get("attn_size", self.d_model)
+ self.model = getattr(nn, self.rnn_type)(
+ input_size=self.emb_dim + attn_emb_dim + 1,
+ hidden_size=self.d_model,
+ batch_first=True,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional)
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
+ self.d_model * (self.bidirectional + 1) * \
+ self.num_layers,
+ attn_size)
+ self.ctx_proj = lambda x: x
+ self.apply(init)
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
+ fc_emb = input_dict["fc_emb"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ condition = input_dict["condition"] # [N,]
+
+ word = word.to(fc_emb.device)
+ embed = self.in_dropout(self.word_embedding(word))
+
+ # embed: [N, 1, embed_size]
+ if state is None:
+ state = self.init_hidden(word.size(0), fc_emb.device)
+ if self.rnn_type == "LSTM":
+ query = state[0].transpose(0, 1).flatten(1)
+ else:
+ query = state.transpose(0, 1).flatten(1)
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
+
+ p_ctx = self.ctx_proj(c)
+ rnn_input = torch.cat(
+ (embed, p_ctx.unsqueeze(1), condition.reshape(-1, 1, 1)),
+ dim=-1)
+
+ out, state = self.model(rnn_input, state)
+
+ output = {
+ "state": state,
+ "embed": out,
+ "logit": self.classifier(out),
+ "attn_weight": attn_weight
+ }
+ return output
+
+
+class TransformerDecoder(BaseDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout=dropout,)
+ self.d_model = emb_dim
+ self.nhead = kwargs.get("nhead", self.d_model // 64)
+ self.nlayers = kwargs.get("nlayers", 2)
+ self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4)
+
+ self.pos_encoder = PositionalEncoding(self.d_model, dropout)
+ layer = nn.TransformerDecoderLayer(d_model=self.d_model,
+ nhead=self.nhead,
+ dim_feedforward=self.dim_feedforward,
+ dropout=dropout)
+ self.model = nn.TransformerDecoder(layer, self.nlayers)
+ self.classifier = nn.Linear(self.d_model, vocab_size)
+ self.attn_proj = nn.Sequential(
+ nn.Linear(self.attn_emb_dim, self.d_model),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.LayerNorm(self.d_model)
+ )
+ # self.attn_proj = lambda x: x
+ self.init_params()
+
+ def init_params(self):
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ def generate_square_subsequent_mask(self, max_length):
+ mask = (torch.triu(torch.ones(max_length, max_length)) == 1).transpose(0, 1)
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
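+        # e.g. for max_length = 3 the mask is
+        #   [[0., -inf, -inf],
+        #    [0.,   0., -inf],
+        #    [0.,   0.,   0.]]
+        # so position t may only attend to positions <= t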
+ return mask
+
+ def forward(self, input_dict):
+ word = input_dict["word"]
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ cap_padding_mask = input_dict["cap_padding_mask"]
+
+ p_attn_emb = self.attn_proj(attn_emb)
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
+ word = word.to(attn_emb.device)
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
+ embed = self.pos_encoder(embed)
+
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
+ tgt_key_padding_mask=cap_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ output = output.transpose(0, 1)
+ output = {
+ "embed": output,
+ "logit": self.classifier(output),
+ }
+ return output
+
+
+
+
+class EventTransformerDecoder(TransformerDecoder):
+
+ def forward(self, input_dict):
+ word = input_dict["word"] # index of word embeddings
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ cap_padding_mask = input_dict["cap_padding_mask"]
+ event_emb = input_dict["event"] # [N, emb_dim]
+
+ p_attn_emb = self.attn_proj(attn_emb)
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
+ word = word.to(attn_emb.device)
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
+
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
+ embed += event_emb
+ embed = self.pos_encoder(embed)
+
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
+ tgt_key_padding_mask=cap_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ output = output.transpose(0, 1)
+ output = {
+ "embed": output,
+ "logit": self.classifier(output),
+ }
+ return output
+
+
+class KeywordProbTransformerDecoder(TransformerDecoder):
+
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, keyword_classes_num, **kwargs):
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
+ dropout, **kwargs)
+ self.keyword_proj = nn.Linear(keyword_classes_num, self.d_model)
+ self.word_keyword_norm = nn.LayerNorm(self.d_model)
+
+ def forward(self, input_dict):
+ word = input_dict["word"] # index of word embeddings
+ attn_emb = input_dict["attn_emb"]
+ attn_emb_len = input_dict["attn_emb_len"]
+ cap_padding_mask = input_dict["cap_padding_mask"]
+ keyword = input_dict["keyword"] # [N, keyword_classes_num]
+
+ p_attn_emb = self.attn_proj(attn_emb)
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
+ word = word.to(attn_emb.device)
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
+
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
+ embed += self.keyword_proj(keyword)
+ embed = self.word_keyword_norm(embed)
+
+ embed = self.pos_encoder(embed)
+
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
+ tgt_key_padding_mask=cap_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ output = output.transpose(0, 1)
+ output = {
+ "embed": output,
+ "logit": self.classifier(output),
+ }
+ return output
diff --git a/audio_to_text/captioning/models/encoder.py b/audio_to_text/captioning/models/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d6d8e87e0ed07abc04f6e79b0fa08cd102398a0
--- /dev/null
+++ b/audio_to_text/captioning/models/encoder.py
@@ -0,0 +1,686 @@
+# -*- coding: utf-8 -*-
+
+import math
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchaudio import transforms
+from torchlibrosa.augmentation import SpecAugmentation
+
+from .utils import mean_with_lens, max_with_lens, \
+ init, pack_wrapper, generate_length_mask, PositionalEncoding
+
+
+def init_layer(layer):
+ """Initialize a Linear or Convolutional layer. """
+ nn.init.xavier_uniform_(layer.weight)
+
+ if hasattr(layer, 'bias'):
+ if layer.bias is not None:
+ layer.bias.data.fill_(0.)
+
+
+def init_bn(bn):
+ """Initialize a Batchnorm layer. """
+ bn.bias.data.fill_(0.)
+ bn.weight.data.fill_(1.)
+
+
+class BaseEncoder(nn.Module):
+
+ """
+ Encode the given audio into embedding
+ Base encoder class, cannot be called directly
+ All encoders should inherit from this class
+ """
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
+ super(BaseEncoder, self).__init__()
+ self.spec_dim = spec_dim
+ self.fc_feat_dim = fc_feat_dim
+ self.attn_feat_dim = attn_feat_dim
+
+
+ def forward(self, x):
+ #########################
+ # an encoder first encodes audio feature into embedding, obtaining
+ # `encoded`: {
+ # fc_embs: [N, fc_emb_dim],
+ # attn_embs: [N, attn_max_len, attn_emb_dim],
+ # attn_emb_lens: [N,]
+ # }
+ #########################
+ raise NotImplementedError
+
+
+class Block2D(nn.Module):
+
+ def __init__(self, cin, cout, kernel_size=3, padding=1):
+ super().__init__()
+ self.block = nn.Sequential(
+ nn.BatchNorm2d(cin),
+ nn.Conv2d(cin,
+ cout,
+ kernel_size=kernel_size,
+ padding=padding,
+ bias=False),
+ nn.LeakyReLU(inplace=True, negative_slope=0.1))
+
+ def forward(self, x):
+ return self.block(x)
+
+
+class LinearSoftPool(nn.Module):
+ """LinearSoftPool
+ Linear softmax, takes logits and returns a probability, near to the actual maximum value.
+ Taken from the paper:
+ A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
+ https://arxiv.org/abs/1810.09050
+ """
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, time_decision):
+ return (time_decision**2).sum(self.pooldim) / time_decision.sum(
+ self.pooldim)
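+# A minimal usage sketch (illustrative only): given frame-level probabilities
+# `frame_prob` of shape [batch, time, classes],
+#     pool = LinearSoftPool(pooldim=1)
+#     clip_prob = pool(None, frame_prob)  # `logits` is unused; output is [batch, classes]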
+
+
+class MeanPool(nn.Module):
+
+ def __init__(self, pooldim=1):
+ super().__init__()
+ self.pooldim = pooldim
+
+ def forward(self, logits, decision):
+ return torch.mean(decision, dim=self.pooldim)
+
+
+class AttentionPool(nn.Module):
+ """docstring for AttentionPool"""
+ def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
+ super().__init__()
+ self.inputdim = inputdim
+ self.outputdim = outputdim
+ self.pooldim = pooldim
+ self.transform = nn.Linear(inputdim, outputdim)
+ self.activ = nn.Softmax(dim=self.pooldim)
+ self.eps = 1e-7
+
+ def forward(self, logits, decision):
+ # Input is (B, T, D)
+ # B, T, D
+ w = self.activ(torch.clamp(self.transform(logits), -15, 15))
+ detect = (decision * w).sum(
+ self.pooldim) / (w.sum(self.pooldim) + self.eps)
+ # B, T, D
+ return detect
+
+
+class MMPool(nn.Module):
+
+ def __init__(self, dims):
+ super().__init__()
+ self.avgpool = nn.AvgPool2d(dims)
+ self.maxpool = nn.MaxPool2d(dims)
+
+ def forward(self, x):
+ return self.avgpool(x) + self.maxpool(x)
+
+
+def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
+ """parse_poolingfunction
+ A heler function to parse any temporal pooling
+ Pooling is done on dimension 1
+ :param poolingfunction_name:
+ :param **kwargs:
+ """
+ poolingfunction_name = poolingfunction_name.lower()
+ if poolingfunction_name == 'mean':
+ return MeanPool(pooldim=1)
+ elif poolingfunction_name == 'linear':
+ return LinearSoftPool(pooldim=1)
+    elif poolingfunction_name == 'attention':
+        return AttentionPool(inputdim=kwargs['inputdim'],
+                             outputdim=kwargs['outputdim'])
+    else:
+        raise ValueError(
+            f"pooling function {poolingfunction_name} not supported")
+
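+# e.g. parse_poolingfunction("attention", inputdim=256, outputdim=10) builds an
+# AttentionPool, while "mean" and "linear" ignore the extra keyword arguments
+# (illustrative values)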
+
+def embedding_pooling(x, lens, pooling="mean"):
+ if pooling == "max":
+ fc_embs = max_with_lens(x, lens)
+ elif pooling == "mean":
+ fc_embs = mean_with_lens(x, lens)
+ elif pooling == "mean+max":
+ x_mean = mean_with_lens(x, lens)
+ x_max = max_with_lens(x, lens)
+ fc_embs = x_mean + x_max
+ elif pooling == "last":
+ indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
+ # indices: [N, 1, hidden]
+ fc_embs = torch.gather(x, 1, indices).squeeze(1)
+ else:
+ raise Exception(f"pooling method {pooling} not support")
+ return fc_embs
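+# Illustrative example (assumed shapes): for x of shape [N, T, E] and valid
+# lengths lens of shape [N,],
+#     fc = embedding_pooling(torch.randn(4, 100, 256),
+#                            torch.tensor([100, 80, 60, 20]), "mean+max")
+# returns a clip-level embedding of shape [4, 256].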
+
+
+class Cdur5Encoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.pooling = pooling
+ self.features = nn.Sequential(
+ Block2D(1, 32),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(32, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (2, 4)),
+ Block2D(128, 128),
+ Block2D(128, 128),
+ nn.LPPool2d(4, (1, 4)),
+ nn.Dropout(0.3),
+ )
+ with torch.no_grad():
+ rnn_input_dim = self.features(
+ torch.randn(1, 1, 500, spec_dim)).shape
+ rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+ self.gru = nn.GRU(rnn_input_dim,
+ 128,
+ bidirectional=True,
+ batch_first=True)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["spec"]
+ lens = input_dict["spec_len"]
+ if "upsample" not in input_dict:
+ input_dict["upsample"] = False
+ lens = torch.as_tensor(copy.deepcopy(lens))
+ N, T, _ = x.shape
+ x = x.unsqueeze(1)
+ x = self.features(x)
+ x = x.transpose(1, 2).contiguous().flatten(-2)
+ x, _ = self.gru(x)
+ if input_dict["upsample"]:
+ x = nn.functional.interpolate(
+ x.transpose(1, 2),
+ T,
+ mode='linear',
+ align_corners=False).transpose(1, 2)
+ else:
+ lens //= 4
+ attn_emb = x
+ fc_emb = embedding_pooling(x, lens, self.pooling)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+def conv_conv_block(in_channel, out_channel):
+ return nn.Sequential(
+ nn.Conv2d(in_channel,
+ out_channel,
+ kernel_size=3,
+ bias=False,
+ padding=1),
+ nn.BatchNorm2d(out_channel),
+ nn.ReLU(True),
+ nn.Conv2d(out_channel,
+ out_channel,
+ kernel_size=3,
+ bias=False,
+ padding=1),
+ nn.BatchNorm2d(out_channel),
+ nn.ReLU(True)
+ )
+
+
+class Cdur8Encoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.pooling = pooling
+ self.features = nn.Sequential(
+ conv_conv_block(1, 64),
+ MMPool((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(64, 128),
+ MMPool((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(128, 256),
+ MMPool((1, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(256, 512),
+ MMPool((1, 2)),
+ nn.Dropout(0.2, True),
+ nn.AdaptiveAvgPool2d((None, 1)),
+ )
+ self.init_bn = nn.BatchNorm2d(spec_dim)
+ self.embedding = nn.Linear(512, 512)
+ self.gru = nn.GRU(512, 256, bidirectional=True, batch_first=True)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["spec"]
+ lens = input_dict["spec_len"]
+ lens = torch.as_tensor(copy.deepcopy(lens))
+ x = x.unsqueeze(1) # B x 1 x T x D
+ x = x.transpose(1, 3)
+ x = self.init_bn(x)
+ x = x.transpose(1, 3)
+ x = self.features(x)
+ x = x.transpose(1, 2).contiguous().flatten(-2)
+ x = F.dropout(x, p=0.5, training=self.training)
+ x = F.relu_(self.embedding(x))
+ x, _ = self.gru(x)
+ attn_emb = x
+ lens //= 4
+ fc_emb = embedding_pooling(x, lens, self.pooling)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+class Cnn10Encoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.features = nn.Sequential(
+ conv_conv_block(1, 64),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(64, 128),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(128, 256),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ conv_conv_block(256, 512),
+ nn.AvgPool2d((2, 2)),
+ nn.Dropout(0.2, True),
+ nn.AdaptiveAvgPool2d((None, 1)),
+ )
+ self.init_bn = nn.BatchNorm2d(spec_dim)
+ self.embedding = nn.Linear(512, 512)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["spec"]
+ lens = input_dict["spec_len"]
+ lens = torch.as_tensor(copy.deepcopy(lens))
+ x = x.unsqueeze(1) # [N, 1, T, D]
+ x = x.transpose(1, 3)
+ x = self.init_bn(x)
+ x = x.transpose(1, 3)
+ x = self.features(x) # [N, 512, T/16, 1]
+ x = x.transpose(1, 2).contiguous().flatten(-2) # [N, T/16, 512]
+ attn_emb = x
+ lens //= 16
+ fc_emb = embedding_pooling(x, lens, "mean+max")
+ fc_emb = F.dropout(fc_emb, p=0.5, training=self.training)
+ fc_emb = self.embedding(fc_emb)
+ fc_emb = F.relu_(fc_emb)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+class ConvBlock(nn.Module):
+ def __init__(self, in_channels, out_channels):
+
+ super(ConvBlock, self).__init__()
+
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.conv2 = nn.Conv2d(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3), stride=(1, 1),
+ padding=(1, 1), bias=False)
+
+ self.bn1 = nn.BatchNorm2d(out_channels)
+ self.bn2 = nn.BatchNorm2d(out_channels)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_layer(self.conv1)
+ init_layer(self.conv2)
+ init_bn(self.bn1)
+ init_bn(self.bn2)
+
+
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+
+ x = input
+ x = F.relu_(self.bn1(self.conv1(x)))
+ x = F.relu_(self.bn2(self.conv2(x)))
+ if pool_type == 'max':
+ x = F.max_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg':
+ x = F.avg_pool2d(x, kernel_size=pool_size)
+ elif pool_type == 'avg+max':
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
+ x = x1 + x2
+        else:
+            raise Exception(f"Incorrect pool_type argument: {pool_type}")
+
+ return x
+
+
+class Cnn14Encoder(nn.Module):
+ def __init__(self, sample_rate=32000):
+ super().__init__()
+ sr_to_fmax = {
+ 32000: 14000,
+ 16000: 8000
+ }
+ # Logmel spectrogram extractor
+ self.melspec_extractor = transforms.MelSpectrogram(
+ sample_rate=sample_rate,
+ n_fft=32 * sample_rate // 1000,
+ win_length=32 * sample_rate // 1000,
+ hop_length=10 * sample_rate // 1000,
+ f_min=50,
+ f_max=sr_to_fmax[sample_rate],
+ n_mels=64,
+ norm="slaney",
+ mel_scale="slaney"
+ )
+ self.hop_length = 10 * sample_rate // 1000
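+        # 32 ms window and 10 ms hop, e.g. n_fft = win_length = 1024 and
+        # hop_length = 320 at the default 32 kHz sample rate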
+ self.db_transform = transforms.AmplitudeToDB()
+ # Spec augmenter
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64,
+ time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2)
+
+ self.bn0 = nn.BatchNorm2d(64)
+
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+ self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+ self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+ self.downsample_ratio = 32
+
+ self.fc1 = nn.Linear(2048, 2048, bias=True)
+
+ self.init_weight()
+
+ def init_weight(self):
+ init_bn(self.bn0)
+ init_layer(self.fc1)
+
+ def load_pretrained(self, pretrained):
+ checkpoint = torch.load(pretrained, map_location="cpu")
+
+ if "model" in checkpoint:
+ state_keys = checkpoint["model"].keys()
+ backbone = False
+ for key in state_keys:
+ if key.startswith("backbone."):
+ backbone = True
+ break
+
+ if backbone: # COLA
+ state_dict = {}
+ for key, value in checkpoint["model"].items():
+ if key.startswith("backbone."):
+ model_key = key.replace("backbone.", "")
+ state_dict[model_key] = value
+ else: # PANNs
+ state_dict = checkpoint["model"]
+ elif "state_dict" in checkpoint: # CLAP
+ state_dict = checkpoint["state_dict"]
+ state_dict_keys = list(filter(
+ lambda x: "audio_encoder" in x, state_dict.keys()))
+ state_dict = {
+ key.replace('audio_encoder.', ''): state_dict[key]
+ for key in state_dict_keys
+ }
+ else:
+ raise Exception("Unkown checkpoint format")
+
+ model_dict = self.state_dict()
+ pretrained_dict = {
+ k: v for k, v in state_dict.items() if (k in model_dict) and (
+ model_dict[k].shape == v.shape)
+ }
+ model_dict.update(pretrained_dict)
+ self.load_state_dict(model_dict, strict=True)
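+    # Usage sketch (the path below is a placeholder): after `encoder = Cnn14Encoder()`,
+    # `encoder.load_pretrained("path/to/cnn14.pth")` copies only the weights whose
+    # names and shapes match this model and leaves everything else untouched.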
+
+ def forward(self, input_dict):
+ """
+ Input: (batch_size, n_samples)"""
+ waveform = input_dict["wav"]
+ wave_length = input_dict["wav_len"]
+ specaug = input_dict["specaug"]
+ x = self.melspec_extractor(waveform)
+ x = self.db_transform(x) # (batch_size, mel_bins, time_steps)
+ x = x.transpose(1, 2)
+ x = x.unsqueeze(1) # (batch_size, 1, time_steps, mel_bins)
+
+ # SpecAugment
+ if self.training and specaug:
+ x = self.spec_augmenter(x)
+
+ x = x.transpose(1, 3)
+ x = self.bn0(x)
+ x = x.transpose(1, 3)
+
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
+ x = F.dropout(x, p=0.2, training=self.training)
+ x = torch.mean(x, dim=3)
+ attn_emb = x.transpose(1, 2)
+
+ wave_length = torch.as_tensor(wave_length)
+ feat_length = torch.div(wave_length, self.hop_length,
+ rounding_mode="floor") + 1
+ feat_length = torch.div(feat_length, self.downsample_ratio,
+ rounding_mode="floor")
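+        # e.g. a 10 s clip at 32 kHz (320000 samples, hop_length 320) yields
+        # 320000 // 320 + 1 = 1001 frames and 1001 // 32 = 31 valid attention steps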
+ x_max = max_with_lens(attn_emb, feat_length)
+ x_mean = mean_with_lens(attn_emb, feat_length)
+ x = x_max + x_mean
+ x = F.dropout(x, p=0.5, training=self.training)
+ x = F.relu_(self.fc1(x))
+ fc_emb = F.dropout(x, p=0.5, training=self.training)
+
+ output_dict = {
+ 'fc_emb': fc_emb,
+ 'attn_emb': attn_emb,
+ 'attn_emb_len': feat_length
+ }
+
+ return output_dict
+
+
+class RnnEncoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim,
+ pooling="mean", **kwargs):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.pooling = pooling
+ self.hidden_size = kwargs.get('hidden_size', 512)
+ self.bidirectional = kwargs.get('bidirectional', False)
+ self.num_layers = kwargs.get('num_layers', 1)
+ self.dropout = kwargs.get('dropout', 0.2)
+ self.rnn_type = kwargs.get('rnn_type', "GRU")
+ self.in_bn = kwargs.get('in_bn', False)
+ self.embed_dim = self.hidden_size * (self.bidirectional + 1)
+ self.network = getattr(nn, self.rnn_type)(
+ attn_feat_dim,
+ self.hidden_size,
+ num_layers=self.num_layers,
+ bidirectional=self.bidirectional,
+ dropout=self.dropout,
+ batch_first=True)
+ if self.in_bn:
+ self.bn = nn.BatchNorm1d(self.embed_dim)
+ self.apply(init)
+
+ def forward(self, input_dict):
+ x = input_dict["attn"]
+ lens = input_dict["attn_len"]
+ lens = torch.as_tensor(lens)
+ # x: [N, T, E]
+ if self.in_bn:
+ x = pack_wrapper(self.bn, x, lens)
+ out = pack_wrapper(self.network, x, lens)
+ # out: [N, T, hidden]
+ attn_emb = out
+ fc_emb = embedding_pooling(out, lens, self.pooling)
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": lens
+ }
+
+
+class Cnn14RnnEncoder(nn.Module):
+ def __init__(self, sample_rate=32000, pretrained=None,
+ freeze_cnn=False, freeze_cnn_bn=False,
+ pooling="mean", **kwargs):
+ super().__init__()
+ self.cnn = Cnn14Encoder(sample_rate)
+ self.rnn = RnnEncoder(64, 2048, 2048, pooling, **kwargs)
+ if pretrained is not None:
+ self.cnn.load_pretrained(pretrained)
+ if freeze_cnn:
+ assert pretrained is not None, "cnn is not pretrained but frozen"
+ for param in self.cnn.parameters():
+ param.requires_grad = False
+ self.freeze_cnn_bn = freeze_cnn_bn
+
+ def train(self, mode):
+ super().train(mode=mode)
+ if self.freeze_cnn_bn:
+ def bn_eval(module):
+ class_name = module.__class__.__name__
+ if class_name.find("BatchNorm") != -1:
+ module.eval()
+ self.cnn.apply(bn_eval)
+ return self
+
+ def forward(self, input_dict):
+ output_dict = self.cnn(input_dict)
+ output_dict["attn"] = output_dict["attn_emb"]
+ output_dict["attn_len"] = output_dict["attn_emb_len"]
+ del output_dict["attn_emb"], output_dict["attn_emb_len"]
+ output_dict = self.rnn(output_dict)
+ return output_dict
+
+
+class TransformerEncoder(BaseEncoder):
+
+ def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, d_model, **kwargs):
+ super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
+ self.d_model = d_model
+ dropout = kwargs.get("dropout", 0.2)
+ self.nhead = kwargs.get("nhead", self.d_model // 64)
+ self.nlayers = kwargs.get("nlayers", 2)
+ self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4)
+
+ self.attn_proj = nn.Sequential(
+ nn.Linear(attn_feat_dim, self.d_model),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.LayerNorm(self.d_model)
+ )
+ layer = nn.TransformerEncoderLayer(d_model=self.d_model,
+ nhead=self.nhead,
+ dim_feedforward=self.dim_feedforward,
+ dropout=dropout)
+ self.model = nn.TransformerEncoder(layer, self.nlayers)
+ self.cls_token = nn.Parameter(torch.zeros(d_model))
+ self.init_params()
+
+ def init_params(self):
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ def forward(self, input_dict):
+ attn_feat = input_dict["attn"]
+ attn_feat_len = input_dict["attn_len"]
+ attn_feat_len = torch.as_tensor(attn_feat_len)
+
+ attn_feat = self.attn_proj(attn_feat) # [bs, T, d_model]
+
+ cls_emb = self.cls_token.reshape(1, 1, self.d_model).repeat(
+ attn_feat.size(0), 1, 1)
+ attn_feat = torch.cat((cls_emb, attn_feat), dim=1)
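+        # a learnable [CLS]-style token is prepended to every sequence; its output
+        # state is later used as the clip-level embedding (fc_emb)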
+ attn_feat = attn_feat.transpose(0, 1)
+
+ attn_feat_len += 1
+ src_key_padding_mask = ~generate_length_mask(
+ attn_feat_len, attn_feat.size(0)).to(attn_feat.device)
+ output = self.model(attn_feat, src_key_padding_mask=src_key_padding_mask)
+
+ attn_emb = output.transpose(0, 1)
+ fc_emb = attn_emb[:, 0]
+ return {
+ "attn_emb": attn_emb,
+ "fc_emb": fc_emb,
+ "attn_emb_len": attn_feat_len
+ }
+
+
+class Cnn14TransformerEncoder(nn.Module):
+ def __init__(self, sample_rate=32000, pretrained=None,
+ freeze_cnn=False, freeze_cnn_bn=False,
+                 d_model=512, **kwargs):
+ super().__init__()
+ self.cnn = Cnn14Encoder(sample_rate)
+ self.trm = TransformerEncoder(64, 2048, 2048, d_model, **kwargs)
+ if pretrained is not None:
+ self.cnn.load_pretrained(pretrained)
+ if freeze_cnn:
+ assert pretrained is not None, "cnn is not pretrained but frozen"
+ for param in self.cnn.parameters():
+ param.requires_grad = False
+ self.freeze_cnn_bn = freeze_cnn_bn
+
+ def train(self, mode):
+ super().train(mode=mode)
+ if self.freeze_cnn_bn:
+ def bn_eval(module):
+ class_name = module.__class__.__name__
+ if class_name.find("BatchNorm") != -1:
+ module.eval()
+ self.cnn.apply(bn_eval)
+ return self
+
+ def forward(self, input_dict):
+ output_dict = self.cnn(input_dict)
+ output_dict["attn"] = output_dict["attn_emb"]
+ output_dict["attn_len"] = output_dict["attn_emb_len"]
+ del output_dict["attn_emb"], output_dict["attn_emb_len"]
+ output_dict = self.trm(output_dict)
+ return output_dict
+
+
+
+
+
diff --git a/audio_to_text/captioning/models/transformer_model.py b/audio_to_text/captioning/models/transformer_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..76c97f171955f04b10c16fd1f1a205ce7343a0ac
--- /dev/null
+++ b/audio_to_text/captioning/models/transformer_model.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+import random
+import torch
+import torch.nn as nn
+
+from .base_model import CaptionModel
+from .utils import repeat_tensor
+import audio_to_text.captioning.models.decoder
+import audio_to_text.captioning.models.encoder
+
+
+class TransformerModel(CaptionModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+ audio_to_text.captioning.models.decoder.TransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+
+ def seq_forward(self, input_dict):
+ cap = input_dict["cap"]
+ cap_padding_mask = (cap == self.pad_idx).to(cap.device)
+ cap_padding_mask = cap_padding_mask[:, :-1]
+ output = self.decoder(
+ {
+ "word": cap[:, :-1],
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_len": input_dict["attn_emb_len"],
+ "cap_padding_mask": cap_padding_mask
+ }
+ )
+ return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = {
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_len": input_dict["attn_emb_len"]
+ }
+ t = input_dict["t"]
+
+ ###############
+ # determine input word
+ ################
+ if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling
+ word = input_dict["cap"][:, :t+1]
+ else:
+ start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output["seq"][:, :t]), dim=-1)
+ # word: [N, T]
+ decoder_input["word"] = word
+
+ cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device)
+ decoder_input["cap_padding_mask"] = cap_padding_mask
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = {}
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ ###############
+ # prepare attn embeds
+ ################
+ if t == 0:
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size)
+ attn_emb_len = repeat_tensor(input_dict["attn_emb_len"][i], beam_size)
+ output_i["attn_emb"] = attn_emb
+ output_i["attn_emb_len"] = attn_emb_len
+ decoder_input["attn_emb"] = output_i["attn_emb"]
+ decoder_input["attn_emb_len"] = output_i["attn_emb_len"]
+ ###############
+ # determine input word
+ ################
+ start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output_i["seq"]), dim=-1)
+ decoder_input["word"] = word
+ cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device)
+ decoder_input["cap_padding_mask"] = cap_padding_mask
+
+ return decoder_input
+
+
+class M2TransformerModel(CaptionModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+                audio_to_text.captioning.models.decoder.M2TransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+ self.check_encoder_compatibility()
+
+ def check_encoder_compatibility(self):
+        assert isinstance(self.encoder, audio_to_text.captioning.models.encoder.M2TransformerEncoder), \
+            f"only M2TransformerEncoder is compatible with {self.__class__.__name__}"
+
+
+ def seq_forward(self, input_dict):
+ cap = input_dict["cap"]
+ output = self.decoder(
+ {
+ "word": cap[:, :-1],
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_mask": input_dict["attn_emb_mask"],
+ }
+ )
+ return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = {
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_mask": input_dict["attn_emb_mask"]
+ }
+ t = input_dict["t"]
+
+ ###############
+ # determine input word
+ ################
+ if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling
+ word = input_dict["cap"][:, :t+1]
+ else:
+ start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output["seq"][:, :t]), dim=-1)
+ # word: [N, T]
+ decoder_input["word"] = word
+
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = {}
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ ###############
+ # prepare attn embeds
+ ################
+ if t == 0:
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size)
+ attn_emb_mask = repeat_tensor(input_dict["attn_emb_mask"][i], beam_size)
+ output_i["attn_emb"] = attn_emb
+ output_i["attn_emb_mask"] = attn_emb_mask
+ decoder_input["attn_emb"] = output_i["attn_emb"]
+ decoder_input["attn_emb_mask"] = output_i["attn_emb_mask"]
+ ###############
+ # determine input word
+ ################
+ start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long()
+ if t == 0:
+ word = start_word
+ else:
+ word = torch.cat((start_word, output_i["seq"]), dim=-1)
+ decoder_input["word"] = word
+
+ return decoder_input
+
+
+class EventEncoder(nn.Module):
+ """
+ Encode the Label information in AudioCaps and AudioSet
+ """
+ def __init__(self, emb_dim, vocab_size=527):
+ super(EventEncoder, self).__init__()
+ self.label_embedding = nn.Parameter(
+ torch.randn((vocab_size, emb_dim)), requires_grad=True)
+
+ def forward(self, word_idxs):
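+        # word_idxs: multi-hot (or probability) event labels of shape [N, vocab_size];
+        # normalize them into weights and take the weighted average of the label
+        # embeddings, giving one embedding of shape [N, emb_dim]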
+        weights = word_idxs / word_idxs.sum(dim=1, keepdim=True)
+        embeddings = weights @ self.label_embedding
+ return embeddings
+
+
+class EventCondTransformerModel(TransformerModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+                audio_to_text.captioning.models.decoder.EventTransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+ self.label_encoder = EventEncoder(decoder.emb_dim, 527)
+ self.train_forward_keys += ["events"]
+ self.inference_forward_keys += ["events"]
+
+ # def seq_forward(self, input_dict):
+ # cap = input_dict["cap"]
+ # cap_padding_mask = (cap == self.pad_idx).to(cap.device)
+ # cap_padding_mask = cap_padding_mask[:, :-1]
+ # output = self.decoder(
+ # {
+ # "word": cap[:, :-1],
+ # "attn_emb": input_dict["attn_emb"],
+ # "attn_emb_len": input_dict["attn_emb_len"],
+ # "cap_padding_mask": cap_padding_mask
+ # }
+ # )
+ # return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = super().prepare_decoder_input(input_dict, output)
+ decoder_input["events"] = self.label_encoder(input_dict["events"])
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i)
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ if t == 0:
+ output_i["events"] = repeat_tensor(self.label_encoder(input_dict["events"])[i], beam_size)
+ decoder_input["events"] = output_i["events"]
+ return decoder_input
+
+
+class KeywordCondTransformerModel(TransformerModel):
+
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
+ if not hasattr(self, "compatible_decoders"):
+ self.compatible_decoders = (
+                audio_to_text.captioning.models.decoder.KeywordProbTransformerDecoder,
+ )
+ super().__init__(encoder, decoder, **kwargs)
+ self.train_forward_keys += ["keyword"]
+ self.inference_forward_keys += ["keyword"]
+
+ def seq_forward(self, input_dict):
+ cap = input_dict["cap"]
+ cap_padding_mask = (cap == self.pad_idx).to(cap.device)
+ cap_padding_mask = cap_padding_mask[:, :-1]
+ keyword = input_dict["keyword"]
+ output = self.decoder(
+ {
+ "word": cap[:, :-1],
+ "attn_emb": input_dict["attn_emb"],
+ "attn_emb_len": input_dict["attn_emb_len"],
+ "keyword": keyword,
+ "cap_padding_mask": cap_padding_mask
+ }
+ )
+ return output
+
+ def prepare_decoder_input(self, input_dict, output):
+ decoder_input = super().prepare_decoder_input(input_dict, output)
+ decoder_input["keyword"] = input_dict["keyword"]
+ return decoder_input
+
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
+ decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i)
+ t = input_dict["t"]
+ i = input_dict["sample_idx"]
+ beam_size = input_dict["beam_size"]
+ if t == 0:
+ output_i["keyword"] = repeat_tensor(input_dict["keyword"][i],
+ beam_size)
+ decoder_input["keyword"] = output_i["keyword"]
+ return decoder_input
+
diff --git a/audio_to_text/captioning/models/utils.py b/audio_to_text/captioning/models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3623cf43619a7a4ff5fa31f2b056378697b04d61
--- /dev/null
+++ b/audio_to_text/captioning/models/utils.py
@@ -0,0 +1,132 @@
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
+
+
+def sort_pack_padded_sequence(input, lengths):
+ sorted_lengths, indices = torch.sort(lengths, descending=True)
+ tmp = pack_padded_sequence(input[indices], sorted_lengths.cpu(), batch_first=True)
+ inv_ix = indices.clone()
+ inv_ix[indices] = torch.arange(0,len(indices)).type_as(inv_ix)
+ return tmp, inv_ix
+
+def pad_unsort_packed_sequence(input, inv_ix):
+ tmp, _ = pad_packed_sequence(input, batch_first=True)
+ tmp = tmp[inv_ix]
+ return tmp
+
+def pack_wrapper(module, attn_feats, attn_feat_lens):
+ packed, inv_ix = sort_pack_padded_sequence(attn_feats, attn_feat_lens)
+ if isinstance(module, torch.nn.RNNBase):
+ return pad_unsort_packed_sequence(module(packed)[0], inv_ix)
+ else:
+ return pad_unsort_packed_sequence(PackedSequence(module(packed[0]), packed[1]), inv_ix)
+
+def generate_length_mask(lens, max_length=None):
+ lens = torch.as_tensor(lens)
+ N = lens.size(0)
+ if max_length is None:
+ max_length = max(lens)
+ idxs = torch.arange(max_length).repeat(N).view(N, max_length)
+ idxs = idxs.to(lens.device)
+ mask = (idxs < lens.view(-1, 1))
+ return mask
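+# e.g. generate_length_mask(torch.tensor([3, 1]), 4) returns
+#   tensor([[ True,  True,  True, False],
+#           [ True, False, False, False]])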
+
+def mean_with_lens(features, lens):
+ """
+ features: [N, T, ...] (assume the second dimension represents length)
+ lens: [N,]
+ """
+ lens = torch.as_tensor(lens)
+ if max(lens) != features.size(1):
+ max_length = features.size(1)
+ mask = generate_length_mask(lens, max_length)
+ else:
+ mask = generate_length_mask(lens)
+ mask = mask.to(features.device) # [N, T]
+
+ while mask.ndim < features.ndim:
+ mask = mask.unsqueeze(-1)
+ feature_mean = features * mask
+ feature_mean = feature_mean.sum(1)
+ while lens.ndim < feature_mean.ndim:
+ lens = lens.unsqueeze(1)
+ feature_mean = feature_mean / lens.to(features.device)
+ # feature_mean = features * mask.unsqueeze(-1)
+ # feature_mean = feature_mean.sum(1) / lens.unsqueeze(1).to(features.device)
+ return feature_mean
+
+def max_with_lens(features, lens):
+ """
+ features: [N, T, ...] (assume the second dimension represents length)
+ lens: [N,]
+ """
+ lens = torch.as_tensor(lens)
+ mask = generate_length_mask(lens).to(features.device) # [N, T]
+
+ feature_max = features.clone()
+ feature_max[~mask] = float("-inf")
+ feature_max, _ = feature_max.max(1)
+ return feature_max
+
+def repeat_tensor(x, n):
+ return x.unsqueeze(0).repeat(n, *([1] * len(x.shape)))
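+# e.g. repeat_tensor(torch.zeros(5, 7), 3).shape == torch.Size([3, 5, 7])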
+
+def init(m, method="kaiming"):
+ if isinstance(m, (nn.Conv2d, nn.Conv1d)):
+ if method == "kaiming":
+ nn.init.kaiming_uniform_(m.weight)
+ elif method == "xavier":
+ nn.init.xavier_uniform_(m.weight)
+ else:
+ raise Exception(f"initialization method {method} not supported")
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
+ nn.init.constant_(m.weight, 1)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ if method == "kaiming":
+ nn.init.kaiming_uniform_(m.weight)
+ elif method == "xavier":
+ nn.init.xavier_uniform_(m.weight)
+ else:
+ raise Exception(f"initialization method {method} not supported")
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Embedding):
+ if method == "kaiming":
+ nn.init.kaiming_uniform_(m.weight)
+ elif method == "xavier":
+ nn.init.xavier_uniform_(m.weight)
+ else:
+ raise Exception(f"initialization method {method} not supported")
+
+
+
+
+class PositionalEncoding(nn.Module):
+
+ def __init__(self, d_model, dropout=0.1, max_len=100):
+ super(PositionalEncoding, self).__init__()
+ self.dropout = nn.Dropout(p=dropout)
+
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * \
+ (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0).transpose(0, 1)
+ # self.register_buffer("pe", pe)
+ self.register_parameter("pe", nn.Parameter(pe, requires_grad=False))
+
+ def forward(self, x):
+ # x: [T, N, E]
+ x = x + self.pe[:x.size(0), :]
+ return self.dropout(x)
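+# A minimal usage sketch (illustrative): PositionalEncoding(d_model=256) expects
+# inputs of shape [T, N, 256] with T <= max_len (100 by default), e.g.
+#     pos_enc = PositionalEncoding(256)
+#     y = pos_enc(torch.zeros(50, 8, 256))  # adds pe[:50] and applies dropout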
diff --git a/audio_to_text/captioning/utils/README.md b/audio_to_text/captioning/utils/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c6fd17d778a9f9dbe7bf632c92e40e36e67b91d2
--- /dev/null
+++ b/audio_to_text/captioning/utils/README.md
@@ -0,0 +1,19 @@
+# Utils
+
+Scripts in this directory are used as utility functions.
+
+## BERT Pretrained Embeddings
+
+You can use pretrained word embeddings from Google [BERT](https://github.com/google-research/bert#pre-trained-models) instead of training word embeddings from scratch. The scripts in `utils/bert` require a BERT server running in the background; we use the BERT server from [bert-as-service](https://github.com/hanxiao/bert-as-service).
+
+To use bert-as-service, you first need to install the repository. It is recommended to create a separate environment with TensorFlow 1.x to run the BERT server, since bert-as-service is incompatible with TensorFlow 2.x.
+
+After installing [bert-as-service](https://github.com/hanxiao/bert-as-service), download and start the BERT server by executing:
+
+```bash
+bash scripts/prepare_bert_server.sh zh
+```
+
+By default, a server based on the BERT-Base Chinese model runs in the background. You can switch to other models by changing the corresponding model name and path in `scripts/prepare_bert_server.sh`.
+
+To extract BERT word embeddings, you need to execute `utils/bert/create_word_embedding.py`.
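+
+A typical invocation (the arguments are placeholders for your own vocabulary file, output embedding path and BERT server host) might look like:
+
+```bash
+python utils/bert/create_word_embedding.py <vocab_file> <word_embedding.npy> <server_hostname>
+```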
diff --git a/audio_to_text/captioning/utils/__init__.py b/audio_to_text/captioning/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be638756ffa795f33059276b99c2f8c05661cbdf
Binary files /dev/null and b/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc b/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4270c25cf751d703e233146358c7345c39e55ceb
Binary files /dev/null and b/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc differ
diff --git a/audio_to_text/captioning/utils/bert/create_sent_embedding.py b/audio_to_text/captioning/utils/bert/create_sent_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..b517a32429ca74bae668291dcb03d34296027440
--- /dev/null
+++ b/audio_to_text/captioning/utils/bert/create_sent_embedding.py
@@ -0,0 +1,89 @@
+import pickle
+import fire
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+
+class EmbeddingExtractor(object):
+
+ def extract_sentbert(self, caption_file: str, output: str, dev: bool=True, zh: bool=False):
+ from sentence_transformers import SentenceTransformer
+ lang2model = {
+ "zh": "distiluse-base-multilingual-cased",
+ "en": "bert-base-nli-mean-tokens"
+ }
+ lang = "zh" if zh else "en"
+ model = SentenceTransformer(lang2model[lang])
+
+ self.extract(caption_file, model, output, dev)
+
+ def extract_originbert(self, caption_file: str, output: str, dev: bool=True, ip="localhost"):
+ from bert_serving.client import BertClient
+ client = BertClient(ip)
+
+ self.extract(caption_file, client, output, dev)
+
+ def extract(self, caption_file: str, model, output, dev: bool):
+ caption_df = pd.read_json(caption_file, dtype={"key": str})
+ embeddings = {}
+
+ if dev:
+ with tqdm(total=caption_df.shape[0], ascii=True) as pbar:
+ for idx, row in caption_df.iterrows():
+ caption = row["caption"]
+ key = row["key"]
+ cap_idx = row["caption_index"]
+ embedding = model.encode([caption])
+ embedding = np.array(embedding).reshape(-1)
+ embeddings[f"{key}_{cap_idx}"] = embedding
+ pbar.update()
+
+ else:
+ dump = {}
+
+ with tqdm(total=caption_df.shape[0], ascii=True) as pbar:
+ for idx, row in caption_df.iterrows():
+ key = row["key"]
+ caption = row["caption"]
+ value = np.array(model.encode([caption])).reshape(-1)
+
+ if key not in embeddings.keys():
+ embeddings[key] = [value]
+ else:
+ embeddings[key].append(value)
+
+ pbar.update()
+
+ for key in embeddings:
+ dump[key] = np.stack(embeddings[key])
+
+ embeddings = dump
+
+ with open(output, "wb") as f:
+ pickle.dump(embeddings, f)
+
+ def extract_sbert(self,
+ input_json: str,
+ output: str):
+ from sentence_transformers import SentenceTransformer
+ import json
+ import torch
+ from h5py import File
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+ model = model.to(device)
+ model.eval()
+
+ data = json.load(open(input_json))["audios"]
+ with torch.no_grad(), tqdm(total=len(data), ascii=True) as pbar, File(output, "w") as store:
+ for sample in data:
+ audio_id = sample["audio_id"]
+ for cap in sample["captions"]:
+ cap_id = cap["cap_id"]
+ store[f"{audio_id}_{cap_id}"] = model.encode(cap["caption"])
+ pbar.update()
+
+
+if __name__ == "__main__":
+ fire.Fire(EmbeddingExtractor)
diff --git a/audio_to_text/captioning/utils/bert/create_word_embedding.py b/audio_to_text/captioning/utils/bert/create_word_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..43c980e69057dc251ddbb7ae6a19684807cc6699
--- /dev/null
+++ b/audio_to_text/captioning/utils/bert/create_word_embedding.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+
+from bert_serving.client import BertClient
+import numpy as np
+from tqdm import tqdm
+import fire
+import torch
+
+sys.path.append(os.getcwd())
+from utils.build_vocab import Vocabulary
+
+def main(vocab_file: str, output: str, server_hostname: str):
+ client = BertClient(ip=server_hostname)
+ vocabulary = torch.load(vocab_file)
+ vocab_size = len(vocabulary)
+
+ fake_embedding = client.encode(["test"]).reshape(-1)
+ embed_size = fake_embedding.shape[0]
+
+ print("Encoding words into embeddings with size: ", embed_size)
+
+ embeddings = np.empty((vocab_size, embed_size))
+ for i in tqdm(range(len(embeddings)), ascii=True):
+ embeddings[i] = client.encode([vocabulary.idx2word[i]])
+ np.save(output, embeddings)
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
+
+
diff --git a/audio_to_text/captioning/utils/build_vocab.py b/audio_to_text/captioning/utils/build_vocab.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9fab23bc2c48203e541d356dc172e1fdee8f113
--- /dev/null
+++ b/audio_to_text/captioning/utils/build_vocab.py
@@ -0,0 +1,153 @@
+import json
+from tqdm import tqdm
+import logging
+import pickle
+from collections import Counter
+import re
+import fire
+
+
+class Vocabulary(object):
+ """Simple vocabulary wrapper."""
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = {}
+ self.idx = 0
+
+ def add_word(self, word):
+ if not word in self.word2idx:
+ self.word2idx[word] = self.idx
+ self.idx2word[self.idx] = word
+ self.idx += 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx["<unk>"]
+        return self.word2idx[word]
+
+ def __getitem__(self, word_id):
+ return self.idx2word[word_id]
+
+ def __len__(self):
+ return len(self.word2idx)
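+# Illustrative usage:
+#   vocab = Vocabulary()
+#   vocab.add_word("dog")
+#   vocab("dog")  # -> 0
+#   vocab[0]      # -> "dog"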
+
+
+def build_vocab(input_json: str,
+ threshold: int,
+ keep_punctuation: bool,
+ host_address: str,
+ character_level: bool = False,
+ zh: bool = True ):
+ """Build vocabulary from csv file with a given threshold to drop all counts < threshold
+
+ Args:
+ input_json(string): Preprossessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+ threshold (int): Threshold to drop all words with counts < threshold
+ keep_punctuation (bool): Includes or excludes punctuation.
+
+ Returns:
+ vocab (Vocab): Object with the processed vocabulary
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+ counter = Counter()
+ pretokenized = "tokens" in data[0]["captions"][0]
+
+ if zh:
+ from nltk.parse.corenlp import CoreNLPParser
+ from zhon.hanzi import punctuation
+ if not pretokenized:
+ parser = CoreNLPParser(host_address)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ if pretokenized:
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ else:
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ # Remove all punctuations
+ if not keep_punctuation:
+ caption = re.sub("[{}]".format(punctuation), "", caption)
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens = list(parser.tokenize(caption))
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ counter.update(tokens)
+ else:
+ if pretokenized:
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ counter.update(tokens)
+ else:
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+ captions = {}
+ for audio_idx in range(len(data)):
+ audio_id = data[audio_idx]["audio_id"]
+ captions[audio_id] = []
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ captions[audio_id].append({
+ "audio_id": audio_id,
+ "id": cap_idx,
+ "caption": caption
+ })
+ tokenizer = PTBTokenizer()
+ captions = tokenizer.tokenize(captions)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ audio_id = data[audio_idx]["audio_id"]
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = captions[audio_id][cap_idx]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+ counter.update(tokens.split(" "))
+
+ if not pretokenized:
+ json.dump({ "audios": data }, open(input_json, "w"), indent=4, ensure_ascii=not zh)
+ words = [word for word, cnt in counter.items() if cnt >= threshold]
+
+    # Create a vocab wrapper and add some special tokens
+    # (token names assume the usual <pad>/<start>/<end>/<unk> convention).
+    vocab = Vocabulary()
+    vocab.add_word("<pad>")
+    vocab.add_word("<start>")
+    vocab.add_word("<end>")
+    vocab.add_word("<unk>")
+
+ # Add the words to the vocabulary.
+ for word in words:
+ vocab.add_word(word)
+ return vocab
+
+
+def process(input_json: str,
+ output_file: str,
+ threshold: int = 1,
+ keep_punctuation: bool = False,
+ character_level: bool = False,
+ host_address: str = "http://localhost:9000",
+ zh: bool = False):
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ logging.info("Build Vocab")
+ vocabulary = build_vocab(
+ input_json=input_json, threshold=threshold, keep_punctuation=keep_punctuation,
+ host_address=host_address, character_level=character_level, zh=zh)
+ pickle.dump(vocabulary, open(output_file, "wb"))
+ logging.info("Total vocabulary size: {}".format(len(vocabulary)))
+ logging.info("Saved vocab to '{}'".format(output_file))
+
+
+if __name__ == '__main__':
+ fire.Fire(process)
diff --git a/audio_to_text/captioning/utils/build_vocab_ltp.py b/audio_to_text/captioning/utils/build_vocab_ltp.py
new file mode 100644
index 0000000000000000000000000000000000000000..aae0c718ae546882dcb573be42ace3408394468f
--- /dev/null
+++ b/audio_to_text/captioning/utils/build_vocab_ltp.py
@@ -0,0 +1,150 @@
+import json
+from tqdm import tqdm
+import logging
+import pickle
+from collections import Counter
+import re
+import fire
+
+class Vocabulary(object):
+ """Simple vocabulary wrapper."""
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = {}
+ self.idx = 0
+
+ def add_word(self, word):
+ if not word in self.word2idx:
+ self.word2idx[word] = self.idx
+ self.idx2word[self.idx] = word
+ self.idx += 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx["<unk>"]
+        return self.word2idx[word]
+
+ def __len__(self):
+ return len(self.word2idx)
+
+def build_vocab(input_json: str,
+ output_json: str,
+ threshold: int,
+ keep_punctuation: bool,
+ character_level: bool = False,
+ zh: bool = True ):
+ """Build vocabulary from csv file with a given threshold to drop all counts < threshold
+
+ Args:
+ input_json(string): Preprossessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+ threshold (int): Threshold to drop all words with counts < threshold
+ keep_punctuation (bool): Includes or excludes punctuation.
+
+ Returns:
+ vocab (Vocab): Object with the processed vocabulary
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+ counter = Counter()
+ pretokenized = "tokens" in data[0]["captions"][0]
+
+ if zh:
+ from ltp import LTP
+ from zhon.hanzi import punctuation
+ if not pretokenized:
+ parser = LTP("base")
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ if pretokenized:
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ else:
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens, _ = parser.seg([caption])
+ tokens = tokens[0]
+ # Remove all punctuations
+ if not keep_punctuation:
+ tokens = [token for token in tokens if token not in punctuation]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ counter.update(tokens)
+ else:
+ if pretokenized:
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ counter.update(tokens)
+ else:
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+ captions = {}
+ for audio_idx in range(len(data)):
+ audio_id = data[audio_idx]["audio_id"]
+ captions[audio_id] = []
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ captions[audio_id].append({
+ "audio_id": audio_id,
+ "id": cap_idx,
+ "caption": caption
+ })
+ tokenizer = PTBTokenizer()
+ captions = tokenizer.tokenize(captions)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ audio_id = data[audio_idx]["audio_id"]
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = captions[audio_id][cap_idx]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+ counter.update(tokens.split(" "))
+
+ if not pretokenized:
+ if output_json is None:
+ output_json = input_json
+ json.dump({ "audios": data }, open(output_json, "w"), indent=4, ensure_ascii=not zh)
+ words = [word for word, cnt in counter.items() if cnt >= threshold]
+
+    # Create a vocab wrapper and add some special tokens
+    # (token names assume the usual <pad>/<start>/<end>/<unk> convention).
+    vocab = Vocabulary()
+    vocab.add_word("<pad>")
+    vocab.add_word("<start>")
+    vocab.add_word("<end>")
+    vocab.add_word("<unk>")
+
+ # Add the words to the vocabulary.
+ for word in words:
+ vocab.add_word(word)
+ return vocab
+
+def process(input_json: str,
+ output_file: str,
+ output_json: str = None,
+ threshold: int = 1,
+ keep_punctuation: bool = False,
+ character_level: bool = False,
+ zh: bool = True):
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ logging.info("Build Vocab")
+ vocabulary = build_vocab(
+ input_json=input_json, output_json=output_json, threshold=threshold,
+ keep_punctuation=keep_punctuation, character_level=character_level, zh=zh)
+ pickle.dump(vocabulary, open(output_file, "wb"))
+ logging.info("Total vocabulary size: {}".format(len(vocabulary)))
+ logging.info("Saved vocab to '{}'".format(output_file))
+
+
+if __name__ == '__main__':
+ fire.Fire(process)
diff --git a/audio_to_text/captioning/utils/build_vocab_spacy.py b/audio_to_text/captioning/utils/build_vocab_spacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..84da679f79d9f36b288d7312fb4ad9dc04723b0d
--- /dev/null
+++ b/audio_to_text/captioning/utils/build_vocab_spacy.py
@@ -0,0 +1,152 @@
+import json
+from tqdm import tqdm
+import logging
+import pickle
+from collections import Counter
+import re
+import fire
+
+class Vocabulary(object):
+ """Simple vocabulary wrapper."""
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = {}
+ self.idx = 0
+
+ def add_word(self, word):
+ if not word in self.word2idx:
+ self.word2idx[word] = self.idx
+ self.idx2word[self.idx] = word
+ self.idx += 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx["<unk>"]
+        return self.word2idx[word]
+
+ def __len__(self):
+ return len(self.word2idx)
+
+
+def build_vocab(input_json: str,
+ output_json: str,
+ threshold: int,
+ keep_punctuation: bool,
+ host_address: str,
+ character_level: bool = False,
+ retokenize: bool = True,
+ zh: bool = True ):
+ """Build vocabulary from csv file with a given threshold to drop all counts < threshold
+
+ Args:
+ input_json(string): Preprossessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+ threshold (int): Threshold to drop all words with counts < threshold
+ keep_punctuation (bool): Includes or excludes punctuation.
+
+ Returns:
+ vocab (Vocab): Object with the processed vocabulary
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+ counter = Counter()
+ if retokenize:
+ pretokenized = False
+ else:
+ pretokenized = "tokens" in data[0]["captions"][0]
+
+ if zh:
+ from nltk.parse.corenlp import CoreNLPParser
+ from zhon.hanzi import punctuation
+ if not pretokenized:
+ parser = CoreNLPParser(host_address)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ if pretokenized:
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ else:
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ # Remove all punctuations
+ if not keep_punctuation:
+ caption = re.sub("[{}]".format(punctuation), "", caption)
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens = list(parser.tokenize(caption))
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ counter.update(tokens)
+ else:
+ if pretokenized:
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split()
+ counter.update(tokens)
+ else:
+ import spacy
+ tokenizer = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ captions = data[audio_idx]["captions"]
+ for cap_idx in range(len(captions)):
+ caption = captions[cap_idx]["caption"]
+ doc = tokenizer(caption)
+ tokens = " ".join([str(token).lower() for token in doc])
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+ counter.update(tokens.split(" "))
+
+ if not pretokenized:
+ if output_json is None:
+ json.dump({ "audios": data }, open(input_json, "w"),
+ indent=4, ensure_ascii=not zh)
+ else:
+ json.dump({ "audios": data }, open(output_json, "w"),
+ indent=4, ensure_ascii=not zh)
+
+ words = [word for word, cnt in counter.items() if cnt >= threshold]
+
+    # Create a vocab wrapper and add some special tokens
+    # (token names assume the usual <pad>/<start>/<end>/<unk> convention).
+    vocab = Vocabulary()
+    vocab.add_word("<pad>")
+    vocab.add_word("<start>")
+    vocab.add_word("<end>")
+    vocab.add_word("<unk>")
+
+ # Add the words to the vocabulary.
+ for word in words:
+ vocab.add_word(word)
+ return vocab
+
+def process(input_json: str,
+ output_file: str,
+ output_json: str = None,
+ threshold: int = 1,
+ keep_punctuation: bool = False,
+ character_level: bool = False,
+ retokenize: bool = False,
+ host_address: str = "http://localhost:9000",
+ zh: bool = True):
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ logging.info("Build Vocab")
+ vocabulary = build_vocab(
+ input_json=input_json, output_json=output_json, threshold=threshold,
+ keep_punctuation=keep_punctuation, host_address=host_address,
+ character_level=character_level, retokenize=retokenize, zh=zh)
+ pickle.dump(vocabulary, open(output_file, "wb"))
+ logging.info("Total vocabulary size: {}".format(len(vocabulary)))
+ logging.info("Saved vocab to '{}'".format(output_file))
+
+
+if __name__ == '__main__':
+ fire.Fire(process)
diff --git a/audio_to_text/captioning/utils/eval_round_robin.py b/audio_to_text/captioning/utils/eval_round_robin.py
new file mode 100644
index 0000000000000000000000000000000000000000..28603a56fe3e6603cca7da5d70c0f71b1421c7c5
--- /dev/null
+++ b/audio_to_text/captioning/utils/eval_round_robin.py
@@ -0,0 +1,182 @@
+import copy
+import json
+
+import numpy as np
+import fire
+
+
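+# Round-robin evaluation of the human annotations themselves: each caption of a
+# clip takes one turn as the "prediction" and is scored against the remaining
+# captions of the same clip; the per-round scores are averaged. This gives a
+# human reference level for the automatic metrics.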
+def evaluate_annotation(key2refs, scorer):
+ if scorer.method() == "Bleu":
+ scores = np.array([ 0.0 for n in range(4) ])
+ else:
+ scores = 0
+ num_cap_per_audio = len(next(iter(key2refs.values())))
+
+ for i in range(num_cap_per_audio):
+ if i > 0:
+ for key in key2refs:
+ key2refs[key].insert(0, res[key][0])
+ res = { key: [refs.pop(),] for key, refs in key2refs.items() }
+ score, _ = scorer.compute_score(key2refs, res)
+
+ if scorer.method() == "Bleu":
+ scores += np.array(score)
+ else:
+ scores += score
+
+ score = scores / num_cap_per_audio
+ return score
+
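+# Leave-one-out evaluation of system predictions: in round i the i-th reference
+# of every clip is held out and the prediction is scored against the remaining
+# references; scores are averaged over rounds so they are computed with the same
+# number of references as the round-robin annotation scores above.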
+def evaluate_prediction(key2pred, key2refs, scorer):
+ if scorer.method() == "Bleu":
+ scores = np.array([ 0.0 for n in range(4) ])
+ else:
+ scores = 0
+ num_cap_per_audio = len(next(iter(key2refs.values())))
+
+ for i in range(num_cap_per_audio):
+ key2refs_i = {}
+ for key, refs in key2refs.items():
+ key2refs_i[key] = refs[:i] + refs[i+1:]
+ score, _ = scorer.compute_score(key2refs_i, key2pred)
+
+ if scorer.method() == "Bleu":
+ scores += np.array(score)
+ else:
+ scores += score
+
+ score = scores / num_cap_per_audio
+ return score
+
+
+class Evaluator(object):
+
+ def eval_annotation(self, annotation, output):
+ captions = json.load(open(annotation, "r"))["audios"]
+
+ key2refs = {}
+ for audio_idx in range(len(captions)):
+ audio_id = captions[audio_idx]["audio_id"]
+ key2refs[audio_id] = []
+ for caption in captions[audio_idx]["captions"]:
+ key2refs[audio_id].append(caption["caption"])
+
+ from fense.fense import Fense
+ scores = {}
+ scorer = Fense()
+ scores[scorer.method()] = evaluate_annotation(copy.deepcopy(key2refs), scorer)
+
+ refs4eval = {}
+ for key, refs in key2refs.items():
+ refs4eval[key] = []
+ for idx, ref in enumerate(refs):
+ refs4eval[key].append({
+ "audio_id": key,
+ "id": idx,
+ "caption": ref
+ })
+
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+
+ tokenizer = PTBTokenizer()
+ key2refs = tokenizer.tokenize(refs4eval)
+
+
+ from pycocoevalcap.bleu.bleu import Bleu
+ from pycocoevalcap.cider.cider import Cider
+ from pycocoevalcap.rouge.rouge import Rouge
+ from pycocoevalcap.meteor.meteor import Meteor
+ from pycocoevalcap.spice.spice import Spice
+
+
+ scorers = [Bleu(), Rouge(), Cider(), Meteor(), Spice()]
+ for scorer in scorers:
+ scores[scorer.method()] = evaluate_annotation(copy.deepcopy(key2refs), scorer)
+
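+        # SPIDEr is the average of CIDEr and SPICE.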
+ spider = 0
+ with open(output, "w") as f:
+ for name, score in scores.items():
+ if name == "Bleu":
+ for n in range(4):
+ f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))
+ else:
+ f.write("{}: {:6.3f}\n".format(name, score))
+ if name in ["CIDEr", "SPICE"]:
+ spider += score
+ f.write("SPIDEr: {:6.3f}\n".format(spider / 2))
+
+ def eval_prediction(self, prediction, annotation, output):
+ ref_captions = json.load(open(annotation, "r"))["audios"]
+
+ key2refs = {}
+ for audio_idx in range(len(ref_captions)):
+ audio_id = ref_captions[audio_idx]["audio_id"]
+ key2refs[audio_id] = []
+ for caption in ref_captions[audio_idx]["captions"]:
+ key2refs[audio_id].append(caption["caption"])
+
+ pred_captions = json.load(open(prediction, "r"))["predictions"]
+
+ key2pred = {}
+ for audio_idx in range(len(pred_captions)):
+ item = pred_captions[audio_idx]
+ audio_id = item["filename"]
+ key2pred[audio_id] = [item["tokens"]]
+
+ from fense.fense import Fense
+ scores = {}
+ scorer = Fense()
+ scores[scorer.method()] = evaluate_prediction(key2pred, key2refs, scorer)
+
+ refs4eval = {}
+ for key, refs in key2refs.items():
+ refs4eval[key] = []
+ for idx, ref in enumerate(refs):
+ refs4eval[key].append({
+ "audio_id": key,
+ "id": idx,
+ "caption": ref
+ })
+
+ preds4eval = {}
+ for key, preds in key2pred.items():
+ preds4eval[key] = []
+ for idx, pred in enumerate(preds):
+ preds4eval[key].append({
+ "audio_id": key,
+ "id": idx,
+ "caption": pred
+ })
+
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+
+ tokenizer = PTBTokenizer()
+ key2refs = tokenizer.tokenize(refs4eval)
+ key2pred = tokenizer.tokenize(preds4eval)
+
+
+ from pycocoevalcap.bleu.bleu import Bleu
+ from pycocoevalcap.cider.cider import Cider
+ from pycocoevalcap.rouge.rouge import Rouge
+ from pycocoevalcap.meteor.meteor import Meteor
+ from pycocoevalcap.spice.spice import Spice
+
+ scorers = [Bleu(), Rouge(), Cider(), Meteor(), Spice()]
+ for scorer in scorers:
+ scores[scorer.method()] = evaluate_prediction(key2pred, key2refs, scorer)
+
+ spider = 0
+ with open(output, "w") as f:
+ for name, score in scores.items():
+ if name == "Bleu":
+ for n in range(4):
+ f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))
+ else:
+ f.write("{}: {:6.3f}\n".format(name, score))
+ if name in ["CIDEr", "SPICE"]:
+ spider += score
+ f.write("SPIDEr: {:6.3f}\n".format(spider / 2))
+
+
+if __name__ == "__main__":
+ fire.Fire(Evaluator)
diff --git a/audio_to_text/captioning/utils/fasttext/create_word_embedding.py b/audio_to_text/captioning/utils/fasttext/create_word_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..09da13a62a3462e730c8275320a6391536ff42c4
--- /dev/null
+++ b/audio_to_text/captioning/utils/fasttext/create_word_embedding.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+#!/usr/bin/env python3
+
+import numpy as np
+import pandas as pd
+import torch
+from gensim.models import FastText
+from tqdm import tqdm
+import fire
+
+import sys
+import os
+sys.path.append(os.getcwd())
+from utils.build_vocab import Vocabulary
+
+def create_embedding(caption_file: str,
+ vocab_file: str,
+ embed_size: int,
+ output: str,
+ **fasttext_kwargs):
+ caption_df = pd.read_json(caption_file)
+    caption_df["tokens"] = caption_df["tokens"].apply(lambda x: ["<start>"] + [token for token in x] + ["<end>"])
+
+ sentences = list(caption_df["tokens"].values)
+ vocabulary = torch.load(vocab_file, map_location="cpu")
+
+    # pop "epochs" so it is not passed to the FastText constructor a second time
+    epochs = fasttext_kwargs.pop("epochs", 10)
+ model = FastText(size=embed_size, min_count=1, **fasttext_kwargs)
+ model.build_vocab(sentences=sentences)
+ model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
+
+ word_embeddings = np.zeros((len(vocabulary), embed_size))
+
+ with tqdm(total=len(vocabulary), ascii=True) as pbar:
+ for word, idx in vocabulary.word2idx.items():
+            if word == "<start>" or word == "<end>":  # special tokens have no embedding
+ continue
+ word_embeddings[idx] = model.wv[word]
+ pbar.update()
+
+ np.save(output, word_embeddings)
+
+    print("Finished writing fasttext embeddings to " + output)
+
+
+if __name__ == "__main__":
+ fire.Fire(create_embedding)
+
+
+
diff --git a/audio_to_text/captioning/utils/lr_scheduler.py b/audio_to_text/captioning/utils/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46e3f0397634bcf48a6a61ab041a7ea07577eb3
--- /dev/null
+++ b/audio_to_text/captioning/utils/lr_scheduler.py
@@ -0,0 +1,128 @@
+import math
+import torch
+
+
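+# Linear warmup for `warmup_iters` steps, then exponential decay such that the
+# learning rate reaches `final_lrs` at `total_iters`:
+#     lr(t) = base_lr * (final_lr / base_lr) ** ((t - warmup_iters) / (total_iters - warmup_iters))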
+class ExponentialDecayScheduler(torch.optim.lr_scheduler._LRScheduler):
+
+ def __init__(self, optimizer, total_iters, final_lrs,
+ warmup_iters=3000, last_epoch=-1, verbose=False):
+ self.total_iters = total_iters
+ self.final_lrs = final_lrs
+ if not isinstance(self.final_lrs, list) and not isinstance(
+ self.final_lrs, tuple):
+ self.final_lrs = [self.final_lrs] * len(optimizer.param_groups)
+ self.warmup_iters = warmup_iters
+ self.bases = [0.0,] * len(optimizer.param_groups)
+ super().__init__(optimizer, last_epoch, verbose)
+ for i, (base_lr, final_lr) in enumerate(zip(self.base_lrs, self.final_lrs)):
+ base = (final_lr / base_lr) ** (1 / (
+ self.total_iters - self.warmup_iters))
+ self.bases[i] = base
+
+ def _get_closed_form_lr(self):
+ warmup_coeff = 1.0
+ current_iter = self._step_count
+ if current_iter < self.warmup_iters:
+ warmup_coeff = current_iter / self.warmup_iters
+ current_lrs = []
+ # if not self.linear_warmup:
+ # for base_lr, final_lr, base in zip(self.base_lrs, self.final_lrs, self.bases):
+ # # current_lr = warmup_coeff * base_lr * math.exp(((current_iter - self.warmup_iters) / self.total_iters) * math.log(final_lr / base_lr))
+ # current_lr = warmup_coeff * base_lr * (base ** (current_iter - self.warmup_iters))
+ # current_lrs.append(current_lr)
+ # else:
+ for base_lr, final_lr, base in zip(self.base_lrs, self.final_lrs,
+ self.bases):
+ if current_iter <= self.warmup_iters:
+ current_lr = warmup_coeff * base_lr
+ else:
+ # current_lr = warmup_coeff * base_lr * math.exp(((current_iter - self.warmup_iters) / self.total_iters) * math.log(final_lr / base_lr))
+ current_lr = base_lr * (base ** (current_iter - self.warmup_iters))
+ current_lrs.append(current_lr)
+ return current_lrs
+
+ def get_lr(self):
+ return self._get_closed_form_lr()
+
+
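+# "Noam" schedule from "Attention Is All You Need":
+#     lr(t) = factor * model_size ** -0.5 * min(t ** -0.5, t * warmup_iters ** -1.5)
+# i.e. linear warmup followed by inverse-square-root decay.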
+class NoamScheduler(torch.optim.lr_scheduler._LRScheduler):
+
+ def __init__(self, optimizer, model_size=512, factor=1, warmup_iters=3000,
+ last_epoch=-1, verbose=False):
+ self.model_size = model_size
+ self.warmup_iters = warmup_iters
+ # self.factors = [group["lr"] / (self.model_size ** (-0.5) * self.warmup_iters ** (-0.5)) for group in optimizer.param_groups]
+ self.factor = factor
+ super().__init__(optimizer, last_epoch, verbose)
+
+ def _get_closed_form_lr(self):
+ current_iter = self._step_count
+ current_lrs = []
+ for _ in self.base_lrs:
+ current_lr = self.factor * \
+ (self.model_size ** (-0.5) * min(current_iter ** (-0.5),
+ current_iter * self.warmup_iters ** (-1.5)))
+ current_lrs.append(current_lr)
+ return current_lrs
+
+ def get_lr(self):
+ return self._get_closed_form_lr()
+
+
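+# Linear warmup followed by a cosine decay towards zero over the remaining
+# (total_iters - warmup_iters) steps; with num_cycles=0.5 the multiplier follows
+# a single half-cosine from 1 down to 0.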
+class CosineWithWarmup(torch.optim.lr_scheduler._LRScheduler):
+
+ def __init__(self, optimizer, total_iters, warmup_iters,
+ num_cycles=0.5, last_epoch=-1, verbose=False):
+ self.total_iters = total_iters
+ self.warmup_iters = warmup_iters
+ self.num_cycles = num_cycles
+ super().__init__(optimizer, last_epoch, verbose)
+
+ def lr_lambda(self, iteration):
+ if iteration < self.warmup_iters:
+ return float(iteration) / float(max(1, self.warmup_iters))
+ progress = float(iteration - self.warmup_iters) / float(max(1,
+ self.total_iters - self.warmup_iters))
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(
+ self.num_cycles) * 2.0 * progress)))
+
+ def _get_closed_form_lr(self):
+ current_iter = self._step_count
+ current_lrs = []
+ for base_lr in self.base_lrs:
+ current_lr = base_lr * self.lr_lambda(current_iter)
+ current_lrs.append(current_lr)
+ return current_lrs
+
+ def get_lr(self):
+ return self._get_closed_form_lr()
+
+
+if __name__ == "__main__":
+ model = torch.nn.Linear(10, 5)
+ optimizer = torch.optim.Adam(model.parameters(), 5e-4)
+ epochs = 25
+ iters = 600
+ scheduler = CosineWithWarmup(optimizer, 600 * 25, 600 * 5,)
+ # scheduler = ExponentialDecayScheduler(optimizer, 600 * 25, 5e-7, 600 * 5)
+ criterion = torch.nn.MSELoss()
+ lrs = []
+ for epoch in range(1, epochs + 1):
+ for iteration in range(1, iters + 1):
+ optimizer.zero_grad()
+ x = torch.randn(4, 10)
+ y = torch.randn(4, 5)
+ loss = criterion(model(x), y)
+ loss.backward()
+ optimizer.step()
+ scheduler.step()
+ # print(f"lr: {scheduler.get_last_lr()}")
+ # lrs.append(scheduler.get_last_lr())
+ lrs.append(optimizer.param_groups[0]["lr"])
+ import matplotlib.pyplot as plt
+ plt.plot(list(range(1, len(lrs) + 1)), lrs, '-o', markersize=1)
+ # plt.legend(loc="best")
+ plt.xlabel("Iteration")
+ plt.ylabel("LR")
+
+ plt.savefig("lr_curve.png", dpi=100)
diff --git a/audio_to_text/captioning/utils/model_eval_diff.py b/audio_to_text/captioning/utils/model_eval_diff.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c29ef8fde2451d3f84e842d0d6a72754f0d4603
--- /dev/null
+++ b/audio_to_text/captioning/utils/model_eval_diff.py
@@ -0,0 +1,110 @@
+import os
+import sys
+import copy
+import pickle
+
+import numpy as np
+import pandas as pd
+import fire
+
+sys.path.append(os.getcwd())
+
+
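+# Returns the difference between the score obtained with the full reference set
+# and the average score when each reference is left out in turn, i.e. how much
+# the metric gains from one additional reference per clip.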
+def coco_score(refs, pred, scorer):
+ if scorer.method() == "Bleu":
+ scores = np.array([ 0.0 for n in range(4) ])
+ else:
+ scores = 0
+ num_cap_per_audio = len(refs[list(refs.keys())[0]])
+
+ for i in range(num_cap_per_audio):
+ if i > 0:
+ for key in refs:
+ refs[key].insert(0, res[key][0])
+ res = {key: [refs[key].pop(),] for key in refs}
+ score, _ = scorer.compute_score(refs, pred)
+
+ if scorer.method() == "Bleu":
+ scores += np.array(score)
+ else:
+ scores += score
+
+ score = scores / num_cap_per_audio
+
+ for key in refs:
+ refs[key].insert(0, res[key][0])
+ score_allref, _ = scorer.compute_score(refs, pred)
+ diff = score_allref - score
+ return diff
+
+def embedding_score(refs, pred, scorer):
+
+ num_cap_per_audio = len(refs[list(refs.keys())[0]])
+ scores = 0
+
+ for i in range(num_cap_per_audio):
+ res = {key: [refs[key][i],] for key in refs.keys() if len(refs[key]) == num_cap_per_audio}
+ refs_i = {key: np.concatenate([refs[key][:i], refs[key][i+1:]]) for key in refs.keys() if len(refs[key]) == num_cap_per_audio}
+ score, _ = scorer.compute_score(refs_i, pred)
+
+ scores += score
+
+ score = scores / num_cap_per_audio
+
+ score_allref, _ = scorer.compute_score(refs, pred)
+ diff = score_allref - score
+ return diff
+
+def main(output_file, eval_caption_file, eval_embedding_file, output, zh=False):
+ output_df = pd.read_json(output_file)
+ output_df["key"] = output_df["filename"].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
+ pred = output_df.groupby("key")["tokens"].apply(list).to_dict()
+
+ label_df = pd.read_json(eval_caption_file)
+ if zh:
+ refs = label_df.groupby("key")["tokens"].apply(list).to_dict()
+ else:
+ refs = label_df.groupby("key")["caption"].apply(list).to_dict()
+
+ from pycocoevalcap.bleu.bleu import Bleu
+ from pycocoevalcap.cider.cider import Cider
+ from pycocoevalcap.rouge.rouge import Rouge
+
+ scorer = Bleu(zh=zh)
+ bleu_scores = coco_score(copy.deepcopy(refs), pred, scorer)
+ scorer = Cider(zh=zh)
+ cider_score = coco_score(copy.deepcopy(refs), pred, scorer)
+ scorer = Rouge(zh=zh)
+ rouge_score = coco_score(copy.deepcopy(refs), pred, scorer)
+
+ if not zh:
+ from pycocoevalcap.meteor.meteor import Meteor
+ scorer = Meteor()
+ meteor_score = coco_score(copy.deepcopy(refs), pred, scorer)
+
+ from pycocoevalcap.spice.spice import Spice
+ scorer = Spice()
+ spice_score = coco_score(copy.deepcopy(refs), pred, scorer)
+
+ # from audiocaptioneval.sentbert.sentencebert import SentenceBert
+ # scorer = SentenceBert(zh=zh)
+ # with open(eval_embedding_file, "rb") as f:
+ # ref_embeddings = pickle.load(f)
+
+ # sent_bert = embedding_score(ref_embeddings, pred, scorer)
+
+ with open(output, "w") as f:
+ f.write("Diff:\n")
+ for n in range(4):
+ f.write("BLEU-{}: {:6.3f}\n".format(n+1, bleu_scores[n]))
+ f.write("CIDEr: {:6.3f}\n".format(cider_score))
+ f.write("ROUGE: {:6.3f}\n".format(rouge_score))
+ if not zh:
+ f.write("Meteor: {:6.3f}\n".format(meteor_score))
+ f.write("SPICE: {:6.3f}\n".format(spice_score))
+ # f.write("SentenceBert: {:6.3f}\n".format(sent_bert))
+
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/audio_to_text/captioning/utils/predict_nn.py b/audio_to_text/captioning/utils/predict_nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..699c3dcffe7ce2c6dad33a5546c707dd76ccf82c
--- /dev/null
+++ b/audio_to_text/captioning/utils/predict_nn.py
@@ -0,0 +1,49 @@
+import json
+import random
+import argparse
+import numpy as np
+from tqdm import tqdm
+from h5py import File
+import sklearn.metrics
+
+random.seed(1)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("train_feature", type=str)
+parser.add_argument("train_corpus", type=str)
+parser.add_argument("pred_feature", type=str)
+parser.add_argument("output_json", type=str)
+
+args = parser.parse_args()
+train_embs = []
+train_idx_to_audioid = []
+with File(args.train_feature, "r") as store:
+ for audio_id, embedding in tqdm(store.items(), ascii=True):
+ train_embs.append(embedding[()])
+ train_idx_to_audioid.append(audio_id)
+
+train_annotation = json.load(open(args.train_corpus, "r"))["audios"]
+train_audioid_to_tokens = {}
+for item in train_annotation:
+ audio_id = item["audio_id"]
+ train_audioid_to_tokens[audio_id] = [cap_item["tokens"] for cap_item in item["captions"]]
+train_embs = np.stack(train_embs)
+
+
+pred_data = []
+pred_embs = []
+pred_idx_to_audioids = []
+with File(args.pred_feature, "r") as store:
+ for audio_id, embedding in tqdm(store.items(), ascii=True):
+ pred_embs.append(embedding[()])
+ pred_idx_to_audioids.append(audio_id)
+pred_embs = np.stack(pred_embs)
+
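+# Nearest-neighbour "captioning": for every test clip, pick the training clip
+# with the highest cosine similarity between audio embeddings and reuse one of
+# its human captions (chosen at random) as the prediction.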
+similarity = sklearn.metrics.pairwise.cosine_similarity(pred_embs, train_embs)
+for idx, audio_id in enumerate(pred_idx_to_audioids):
+ train_idx = similarity[idx].argmax()
+ pred_data.append({
+ "filename": audio_id,
+ "tokens": random.choice(train_audioid_to_tokens[train_idx_to_audioid[train_idx]])
+ })
+json.dump({"predictions": pred_data}, open(args.output_json, "w"), ensure_ascii=False, indent=4)
diff --git a/audio_to_text/captioning/utils/remove_optimizer.py b/audio_to_text/captioning/utils/remove_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b9871ee8022c0e0814abb46173fee1a6ae4ba9c
--- /dev/null
+++ b/audio_to_text/captioning/utils/remove_optimizer.py
@@ -0,0 +1,18 @@
+import argparse
+import torch
+
+
+def main(checkpoint):
+ state_dict = torch.load(checkpoint, map_location="cpu")
+ if "optimizer" in state_dict:
+ del state_dict["optimizer"]
+ if "lr_scheduler" in state_dict:
+ del state_dict["lr_scheduler"]
+ torch.save(state_dict, checkpoint)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint", type=str)
+ args = parser.parse_args()
+ main(args.checkpoint)
diff --git a/audio_to_text/captioning/utils/report_results.py b/audio_to_text/captioning/utils/report_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b9f6ec5e8d2f253706198e0d521f73981ef3efe
--- /dev/null
+++ b/audio_to_text/captioning/utils/report_results.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+import argparse
+import numpy as np
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="input filename", type=str, nargs="+")
+parser.add_argument("--output", help="output result file", default=None)
+
+args = parser.parse_args()
+
+
+scores = {}
+for path in args.input:
+ with open(path, "r") as reader:
+ for line in reader.readlines():
+ metric, score = line.strip().split(": ")
+ score = float(score)
+ if metric not in scores:
+ scores[metric] = []
+ scores[metric].append(score)
+
+if len(scores) == 0:
+ print("No experiment directory found, wrong path?")
+ exit(1)
+
+with open(args.output, "w") as writer:
+ print("Average results: ", file=writer)
+ for metric, score in scores.items():
+ score = np.array(score)
+ mean = np.mean(score)
+ std = np.std(score)
+ print(f"{metric}: {mean:.3f} (±{std:.3f})", file=writer)
+ print("", file=writer)
+ print("Best results: ", file=writer)
+ for metric, score in scores.items():
+ score = np.max(score)
+ print(f"{metric}: {score:.3f}", file=writer)
diff --git a/audio_to_text/captioning/utils/tokenize_caption.py b/audio_to_text/captioning/utils/tokenize_caption.py
new file mode 100644
index 0000000000000000000000000000000000000000..b340068577a1d4b02e187048e6a20cb95264561f
--- /dev/null
+++ b/audio_to_text/captioning/utils/tokenize_caption.py
@@ -0,0 +1,86 @@
+import json
+from tqdm import tqdm
+import re
+import fire
+
+
+def tokenize_caption(input_json: str,
+ keep_punctuation: bool = False,
+ host_address: str = None,
+ character_level: bool = False,
+ zh: bool = True,
+ output_json: str = None):
+    """Tokenize the captions in the given json file and write the tokens back to json.
+
+ Args:
+        input_json (string): Preprocessed json file. Structure like this:
+ {
+ 'audios': [
+ {
+ 'audio_id': 'xxx',
+ 'captions': [
+ {
+ 'caption': 'xxx',
+ 'cap_id': 'xxx'
+ }
+ ]
+ },
+ ...
+ ]
+ }
+        keep_punctuation (bool): Include or exclude punctuation (used for Chinese tokenization).
+        output_json (string): Where to write the tokenized data; defaults to overwriting input_json.
+
+    Returns:
+        None. The tokenized captions are written to the output json file.
+"""
+ data = json.load(open(input_json, "r"))["audios"]
+
+ if zh:
+ from nltk.parse.corenlp import CoreNLPParser
+ from zhon.hanzi import punctuation
+ parser = CoreNLPParser(host_address)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ # Remove all punctuations
+ if not keep_punctuation:
+ caption = re.sub("[{}]".format(punctuation), "", caption)
+ if character_level:
+ tokens = list(caption)
+ else:
+ tokens = list(parser.tokenize(caption))
+ data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
+ else:
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+ captions = {}
+ for audio_idx in range(len(data)):
+ audio_id = data[audio_idx]["audio_id"]
+ captions[audio_id] = []
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ caption = data[audio_idx]["captions"][cap_idx]["caption"]
+ captions[audio_id].append({
+ "audio_id": audio_id,
+ "id": cap_idx,
+ "caption": caption
+ })
+ tokenizer = PTBTokenizer()
+ captions = tokenizer.tokenize(captions)
+ for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
+ audio_id = data[audio_idx]["audio_id"]
+ for cap_idx in range(len(data[audio_idx]["captions"])):
+ tokens = captions[audio_id][cap_idx]
+ data[audio_idx]["captions"][cap_idx]["tokens"] = tokens
+
+ if output_json:
+ json.dump(
+ { "audios": data }, open(output_json, "w"),
+ indent=4, ensure_ascii=not zh)
+ else:
+ json.dump(
+ { "audios": data }, open(input_json, "w"),
+ indent=4, ensure_ascii=not zh)
+
+
+if __name__ == "__main__":
+ fire.Fire(tokenize_caption)
diff --git a/audio_to_text/captioning/utils/train_util.py b/audio_to_text/captioning/utils/train_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cd62cc36043a2db75cc6761c51fdfdd18d11392
--- /dev/null
+++ b/audio_to_text/captioning/utils/train_util.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
+import os
+import sys
+import logging
+from typing import Callable, Dict, Union
+import yaml
+import torch
+from torch.optim.swa_utils import AveragedModel as torch_average_model
+import numpy as np
+import pandas as pd
+from pprint import pformat
+
+
+def load_dict_from_csv(csv, cols):
+ df = pd.read_csv(csv, sep="\t")
+ output = dict(zip(df[cols[0]], df[cols[1]]))
+ return output
+
+
+def init_logger(filename, level="INFO"):
+ formatter = logging.Formatter(
+ "[ %(levelname)s : %(asctime)s ] - %(message)s")
+ logger = logging.getLogger(__name__ + "." + filename)
+ logger.setLevel(getattr(logging, level))
+ # Log results to std
+ # stdhandler = logging.StreamHandler(sys.stdout)
+ # stdhandler.setFormatter(formatter)
+ # Dump log to file
+ filehandler = logging.FileHandler(filename)
+ filehandler.setFormatter(formatter)
+ logger.addHandler(filehandler)
+ # logger.addHandler(stdhandler)
+ return logger
+
+
+def init_obj(module, config, **kwargs):  # e.g. module = captioning.models.encoder
+ obj_args = config["args"].copy()
+ obj_args.update(kwargs)
+ return getattr(module, config["type"])(**obj_args)
+
+
+def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'):
+ """pprint_dict
+
+    :param in_dict: dict to print
+    :param outputfun: output function to use, defaults to sys.stdout.write
+    :param formatter: 'yaml' (default) or 'pretty'
+ """
+ if formatter == 'yaml':
+ format_fun = yaml.dump
+ elif formatter == 'pretty':
+ format_fun = pformat
+ for line in format_fun(in_dict).split('\n'):
+ outputfun(line)
+
+
+def merge_a_into_b(a, b):
+ # merge dict a into dict b. values in a will overwrite b.
+ for k, v in a.items():
+ if isinstance(v, dict) and k in b:
+ assert isinstance(
+ b[k], dict
+ ), "Cannot inherit key '{}' from base!".format(k)
+ merge_a_into_b(v, b[k])
+ else:
+ b[k] = v
+
+
+def load_config(config_file):
+ with open(config_file, "r") as reader:
+ config = yaml.load(reader, Loader=yaml.FullLoader)
+ if "inherit_from" in config:
+ base_config_file = config["inherit_from"]
+ base_config_file = os.path.join(
+ os.path.dirname(config_file), base_config_file
+ )
+ assert not os.path.samefile(config_file, base_config_file), \
+ "inherit from itself"
+ base_config = load_config(base_config_file)
+ del config["inherit_from"]
+ merge_a_into_b(config, base_config)
+ return base_config
+ return config
+
+
+def parse_config_or_kwargs(config_file, **kwargs):
+ yaml_config = load_config(config_file)
+ # passed kwargs will override yaml config
+ args = dict(yaml_config, **kwargs)
+ return args
+
+
+def store_yaml(config, config_file):
+ with open(config_file, "w") as con_writer:
+ yaml.dump(config, con_writer, indent=4, default_flow_style=False)
+
+
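+# Tracks the best value of a monitored metric; __call__ returns True (and stores
+# the new best) when the value improves, under "min" or "max" mode.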
+class MetricImprover:
+
+ def __init__(self, mode):
+ assert mode in ("min", "max")
+ self.mode = mode
+ # min: lower -> better; max: higher -> better
+ self.best_value = np.inf if mode == "min" else -np.inf
+
+ def compare(self, x, best_x):
+ return x < best_x if self.mode == "min" else x > best_x
+
+ def __call__(self, x):
+ if self.compare(x, self.best_value):
+ self.best_value = x
+ return True
+ return False
+
+ def state_dict(self):
+ return self.__dict__
+
+ def load_state_dict(self, state_dict):
+ self.__dict__.update(state_dict)
+
+
+def fix_batchnorm(model: torch.nn.Module):
+ def inner(module):
+ class_name = module.__class__.__name__
+ if class_name.find("BatchNorm") != -1:
+ module.eval()
+ model.apply(inner)
+
+
+def load_pretrained_model(model: torch.nn.Module,
+ pretrained: Union[str, Dict],
+ output_fn: Callable = sys.stdout.write):
+ if not isinstance(pretrained, dict) and not os.path.exists(pretrained):
+        output_fn(f"pretrained checkpoint {pretrained} does not exist!")
+ return
+
+ if hasattr(model, "load_pretrained"):
+ model.load_pretrained(pretrained)
+ return
+
+ if isinstance(pretrained, dict):
+ state_dict = pretrained
+ else:
+ state_dict = torch.load(pretrained, map_location="cpu")
+
+ if "model" in state_dict:
+ state_dict = state_dict["model"]
+ model_dict = model.state_dict()
+ pretrained_dict = {
+ k: v for k, v in state_dict.items() if (k in model_dict) and (
+ model_dict[k].shape == v.shape)
+ }
+ output_fn(f"Loading pretrained keys {pretrained_dict.keys()}")
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict, strict=True)
+
+
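+# SWA wrapper that, in addition to the parameters, also averages the module
+# buffers (e.g. BatchNorm running statistics); the first buffer, the internal
+# n_averaged counter, is skipped.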
+class AveragedModel(torch_average_model):
+
+ def update_parameters(self, model):
+ for p_swa, p_model in zip(self.parameters(), model.parameters()):
+ device = p_swa.device
+ p_model_ = p_model.detach().to(device)
+ if self.n_averaged == 0:
+ p_swa.detach().copy_(p_model_)
+ else:
+ p_swa.detach().copy_(self.avg_fn(p_swa.detach(), p_model_,
+ self.n_averaged.to(device)))
+
+ for b_swa, b_model in zip(list(self.buffers())[1:], model.buffers()):
+ device = b_swa.device
+ b_model_ = b_model.detach().to(device)
+ if self.n_averaged == 0:
+ b_swa.detach().copy_(b_model_)
+ else:
+ b_swa.detach().copy_(self.avg_fn(b_swa.detach(), b_model_,
+ self.n_averaged.to(device)))
+ self.n_averaged += 1
diff --git a/audio_to_text/captioning/utils/word2vec/create_word_embedding.py b/audio_to_text/captioning/utils/word2vec/create_word_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..77ebe5adc6ec14bd639e78125f00c1eaea0b4dcc
--- /dev/null
+++ b/audio_to_text/captioning/utils/word2vec/create_word_embedding.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+#!/usr/bin/env python3
+
+import numpy as np
+import pandas as pd
+import torch
+import gensim
+from gensim.models import Word2Vec
+from tqdm import tqdm
+import fire
+
+import sys
+import os
+sys.path.append(os.getcwd())
+from utils.build_vocab import Vocabulary
+
+def create_embedding(vocab_file: str,
+ embed_size: int,
+ output: str,
+ caption_file: str = None,
+ pretrained_weights_path: str = None,
+ **word2vec_kwargs):
+ vocabulary = torch.load(vocab_file, map_location="cpu")
+
+ if pretrained_weights_path:
+ model = gensim.models.KeyedVectors.load_word2vec_format(
+ fname=pretrained_weights_path,
+ binary=True,
+ )
+ if model.vector_size != embed_size:
+            assert embed_size < model.vector_size, f"can only reduce the dimension, cannot increase it from {model.vector_size} to {embed_size}"
+ from sklearn.decomposition import PCA
+ pca = PCA(n_components=embed_size)
+ model.vectors = pca.fit_transform(model.vectors)
+ else:
+ caption_df = pd.read_json(caption_file)
+        caption_df["tokens"] = caption_df["tokens"].apply(lambda x: ["<start>"] + [token for token in x] + ["<end>"])
+ sentences = list(caption_df["tokens"].values)
+ epochs = word2vec_kwargs.get("epochs", 10)
+ if "epochs" in word2vec_kwargs:
+ del word2vec_kwargs["epochs"]
+ model = Word2Vec(size=embed_size, min_count=1, **word2vec_kwargs)
+ model.build_vocab(sentences=sentences)
+ model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
+
+ word_embeddings = np.random.randn(len(vocabulary), embed_size)
+
+ if isinstance(model, gensim.models.word2vec.Word2Vec):
+ model = model.wv
+ with tqdm(total=len(vocabulary), ascii=True) as pbar:
+ for word, idx in vocabulary.word2idx.items():
+ try:
+ word_embeddings[idx] = model.get_vector(word)
+ except KeyError:
+                print(f"word {word} not found in the word2vec model, it stays randomly initialized!")
+ pbar.update()
+
+ np.save(output, word_embeddings)
+
+    print("Finished writing word2vec embeddings to " + output)
+
+
+if __name__ == "__main__":
+ fire.Fire(create_embedding)
+
+
+
diff --git a/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..034751fcff1e1d3b686ae0ad1cd6346f92dacc13
--- /dev/null
+++ b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml
@@ -0,0 +1,22 @@
+model:
+ encoder:
+ type: Cnn14RnnEncoder
+ args:
+ sample_rate: 32000
+ pretrained: ./audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
+ freeze_cnn: True
+ freeze_cnn_bn: True
+ bidirectional: True
+ dropout: 0.5
+ hidden_size: 256
+ num_layers: 3
+ decoder:
+ type: TransformerDecoder
+ args:
+ attn_emb_dim: 512
+ dropout: 0.2
+ emb_dim: 256
+ fc_emb_dim: 512
+ nlayers: 2
+ type: TransformerModel
+ args: {}
diff --git a/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0f9a16de2efa334d403326acec7de5de4c3393d6
--- /dev/null
+++ b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8d341dccafcdcfb7009c402afb07f314ab1d613a5f5c42d32407d6c2a821abf
+size 41755865
diff --git a/audio_to_text/inference_waveform.py b/audio_to_text/inference_waveform.py
new file mode 100644
index 0000000000000000000000000000000000000000..aba39614c8104f62cdb8a3c7e0e3cf5dced0d95a
--- /dev/null
+++ b/audio_to_text/inference_waveform.py
@@ -0,0 +1,102 @@
+import sys
+import os
+import librosa
+import numpy as np
+import torch
+import audio_to_text.captioning.models
+import audio_to_text.captioning.models.encoder
+import audio_to_text.captioning.models.decoder
+import audio_to_text.captioning.utils.train_util as train_util
+
+
+def load_model(config, checkpoint):
+ ckpt = torch.load(checkpoint, "cpu")
+ encoder_cfg = config["model"]["encoder"]
+ encoder = train_util.init_obj(
+ audio_to_text.captioning.models.encoder,
+ encoder_cfg
+ )
+ if "pretrained" in encoder_cfg:
+ pretrained = encoder_cfg["pretrained"]
+ train_util.load_pretrained_model(encoder,
+ pretrained,
+ sys.stdout.write)
+ decoder_cfg = config["model"]["decoder"]
+ if "vocab_size" not in decoder_cfg["args"]:
+ decoder_cfg["args"]["vocab_size"] = len(ckpt["vocabulary"])
+ decoder = train_util.init_obj(
+ audio_to_text.captioning.models.decoder,
+ decoder_cfg
+ )
+ if "word_embedding" in decoder_cfg:
+ decoder.load_word_embedding(**decoder_cfg["word_embedding"])
+ if "pretrained" in decoder_cfg:
+ pretrained = decoder_cfg["pretrained"]
+ train_util.load_pretrained_model(decoder,
+ pretrained,
+ sys.stdout.write)
+ model = train_util.init_obj(audio_to_text.captioning.models, config["model"],
+ encoder=encoder, decoder=decoder)
+ train_util.load_pretrained_model(model, ckpt)
+ model.eval()
+ return {
+ "model": model,
+ "vocabulary": ckpt["vocabulary"]
+ }
+
+
+def decode_caption(word_ids, vocabulary):
+ candidate = []
+ for word_id in word_ids:
+ word = vocabulary[word_id]
+        if word == "<end>":
+            break
+        elif word == "<start>":
+            continue
+ candidate.append(word)
+ candidate = " ".join(candidate)
+ return candidate
+
+
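+# Thin inference wrapper: loads config.yaml and the swa.pth checkpoint from
+# `weight_dir`, then turns raw waveforms (numpy arrays or a path to an audio
+# file, loaded at 32 kHz) into text captions via beam search.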
+class AudioCapModel(object):
+    def __init__(self, weight_dir, device='cuda'):
+        config = os.path.join(weight_dir, 'config.yaml')
+        self.config = train_util.parse_config_or_kwargs(config)
+        checkpoint = os.path.join(weight_dir, 'swa.pth')
+ resumed = load_model(self.config, checkpoint)
+ model = resumed["model"]
+ self.vocabulary = resumed["vocabulary"]
+ self.model = model.to(device)
+ self.device = device
+
+    def caption(self, audio_list):
+        if isinstance(audio_list, np.ndarray):
+            audio_list = [audio_list]
+        elif isinstance(audio_list, str):
+            audio_list = [librosa.load(audio_list, sr=32000)[0]]
+
+ captions = []
+ for wav in audio_list:
+ inputwav = torch.as_tensor(wav).float().unsqueeze(0).to(self.device)
+ wav_len = torch.LongTensor([len(wav)])
+ input_dict = {
+ "mode": "inference",
+ "wav": inputwav,
+ "wav_len": wav_len,
+ "specaug": False,
+ "sample_method": "beam",
+ }
+ print(input_dict)
+ out_dict = self.model(input_dict)
+ caption_batch = [decode_caption(seq, self.vocabulary) for seq in \
+ out_dict["seq"].cpu().numpy()]
+ captions.extend(caption_batch)
+ return captions
+
+
+
+ def __call__(self, audio_list):
+ return self.caption(audio_list)
+
+
+
diff --git a/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth b/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8f570dc2d96679fbdecaba7d8f266368fc7fb0c9
--- /dev/null
+++ b/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c4faa86f30e77df235b5dc1fb6578a18ff2b8a1b0043f47e30acb9ccb53a336
+size 494977221
diff --git a/checkpoints/0102_xiaoma_pe/config.yaml b/checkpoints/0102_xiaoma_pe/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..69a88444205377d48573d53bb4fb500860976588
--- /dev/null
+++ b/checkpoints/0102_xiaoma_pe/config.yaml
@@ -0,0 +1,172 @@
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- configs/tts/lj/fs2.yaml
+binarization_args:
+ shuffle: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: true
+ with_spk_embed: true
+ with_txt: true
+ with_wav: false
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+binary_data_dir: data/binary/xiaoma1022_24k_128hop
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decoder_type: fft
+dict_dir: ''
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+gen_dir_name: ''
+hidden_size: 256
+hop_size: 128
+infer: false
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 5000
+max_input_tokens: 1550
+max_sentences: 100000
+max_tokens: 20000
+max_updates: 60000
+mel_loss: l1
+mel_vmax: 1.5
+mel_vmin: -6
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 5
+num_spk: 1
+num_test_samples: 20
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor_conv_layers: 2
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ forced_align: mfa
+ txt_processor: en
+ use_sox: false
+ use_tone: true
+pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: data/processed/ljspeech
+profile_infer: false
+raw_data_dir: data/raw/LJSpeech-1.1
+ref_norm_layer: bn
+reset_phone_dict: true
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: false
+save_gt: false
+seed: 1234
+sort_by_len: true
+stop_token_weight: 5.0
+task_cls: tasks.tts.pe.PitchExtractionTask
+test_ids:
+- 68
+- 70
+- 74
+- 87
+- 110
+- 172
+- 190
+- 215
+- 231
+- 294
+- 316
+- 324
+- 402
+- 422
+- 485
+- 500
+- 505
+- 508
+- 509
+- 519
+test_input_dir: ''
+test_num: 523
+test_set_name: test
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_pitch_embed: true
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 348
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+warmup_updates: 2000
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0102_xiaoma_pe
diff --git a/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt b/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..468cc81b1a95e2f3dd490a8770bd705e14855f77
--- /dev/null
+++ b/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb
+size 13047222
diff --git a/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml b/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..95fc5414ba1aff1bad8284ebfba52f5636b4d76d
--- /dev/null
+++ b/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml
@@ -0,0 +1,241 @@
+accumulate_grad_batches: 1
+adam_b1: 0.8
+adam_b2: 0.99
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+aux_context_window: 0
+#base_config:
+#- egs/egs_bases/singing/pwg.yaml
+#- egs/egs_bases/tts/vocoder/hifigan.yaml
+binarization_args:
+ reset_phone_dict: true
+ reset_word_dict: true
+ shuffle: false
+ trim_eos_bos: false
+ trim_sil: false
+ with_align: false
+ with_f0: true
+ with_f0cwt: false
+ with_linear: false
+ with_spk_embed: false
+ with_spk_id: true
+ with_txt: false
+ with_wav: true
+ with_word: false
+binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer
+binary_data_dir: data/binary/big_popcs_24k_hop128
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+datasets: []
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+ eps: 1.0e-06
+ lr: 0.0002
+ weight_decay: 0.0
+discriminator_params:
+ bias: true
+ conv_channels: 64
+ in_channels: 1
+ kernel_size: 3
+ layers: 10
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.2
+ out_channels: 1
+ use_weight_norm: true
+discriminator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+dropout: 0.1
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+generator_optimizer_params:
+ eps: 1.0e-06
+ lr: 0.0002
+ weight_decay: 0.0
+generator_params:
+ aux_channels: 80
+ dropout: 0.0
+ gate_channels: 128
+ in_channels: 1
+ kernel_size: 3
+ layers: 30
+ out_channels: 1
+ residual_channels: 64
+ skip_channels: 64
+ stacks: 3
+ upsample_net: ConvInUpsampleNetwork
+ upsample_params:
+ upsample_scales:
+ - 2
+ - 4
+ - 4
+ - 4
+ use_nsf: false
+ use_pitch_embed: true
+ use_weight_norm: true
+generator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 128
+infer: false
+lambda_adv: 1.0
+lambda_cdisc: 4.0
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+lambda_ph_dur: 0.0
+lambda_sent_dur: 0.0
+lambda_uv: 0.0
+lambda_word_dur: 0.0
+load_ckpt: ''
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_frames: 2400
+max_input_tokens: 1550
+max_samples: 8192
+max_sentences: 20
+max_tokens: 24000
+max_updates: 3000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 0
+min_level_db: -120
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: 5
+num_spk: 100
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ sox_resample: true
+ sox_to_wav: false
+ trim_sil: false
+ txt_processor: zh
+ use_tone: false
+pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign
+predictor_grad: 0.0
+print_nan_grads: false
+processed_data_dir: ''
+profile_infer: false
+raw_data_dir: ''
+ref_level_db: 20
+rename_tmux: true
+rerun_gen: true
+resblock: '1'
+resblock_dilation_sizes:
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: true
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+stft_loss_params:
+ fft_sizes:
+ - 1024
+ - 2048
+ - 512
+ hop_sizes:
+ - 120
+ - 240
+ - 50
+ win_lengths:
+ - 600
+ - 1200
+ - 240
+ window: hann_window
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 50
+test_prefixes: []
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_initial_channel: 512
+upsample_kernel_sizes:
+- 16
+- 16
+- 4
+- 4
+upsample_rates:
+- 8
+- 4
+- 2
+- 2
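+# note: the upsample_rates above multiply to 128 (8 * 4 * 2 * 2), matching hop_size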
+use_cdisc: false
+use_cond_disc: false
+use_fm_loss: false
+use_gt_dur: true
+use_gt_f0: true
+use_mel_loss: true
+use_ms_stft: false
+use_pitch_embed: true
+use_ref_enc: true
+use_spec_disc: false
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 512
+window: hann
+word_size: 3000
+work_dir: checkpoints/0109_hifigan_bigpopcs_hop128
diff --git a/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt b/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..ed55eaa98f86e3e22f4eb4e8115f254745cea155
--- /dev/null
+++ b/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1
+size 55827436
diff --git a/checkpoints/0228_opencpop_ds100_rel/config.yaml b/checkpoints/0228_opencpop_ds100_rel/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..977627b65c12e00e5dd2cc42e423f9ee4899956a
--- /dev/null
+++ b/checkpoints/0228_opencpop_ds100_rel/config.yaml
@@ -0,0 +1,342 @@
+K_step: 100
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- usr/configs/popcs_ds_beta6.yaml
+- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+binarization_args:
+ shuffle: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: true
+ with_spk_embed: false
+ with_txt: true
+ with_wav: true
+binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
+binary_data_dir: data/binary/opencpop-midi-dp
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+content_cond_steps: []
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+datasets:
+- popcs
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decay_steps: 50000
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 4
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 5
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+fs2_ckpt: ''
+gaussian_start: true
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+hidden_size: 256
+hop_size: 128
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 0.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 0.001
+max_beta: 0.06
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 8000
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 40000
+max_updates: 160000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6.0
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pe_ckpt: checkpoints/0102_xiaoma_pe
+pe_enable: true
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ forced_align: mfa
+ txt_processor: zh_g2pM
+ use_sox: true
+ use_tone: false
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 5
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: data/processed/popcs
+profile_infer: false
+raw_data_dir: data/raw/popcs
+ref_norm_layer: bn
+rel_pos: true
+reset_phone_dict: true
+residual_channels: 256
+residual_layers: 20
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: true
+save_gt: false
+schedule_type: linear
+seed: 1234
+sort_by_len: true
+spec_max:
+- -0.79453
+- -0.81116
+- -0.61631
+- -0.30679
+- -0.13863
+- -0.050652
+- -0.11563
+- -0.10679
+- -0.091068
+- -0.062174
+- -0.075302
+- -0.072217
+- -0.063815
+- -0.073299
+- 0.007361
+- -0.072508
+- -0.050234
+- -0.16534
+- -0.26928
+- -0.20782
+- -0.20823
+- -0.11702
+- -0.070128
+- -0.065868
+- -0.012675
+- 0.0015121
+- -0.089902
+- -0.21392
+- -0.23789
+- -0.28922
+- -0.30405
+- -0.23029
+- -0.22088
+- -0.21542
+- -0.29367
+- -0.30137
+- -0.38281
+- -0.4359
+- -0.28681
+- -0.46855
+- -0.57485
+- -0.47022
+- -0.54266
+- -0.44848
+- -0.6412
+- -0.687
+- -0.6486
+- -0.76436
+- -0.49971
+- -0.71068
+- -0.69724
+- -0.61487
+- -0.55843
+- -0.69773
+- -0.57502
+- -0.70919
+- -0.82431
+- -0.84213
+- -0.90431
+- -0.8284
+- -0.77945
+- -0.82758
+- -0.87699
+- -1.0532
+- -1.0766
+- -1.1198
+- -1.0185
+- -0.98983
+- -1.0001
+- -1.0756
+- -1.0024
+- -1.0304
+- -1.0579
+- -1.0188
+- -1.05
+- -1.0842
+- -1.0923
+- -1.1223
+- -1.2381
+- -1.6467
+spec_min:
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+spk_cond_steps: []
+stop_token_weight: 5.0
+task_cls: usr.diffsinger_task.DiffSingerMIDITask
+test_ids: []
+test_input_dir: ''
+test_num: 0
+test_prefixes:
+- "popcs-\u8BF4\u6563\u5C31\u6563"
+- "popcs-\u9690\u5F62\u7684\u7FC5\u8180"
+test_set_name: test
+timesteps: 100
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_midi: true
+use_nsf: true
+use_pitch_embed: false
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 0
+valid_set_name: valid
+vocoder: vocoders.hifigan.HifiGAN
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+warmup_updates: 2000
+wav2spec_eps: 1e-6
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0228_opencpop_ds100_rel
diff --git a/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt b/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..07b944d43e3bd61ebd8272c09db0011425b4af08
--- /dev/null
+++ b/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c
+size 170226367
diff --git a/checkpoints/0831_opencpop_ds1000/config.yaml b/checkpoints/0831_opencpop_ds1000/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cc2be3b17c1cab8a96f033a6370e6dbfbca1b66d
--- /dev/null
+++ b/checkpoints/0831_opencpop_ds1000/config.yaml
@@ -0,0 +1,346 @@
+K_step: 1000
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- usr/configs/popcs_ds_beta6.yaml
+- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+binarization_args:
+ shuffle: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: true
+ with_spk_embed: false
+ with_txt: true
+ with_wav: true
+binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
+binary_data_dir: data/binary/opencpop-midi-dp
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+content_cond_steps: []
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+datasets:
+- opencpop
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decay_steps: 50000
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 4
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 5
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+fs2_ckpt: ''
+gaussian_start: true
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+hidden_size: 256
+hop_size: 128
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 0.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 0.001
+max_beta: 0.02
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 8000
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 36000
+max_updates: 320000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6.0
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pe_ckpt: checkpoints/0102_xiaoma_pe
+pe_enable: true
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ forced_align: mfa
+ txt_processor: zh_g2pM
+ use_sox: true
+ use_tone: false
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 5
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: xxx
+profile_infer: false
+raw_data_dir: data/raw/opencpop/segments
+ref_norm_layer: bn
+rel_pos: true
+reset_phone_dict: true
+residual_channels: 256
+residual_layers: 20
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: true
+save_gt: false
+schedule_type: linear
+seed: 1234
+sort_by_len: true
+spec_max:
+- -0.79453
+- -0.81116
+- -0.61631
+- -0.30679
+- -0.13863
+- -0.050652
+- -0.11563
+- -0.10679
+- -0.091068
+- -0.062174
+- -0.075302
+- -0.072217
+- -0.063815
+- -0.073299
+- 0.007361
+- -0.072508
+- -0.050234
+- -0.16534
+- -0.26928
+- -0.20782
+- -0.20823
+- -0.11702
+- -0.070128
+- -0.065868
+- -0.012675
+- 0.0015121
+- -0.089902
+- -0.21392
+- -0.23789
+- -0.28922
+- -0.30405
+- -0.23029
+- -0.22088
+- -0.21542
+- -0.29367
+- -0.30137
+- -0.38281
+- -0.4359
+- -0.28681
+- -0.46855
+- -0.57485
+- -0.47022
+- -0.54266
+- -0.44848
+- -0.6412
+- -0.687
+- -0.6486
+- -0.76436
+- -0.49971
+- -0.71068
+- -0.69724
+- -0.61487
+- -0.55843
+- -0.69773
+- -0.57502
+- -0.70919
+- -0.82431
+- -0.84213
+- -0.90431
+- -0.8284
+- -0.77945
+- -0.82758
+- -0.87699
+- -1.0532
+- -1.0766
+- -1.1198
+- -1.0185
+- -0.98983
+- -1.0001
+- -1.0756
+- -1.0024
+- -1.0304
+- -1.0579
+- -1.0188
+- -1.05
+- -1.0842
+- -1.0923
+- -1.1223
+- -1.2381
+- -1.6467
+spec_min:
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+spk_cond_steps: []
+stop_token_weight: 5.0
+task_cls: usr.diffsinger_task.DiffSingerMIDITask
+test_ids: []
+test_input_dir: ''
+test_num: 0
+test_prefixes:
+- '2044'
+- '2086'
+- '2092'
+- '2093'
+- '2100'
+test_set_name: test
+timesteps: 1000
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_midi: true
+use_nsf: true
+use_pitch_embed: false
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 0
+valid_set_name: valid
+vocoder: vocoders.hifigan.HifiGAN
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+warmup_updates: 2000
+wav2spec_eps: 1e-6
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0831_opencpop_ds1000
+pndm_speedup: 10
diff --git a/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt b/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..f36846cd61ffca537611feea3166011f480a443a
--- /dev/null
+++ b/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e
+size 170269591
diff --git a/checkpoints/Emotion_encoder.pt b/checkpoints/Emotion_encoder.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ac214aba4b7a248c6742782392529b8442855805
--- /dev/null
+++ b/checkpoints/Emotion_encoder.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9de4930cbd8e5ba51efdef84c326e3728a5482dd7668f82960e4cb0f97cc8e5
+size 17095350
diff --git a/checkpoints/GenerSpeech/config.yaml b/checkpoints/GenerSpeech/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ed493feb76965929cd333ffafcb95f2d47cfc0e6
--- /dev/null
+++ b/checkpoints/GenerSpeech/config.yaml
@@ -0,0 +1,249 @@
+accumulate_grad_batches: 1
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 16000
+base_config:
+- egs/egs_bases/tts/fs2_adv.yaml
+- egs/datasets/audio/emotion/base_text2mel.yaml
+binarization_args:
+ reset_phone_dict: true
+ reset_word_dict: true
+ shuffle: true
+ trim_eos_bos: false
+ trim_sil: false
+ with_align: true
+ with_f0: true
+ with_f0cwt: false
+ with_linear: false
+ with_spk_embed: true
+ with_spk_id: true
+ with_txt: true
+ with_wav: true
+ with_word: true
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+binary_data_dir: data/binary/training_set
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+conv_use_pos: false
+crop: false
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_dilations:
+- 1
+- 1
+- 1
+- 1
+dec_ffn_kernel_size: 9
+dec_inp_add_noise: false
+dec_kernel_size: 5
+dec_layers: 4
+dec_num_heads: 2
+decoder_rnn_dim: 0
+decoder_type: fft
+dict_dir: ''
+disc_hidden_size: 128
+disc_interval: 1
+disc_lr: 0.0001
+disc_norm: in
+disc_reduction: stack
+disc_start_steps: 0
+disc_win_num: 3
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+ eps: 1.0e-06
+ weight_decay: 0.0
+discriminator_scheduler_params:
+ gamma: 0.5
+ step_size: 60000
+dropout: 0.05
+ds_workers: 2
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path
+enc_dec_norm: ln
+enc_dilations:
+- 1
+- 1
+- 1
+- 1
+enc_ffn_kernel_size: 9
+enc_kernel_size: 5
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_hidden_size: 1024
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+forcing: 20000
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 5.0
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_mel_adv: 0.1
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+layers_in_block: 2
+load_ckpt: ''
+loud_norm: false
+lr: 1.0
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_sentences: 100000
+max_tokens: 30000
+max_updates: 300000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_disc_hidden_size: 128
+mel_gan: true
+mel_hidden_size: 256
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 128
+min_level_db: -100
+nVQ: 128
+noise_scale: 0.8
+num_ckpt_keep: 2
+num_heads: 2
+num_sanity_val_steps: -1
+num_spk: 500
+num_test_samples: 72
+num_valid_plots: 10
+optimizer_adam_beta1: 0.5
+optimizer_adam_beta2: 0.999
+out_wav_norm: false
+pitch_ar: false
+pitch_embed_type: 0
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: standard
+pitch_ssim_win: 11
+pitch_type: frame
+post_glow_hidden: 128
+post_glow_kernel_size: 3
+post_glow_n_block_layers: 3
+post_glow_n_blocks: 8
+post_share_cond_layers: false
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ sox_resample: false
+ sox_to_wav: false
+ trim_sil: false
+ txt_processor: en
+ use_tone: true
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+predictor_dropout: 0.5
+predictor_grad: 1.0
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+preprocess_args:
+ add_eos_bos: true
+ mfa_group_shuffle: false
+ mfa_offset: 0.02
+ nsample_per_mfa_group: 1000
+ reset_phone_dict: true
+ reset_word_dict: true
+ save_sil_mask: true
+ txt_processor: en
+ use_mfa: true
+ vad_max_silence_length: 12
+ wav_processors: []
+ with_phsep: true
+preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
+pretrain_fs_ckpt: ''
+print_nan_grads: false
+processed_data_dir: data/processed/emotion
+profile_infer: false
+raw_data_dir: data/raw/ESD
+ref_audio: ''
+ref_hidden_stride_kernel:
+- 0,3,5
+- 0,3,5
+- 0,2,5
+- 0,2,5
+- 0,2,5
+ref_level_db: 20
+ref_norm_layer: bn
+rename_tmux: true
+rerun_gen: false
+resume_from_checkpoint: 0
+save_best: false
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+share_wn_layers: 4
+sigmoid_scale: false
+sil_add_noise: false
+sort_by_len: true
+task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 200
+test_set_name: test
+text: ''
+train_set_name: train
+train_sets: ''
+use_cond_disc: false
+use_emotion: true
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_latent_cond: true
+use_pitch_embed: true
+use_pos_embed: true
+use_ref_enc: false
+use_spk_embed: true
+use_spk_id: false
+use_split_spk_id: false
+use_txt_cond: true
+use_uv: true
+use_var_enc: false
+use_word: true
+vae_dropout: 0.0
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+var_enc_vq_codes: 64
+vocoder: hifigan
+vocoder_ckpt: checkpoints/trainset_hifigan
+vocoder_denoise_c: 0.0
+vq_start: 20500
+warmup_updates: 2000
+weight_decay: 0
+win_size: 1024
+word_size: 30000
+work_dir: checkpoints/GenerSpeech
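
Editor's note: keys such as task_cls, binarizer_cls, and pre_align_cls in the config above are dotted import paths that the training framework resolves to Python classes at runtime. A minimal, generic sketch of reading the config and resolving one of those strings, assuming only PyYAML and the standard library (the project ships its own hparams utilities, which this does not reproduce):

```python
import importlib
import yaml

def load_config(path: str) -> dict:
    """Read a checkpoint config such as checkpoints/GenerSpeech/config.yaml."""
    with open(path) as f:
        return yaml.safe_load(f)

def resolve_class(dotted_path: str):
    """Turn a dotted string like 'modules.GenerSpeech.task.generspeech.GenerSpeechTask'
    into the class object it names."""
    module_name, cls_name = dotted_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), cls_name)

# Example (paths/classes as listed in the config above):
# cfg = load_config("checkpoints/GenerSpeech/config.yaml")
# task_cls = resolve_class(cfg["task_cls"])
```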
diff --git a/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt b/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..def291d926fe008dc220e775ee525cdfe501d7c8
--- /dev/null
+++ b/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b872bb686013cee2a98cc610b8b66b788c46ff4c33130682b63af4ac005405ea
+size 619582860
diff --git a/checkpoints/trainset_hifigan/config.yaml b/checkpoints/trainset_hifigan/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..df5c3117c000dd2d20637f52dc11b87b653142e2
--- /dev/null
+++ b/checkpoints/trainset_hifigan/config.yaml
@@ -0,0 +1,178 @@
+accumulate_grad_batches: 1
+adam_b1: 0.8
+adam_b2: 0.99
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 16000
+aux_context_window: 0
+base_config:
+- egs/egs_bases/tts/vocoder/hifigan.yaml
+- egs/datasets/audio/emotion/base_text2mel.yaml
+binarization_args:
+ reset_phone_dict: true
+ reset_word_dict: true
+ shuffle: true
+ trim_eos_bos: false
+ trim_sil: false
+ with_align: false
+ with_f0: true
+ with_f0cwt: false
+ with_linear: false
+ with_spk_embed: false
+ with_spk_id: true
+ with_txt: false
+ with_wav: true
+ with_word: false
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+binary_data_dir: data/binary/training_set
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+ lr: 0.0002
+discriminator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+dropout: 0.1
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+generator_optimizer_params:
+ lr: 0.0002
+generator_scheduler_params:
+ gamma: 0.999
+ step_size: 600
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+lambda_adv: 1.0
+lambda_cdisc: 4.0
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+load_ckpt: ''
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_samples: 8192
+max_sentences: 24
+max_tokens: 30000
+max_updates: 1000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 128
+min_level_db: -100
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: -1
+num_spk: 10
+num_test_samples: 30
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pitch_type: frame
+pre_align_args:
+ allow_no_txt: false
+ denoise: false
+ sox_resample: false
+ sox_to_wav: false
+ trim_sil: false
+ txt_processor: en
+ use_tone: true
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+print_nan_grads: false
+processed_data_dir: data/processed/emotion,data/processed/LibriTTS
+profile_infer: false
+raw_data_dir: data/raw/ESD
+ref_level_db: 20
+rename_tmux: true
+resblock: '1'
+resblock_dilation_sizes:
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+- - 1
+ - 3
+ - 5
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 200
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_initial_channel: 512
+upsample_kernel_sizes:
+- 16
+- 16
+- 4
+- 4
+upsample_rates:
+- 8
+- 8
+- 2
+- 2
+use_cdisc: false
+use_cond_disc: false
+use_emotion: true
+use_fm_loss: false
+use_ms_stft: false
+use_pitch_embed: false
+use_spec_disc: false
+use_spk_embed: false
+use_spk_id: true
+use_split_spk_id: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 1024
+window: hann
+word_size: 30000
+work_dir: checkpoints/trainset_hifigan
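
Editor's note: for a HiFi-GAN vocoder config like the one above, the generator's total upsampling factor (the product of upsample_rates, here 8 * 8 * 2 * 2 = 256) has to equal hop_size so that one mel frame expands to exactly hop_size waveform samples, and upsample_kernel_sizes needs one entry per rate. A small consistency check under those assumptions (the function name is illustrative):

```python
import math
import yaml

def check_hifigan_config(path: str) -> None:
    """Sanity-check the vocoder geometry declared in a config.yaml."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    upsample_total = math.prod(cfg["upsample_rates"])        # 8 * 8 * 2 * 2 = 256
    assert upsample_total == cfg["hop_size"], (upsample_total, cfg["hop_size"])
    assert len(cfg["upsample_rates"]) == len(cfg["upsample_kernel_sizes"])

# check_hifigan_config("checkpoints/trainset_hifigan/config.yaml")
```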
diff --git a/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt b/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..9c71c2b0d75bd2867111cf7401bf8c7e0b77b03c
--- /dev/null
+++ b/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a2577919899400a111ef42a2aba65797d282c259d083d2c276539dda9d17870
+size 1016199247
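
Editor's note: once the LFS blobs have been fetched, these .ckpt files are ordinary PyTorch checkpoints. A hedged sketch of inspecting one on CPU; it makes no assumption about the key layout the training code saved:

```python
import torch

# Assumes `git lfs pull` has replaced the 3-line pointer with the real ~1 GB blob.
ckpt = torch.load(
    "checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt",
    map_location="cpu",
)
print(type(ckpt), list(ckpt)[:10] if isinstance(ckpt, dict) else "non-dict checkpoint")
```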