diff --git a/audio_detection/__init__.py b/audio_detection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detection/audio_infer/__init__.py b/audio_detection/audio_infer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc b/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0be763e4f05595b0b8fc1819a5ce5d665e6a7e6d Binary files /dev/null and b/audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv new file mode 100644 index 0000000000000000000000000000000000000000..48d8522774b0127d4b585c18fb7da54a9fcbc248 --- /dev/null +++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv @@ -0,0 +1,1350 @@ +-JMT0mK0Dbg_30.000_40.000.wav 30.000 40.000 Train horn +3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train horn +3S2-TODd__k_90.000_100.000.wav 90.000 100.000 Train horn +3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train horn +3jXAh3V2FO8_30.000_40.000.wav 30.000 40.000 Train horn +53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train horn +8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train horn +8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train horn +9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train horn +AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train horn +Ag_zT74ZGNc_9.000_19.000.wav 9.000 19.000 Train horn +BQpa8whzwAE_30.000_40.000.wav 30.000 40.000 Train horn +CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train horn +CLIdVCUO_Vw_30.000_40.000.wav 30.000 40.000 Train horn +D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train horn +GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train horn +I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train horn +IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train horn +L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train horn +LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train horn +MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train horn +MPSf7dJpV5w_30.000_40.000.wav 30.000 40.000 Train horn +NdCr5IDnkxc_30.000_40.000.wav 30.000 40.000 Train horn +P54KKbTA_TE_0.000_7.000.wav 0.000 7.000 Train horn +PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train horn +QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train horn +R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train horn +Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train horn +TBjrN1aMRrM_30.000_40.000.wav 30.000 40.000 Train horn +XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train horn +XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train horn +Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train horn +Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train horn +YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Train horn +ZcTI8fQgEZE_240.000_250.000.wav 240.000 250.000 Train horn +_8MvhMlbwiE_40.000_50.000.wav 40.000 50.000 Train horn +_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train horn +aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train horn +arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train horn +d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train horn +dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train horn +ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Train horn +g4cA-ifQc70_30.000_40.000.wav 
30.000 40.000 Train horn +g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train horn +gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train horn +hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train horn +iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train horn +k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train horn +lKQ-I_P7TEM_20.000_30.000.wav 20.000 30.000 Train horn +nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train horn +pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train horn +pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train horn +q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train horn +qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train horn +stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train horn +tdRMxc4UWRk_30.000_40.000.wav 30.000 40.000 Train horn +tu-cxDG2mW8_0.000_10.000.wav 0.000 10.000 Train horn +txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train horn +xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train horn +yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train horn +-WoudI3gGvk_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +0_gci63CtFY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +3NX4HaOVBoo_240.000_250.000.wav 240.000 250.000 Air horn, truck horn +9NPKQDaNCRk_0.000_6.000.wav 0.000 6.000 Air horn, truck horn +9ct4w4aYWdc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +9l9QXgsJSfo_120.000_130.000.wav 120.000 130.000 Air horn, truck horn +CN0Bi4MDpA4_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +Cg-DWc9nPfQ_90.000_100.000.wav 90.000 100.000 Air horn, truck horn +D62L3husEa0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +Hk7HqLBHWng_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +IpyingiCwV8_0.000_3.000.wav 0.000 3.000 Air horn, truck horn +Isuh9pOuH6I_300.000_310.000.wav 300.000 310.000 Air horn, truck horn +IuTfMfzkr5Y_120.000_130.000.wav 120.000 130.000 Air horn, truck horn +MFxsgcZZtFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +N3osL4QmOL8_49.000_59.000.wav 49.000 59.000 Air horn, truck horn +NOZsDTFLm7M_0.000_9.000.wav 0.000 9.000 Air horn, truck horn +OjVY3oM1jEU_40.000_50.000.wav 40.000 50.000 Air horn, truck horn +PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Air horn, truck horn +TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +UdHR1P_NIbo_110.000_120.000.wav 110.000 120.000 Air horn, truck horn +YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Air horn, truck horn +Yt4ZWNjvJOY_50.000_60.000.wav 50.000 60.000 Air horn, truck horn +Z5M3fGT3Xjk_60.000_70.000.wav 60.000 70.000 Air horn, truck horn +ZauRsP1uH74_12.000_22.000.wav 12.000 22.000 Air horn, truck horn +a_6CZ2JaEuc_0.000_2.000.wav 0.000 2.000 Air horn, truck horn +b7m5Kt5U7Vc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +bIObkrK06rk_15.000_25.000.wav 15.000 25.000 Air horn, truck horn +cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Air horn, truck horn +ckSYn557ZyE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +cs-RPPsg_ks_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +ctsq33oUBT8_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +eCFUwyU9ZWA_9.000_19.000.wav 9.000 19.000 Air horn, truck horn +ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Air horn, truck horn +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +gjlo4evwjlE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Air horn, truck horn +ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +jko48cNdvFA_80.000_90.000.wav 80.000 90.000 Air horn, truck horn +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +kUrb38hMwPs_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +km_hVyma2vo_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +m1e9aOwRiDQ_0.000_9.000.wav 0.000 9.000 Air horn, truck horn +mQJcObz1k_E_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +pk75WDyNZKc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +suuYwAifIAQ_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +wDdEZ46B-tM_460.000_470.000.wav 460.000 470.000 Air horn, truck horn +wHISHmuP58s_80.000_90.000.wav 80.000 90.000 Air horn, truck horn +xwqIKDz1bT4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +y4Ko6VNiqB0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +yhcmPrU3QSk_61.000_71.000.wav 61.000 71.000 Air horn, truck horn +3FWHjjZGT9U_80.000_90.000.wav 80.000 90.000 Car alarm +3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car alarm +3YRkin3bMlQ_170.000_180.000.wav 170.000 180.000 Car alarm +4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Car alarm +4JDah6Ckr9k_5.000_15.000.wav 5.000 15.000 Car alarm +5hL1uGb4sas_30.000_40.000.wav 30.000 40.000 Car alarm +969Zfj4IoPk_20.000_30.000.wav 20.000 30.000 Car alarm +AyfuBDN3Vdw_40.000_50.000.wav 40.000 50.000 Car alarm +B-ZqhRg3km4_60.000_70.000.wav 60.000 70.000 Car alarm +BDnwA3AaclE_10.000_20.000.wav 10.000 20.000 Car alarm +ES-rjFfuxq4_120.000_130.000.wav 120.000 130.000 Car alarm +EWbZq5ruCpg_0.000_10.000.wav 0.000 10.000 Car alarm +F50h9HiyC3k_40.000_50.000.wav 40.000 50.000 Car alarm +F5AP8kQvogM_30.000_40.000.wav 30.000 40.000 Car alarm +FKJuDOAumSk_20.000_30.000.wav 20.000 30.000 Car alarm +GmbNjZi4xBw_30.000_40.000.wav 30.000 40.000 Car alarm +H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Car alarm +Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car alarm +IziTYkSwq9Q_30.000_40.000.wav 30.000 40.000 Car alarm +JcO2TTtiplA_30.000_40.000.wav 30.000 40.000 Car alarm +KKx7dWRg8s8_8.000_18.000.wav 8.000 18.000 Car alarm +Kf9Kr69mwOA_14.000_24.000.wav 14.000 24.000 Car alarm +L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car alarm +LOjT44tFx1A_0.000_10.000.wav 0.000 10.000 Car alarm +Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car alarm +Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Car alarm +QNKo1W1WRbc_22.000_32.000.wav 22.000 32.000 Car alarm +R0VxYDfjyAU_60.000_70.000.wav 60.000 70.000 Car alarm +TJ58vMpSy1w_30.000_40.000.wav 30.000 40.000 Car alarm +ToU1kRagUjY_0.000_10.000.wav 0.000 10.000 Car alarm +TrQGIZqrW0s_30.000_40.000.wav 30.000 40.000 Car alarm +ULFhHR0OLSE_30.000_40.000.wav 30.000 40.000 Car alarm +ULS3ffQkCW4_30.000_40.000.wav 30.000 40.000 Car alarm +U_9NuNORYQM_1.000_11.000.wav 1.000 11.000 Car alarm +UkCEuwYUW8c_110.000_120.000.wav 110.000 120.000 Car alarm +Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Car alarm +XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car alarm +Y-4dtrP-RNo_7.000_17.000.wav 7.000 17.000 Car alarm +Zltlj0fDeS4_30.000_40.000.wav 30.000 40.000 Car alarm +cB1jkzgH2es_150.000_160.000.wav 150.000 160.000 Car alarm +eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Car alarm +eL7s5CoW0UA_0.000_7.000.wav 0.000 7.000 
Car alarm +i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Car alarm +iWl-5LNURFc_30.000_40.000.wav 30.000 40.000 Car alarm +iX34nDCq9NU_10.000_20.000.wav 10.000 20.000 Car alarm +ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car alarm +l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car alarm +lhedRVb85Fk_30.000_40.000.wav 30.000 40.000 Car alarm +monelE7hnwI_20.000_30.000.wav 20.000 30.000 Car alarm +o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car alarm +pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car alarm +stnVta2ip9g_30.000_40.000.wav 30.000 40.000 Car alarm +uvuVg9Cl0n0_30.000_40.000.wav 30.000 40.000 Car alarm +vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car alarm +vN7dJyt-nj0_20.000_30.000.wav 20.000 30.000 Car alarm +w8Md65mE5Vc_30.000_40.000.wav 30.000 40.000 Car alarm +ySqfMcFk5LM_30.000_40.000.wav 30.000 40.000 Car alarm +ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Car alarm +za8KPcQ0dTw_30.000_40.000.wav 30.000 40.000 Car alarm +-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Reversing beeps +-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Reversing beeps +-oSzD8P2BtU_30.000_40.000.wav 30.000 40.000 Reversing beeps +-pzwalZ0ub0_5.000_15.000.wav 5.000 15.000 Reversing beeps +-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Reversing beeps +-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Reversing beeps +077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Reversing beeps +0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Reversing beeps +10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Reversing beeps +1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Reversing beeps +1n_s2Gb5R1Q_30.000_40.000.wav 30.000 40.000 Reversing beeps +2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Reversing beeps +2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Reversing beeps +2WTk_j_fivY_30.000_40.000.wav 30.000 40.000 Reversing beeps +38F6eeIR-s0_30.000_40.000.wav 30.000 40.000 Reversing beeps +3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Reversing beeps +4MIHbR4QZhE_30.000_40.000.wav 30.000 40.000 Reversing beeps +4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Reversing beeps +4XMY2IvVSf0_30.000_40.000.wav 30.000 40.000 Reversing beeps +4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Reversing beeps +4t1VqRz4w2g_30.000_40.000.wav 30.000 40.000 Reversing beeps +4tKvAMmAUMM_30.000_40.000.wav 30.000 40.000 Reversing beeps +5-x2pk3YYAs_11.000_21.000.wav 11.000 21.000 Reversing beeps +5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Reversing beeps +5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Reversing beeps +5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Reversing beeps +5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Reversing beeps +6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Reversing beeps +6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Reversing beeps +6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Reversing beeps +7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Reversing beeps +7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Reversing beeps +81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Reversing beeps +96a4smrM_30_30.000_40.000.wav 30.000 40.000 Reversing beeps +9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Reversing beeps +9OcAwC8y-eQ_30.000_40.000.wav 30.000 40.000 Reversing beeps +9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Reversing beeps +9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Reversing beeps +A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Reversing beeps +AFwmMFq_xlc_390.000_400.000.wav 390.000 400.000 Reversing beeps +AvhBRiwWJU4_30.000_40.000.wav 30.000 40.000 Reversing beeps +CL5vkiMs2c0_10.000_20.000.wav 10.000 20.000 Reversing beeps +DcU6AzN7imA_210.000_220.000.wav 210.000 
220.000 Reversing beeps +ISBJKY8hwnM_30.000_40.000.wav 30.000 40.000 Reversing beeps +LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Reversing beeps +NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Reversing beeps +PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Reversing beeps +Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Reversing beeps +_gG0KNGD47M_30.000_40.000.wav 30.000 40.000 Reversing beeps +ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Reversing beeps +eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Reversing beeps +kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Reversing beeps +mCJ0aqIygWE_24.000_34.000.wav 24.000 34.000 Reversing beeps +nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Reversing beeps +nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Reversing beeps +oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Reversing beeps +saPU2JNoytU_0.000_10.000.wav 0.000 10.000 Reversing beeps +tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Reversing beeps +vzP6soELj2Q_0.000_10.000.wav 0.000 10.000 Reversing beeps +0x82_HySIVU_30.000_40.000.wav 30.000 40.000 Bicycle +1IQdvfm9SDY_30.000_40.000.wav 30.000 40.000 Bicycle +1_hGvbEiYAs_30.000_40.000.wav 30.000 40.000 Bicycle +26CM8IXODG4_2.000_12.000.wav 2.000 12.000 Bicycle +2f7Ad-XpbnY_30.000_40.000.wav 30.000 40.000 Bicycle +3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Bicycle +7KiTXYwaD04_7.000_17.000.wav 7.000 17.000 Bicycle +7gkjn-LLInI_30.000_40.000.wav 30.000 40.000 Bicycle +84flVacRHUI_21.000_31.000.wav 21.000 31.000 Bicycle +9VziOIkNXsE_30.000_40.000.wav 30.000 40.000 Bicycle +ANofTuuN0W0_160.000_170.000.wav 160.000 170.000 Bicycle +B6n0op0sLPA_30.000_40.000.wav 30.000 40.000 Bicycle +D4_zTwsCRds_60.000_70.000.wav 60.000 70.000 Bicycle +DEs_Sp9S1Nw_30.000_40.000.wav 30.000 40.000 Bicycle +GjsxrMRRdfQ_3.000_13.000.wav 3.000 13.000 Bicycle +GkpUU3VX4wQ_30.000_40.000.wav 30.000 40.000 Bicycle +H9HNXYxRmv8_30.000_40.000.wav 30.000 40.000 Bicycle +HPWRKwrs-rY_370.000_380.000.wav 370.000 380.000 Bicycle +HrQxbNO5jXU_6.000_16.000.wav 6.000 16.000 Bicycle +IYaEZkAO0LU_30.000_40.000.wav 30.000 40.000 Bicycle +Idzfy0XbZRo_7.000_17.000.wav 7.000 17.000 Bicycle +Iigfz_GeXVs_30.000_40.000.wav 30.000 40.000 Bicycle +JWCtQ_94YoQ_30.000_40.000.wav 30.000 40.000 Bicycle +JXmBrD4b4EI_30.000_40.000.wav 30.000 40.000 Bicycle +LSZPNwZex9s_30.000_40.000.wav 30.000 40.000 Bicycle +M5kwg1kx4q0_30.000_40.000.wav 30.000 40.000 Bicycle +NrR1wmCpqAk_12.000_22.000.wav 12.000 22.000 Bicycle +O1_Rw2dHb1I_2.000_12.000.wav 2.000 12.000 Bicycle +OEN0TySl1Jw_10.000_20.000.wav 10.000 20.000 Bicycle +PF7uY9ydMYc_30.000_40.000.wav 30.000 40.000 Bicycle +SDl0tWf9Q44_30.000_40.000.wav 30.000 40.000 Bicycle +SkXXjcw9sJI_30.000_40.000.wav 30.000 40.000 Bicycle +Ssa1m5Mnllw_0.000_9.000.wav 0.000 9.000 Bicycle +UB-A1oyNyyg_0.000_6.000.wav 0.000 6.000 Bicycle +UqyvFyQthHo_30.000_40.000.wav 30.000 40.000 Bicycle +Wg4ik5zZxBc_250.000_260.000.wav 250.000 260.000 Bicycle +WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bicycle +YIJBuXUi64U_30.000_40.000.wav 30.000 40.000 Bicycle +aBHdl_TiseI_30.000_40.000.wav 30.000 40.000 Bicycle +aeHCq6fFkNo_30.000_40.000.wav 30.000 40.000 Bicycle +amKDjVcs1Vg_30.000_40.000.wav 30.000 40.000 Bicycle +ehYwty_G2L4_13.000_23.000.wav 13.000 23.000 Bicycle +jOlVJv7jAHg_30.000_40.000.wav 30.000 40.000 Bicycle +lGFDQ-ZwUfk_30.000_40.000.wav 30.000 40.000 Bicycle +lmTHvLGQy3g_50.000_60.000.wav 50.000 60.000 Bicycle +nNHW3Uxlb-g_30.000_40.000.wav 30.000 40.000 Bicycle +o98R4ruf8kw_30.000_40.000.wav 30.000 40.000 Bicycle +oiLHBkHgkAo_0.000_8.000.wav 0.000 8.000 Bicycle 
+qL0ESQcaPhM_30.000_40.000.wav 30.000 40.000 Bicycle +qjz5t9M4YCw_30.000_40.000.wav 30.000 40.000 Bicycle +qrCWPsqG9vA_30.000_40.000.wav 30.000 40.000 Bicycle +r06tmeUDgc8_3.000_13.000.wav 3.000 13.000 Bicycle +sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Bicycle +tKdRlWz-1pg_30.000_40.000.wav 30.000 40.000 Bicycle +uNpSMpqlkMA_0.000_10.000.wav 0.000 10.000 Bicycle +vOYj9W7Jsxk_8.000_18.000.wav 8.000 18.000 Bicycle +xBKrmKdjAIA_0.000_10.000.wav 0.000 10.000 Bicycle +xfNeZaw4o3U_17.000_27.000.wav 17.000 27.000 Bicycle +xgiJqbhhU3c_30.000_40.000.wav 30.000 40.000 Bicycle +0vg9qxNKXOw_30.000_40.000.wav 30.000 40.000 Skateboard +10YXuv9Go0E_140.000_150.000.wav 140.000 150.000 Skateboard +3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Skateboard +6kXUG1Zo6VA_0.000_10.000.wav 0.000 10.000 Skateboard +84fDGWoRtsU_210.000_220.000.wav 210.000 220.000 Skateboard +8kbHA22EWd0_330.000_340.000.wav 330.000 340.000 Skateboard +8m-a_6wLTkU_230.000_240.000.wav 230.000 240.000 Skateboard +9QwaP-cvdeU_360.000_370.000.wav 360.000 370.000 Skateboard +9ZYj5toEbGA_0.000_10.000.wav 0.000 10.000 Skateboard +9gkppwB5CXA_30.000_40.000.wav 30.000 40.000 Skateboard +9hlXgXWXYXQ_0.000_6.000.wav 0.000 6.000 Skateboard +ALxn5-2bVyI_30.000_40.000.wav 30.000 40.000 Skateboard +ANPjV_rudog_30.000_40.000.wav 30.000 40.000 Skateboard +ATAL-_Dblvg_0.000_7.000.wav 0.000 7.000 Skateboard +An-4jPvUT14_60.000_70.000.wav 60.000 70.000 Skateboard +BGR0QnX4k6w_30.000_40.000.wav 30.000 40.000 Skateboard +BlhUt8AJJO8_30.000_40.000.wav 30.000 40.000 Skateboard +CD7INyI79fM_170.000_180.000.wav 170.000 180.000 Skateboard +CNcxzB9F-Q8_100.000_110.000.wav 100.000 110.000 Skateboard +DqOGYyFVnKk_200.000_210.000.wav 200.000 210.000 Skateboard +E0gBwPTHxqE_30.000_40.000.wav 30.000 40.000 Skateboard +E3XIdP8kxwg_110.000_120.000.wav 110.000 120.000 Skateboard +FQZnQhiM41U_0.000_6.000.wav 0.000 6.000 Skateboard +FRwFfq3Tl1g_310.000_320.000.wav 310.000 320.000 Skateboard +JJo971B_eDg_30.000_40.000.wav 30.000 40.000 Skateboard +KXkxqxoCylc_30.000_40.000.wav 30.000 40.000 Skateboard +L4Z7XkS6CtA_30.000_40.000.wav 30.000 40.000 Skateboard +LjEqr0Z7xm0_0.000_6.000.wav 0.000 6.000 Skateboard +MAbDEeLF4cQ_30.000_40.000.wav 30.000 40.000 Skateboard +MUBbiivNYZs_30.000_40.000.wav 30.000 40.000 Skateboard +Nq8GyBrTI8Y_30.000_40.000.wav 30.000 40.000 Skateboard +PPq9QZmV7jc_25.000_35.000.wav 25.000 35.000 Skateboard +PVgL5wFOKMs_30.000_40.000.wav 30.000 40.000 Skateboard +Tcq_xAdCMr4_30.000_40.000.wav 30.000 40.000 Skateboard +UtZofZjccBs_290.000_300.000.wav 290.000 300.000 Skateboard +VZfrDZhI7BU_30.000_40.000.wav 30.000 40.000 Skateboard +WxChkRrVOIs_0.000_7.000.wav 0.000 7.000 Skateboard +YV0noe1sZAs_150.000_160.000.wav 150.000 160.000 Skateboard +YjScrri_F7U_0.000_10.000.wav 0.000 10.000 Skateboard +YrGQKTbiG1g_30.000_40.000.wav 30.000 40.000 Skateboard +ZM67kt6G-d4_30.000_40.000.wav 30.000 40.000 Skateboard +ZaUaqnLdg6k_30.000_40.000.wav 30.000 40.000 Skateboard +ZhpkRcAEJzc_3.000_13.000.wav 3.000 13.000 Skateboard +_43OOP6UEw0_30.000_40.000.wav 30.000 40.000 Skateboard +_6Fyave4jqA_260.000_270.000.wav 260.000 270.000 Skateboard +aOoZ0bCoaZw_30.000_40.000.wav 30.000 40.000 Skateboard +gV6y9L24wWg_0.000_10.000.wav 0.000 10.000 Skateboard +hHb0Eq1I7Fk_0.000_10.000.wav 0.000 10.000 Skateboard +lGf_L6i6AZI_20.000_30.000.wav 20.000 30.000 Skateboard +leOH87itNWM_30.000_40.000.wav 30.000 40.000 Skateboard +mIkW7mWlnXw_30.000_40.000.wav 30.000 40.000 Skateboard +qadmKrM0ppo_20.000_30.000.wav 20.000 30.000 Skateboard +rLUIHCc4b9A_0.000_7.000.wav 0.000 7.000 
Skateboard +u3vBJgEVJvk_0.000_10.000.wav 0.000 10.000 Skateboard +vHKBrtPDSvA_150.000_160.000.wav 150.000 160.000 Skateboard +wWmydRt0Z-w_21.000_31.000.wav 21.000 31.000 Skateboard +xeHt-R5ScmI_0.000_10.000.wav 0.000 10.000 Skateboard +xqGtIVeeXY4_330.000_340.000.wav 330.000 340.000 Skateboard +y_lfY0uzmr0_30.000_40.000.wav 30.000 40.000 Skateboard +02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +7eeN-fXbso8_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +8qMHvgA9mGw_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +BGp9-Ro5h8Y_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +F9Dbcxr-lAI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +R8G5Y0HASxY_60.000_70.000.wav 60.000 70.000 Ambulance (siren) +RVTKY5KR3ME_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +Sm0pPvXPA9U_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +ZxlbI2Rj1VY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +bA8mt0JI0Ko_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +cR79KnWpiQA_70.000_80.000.wav 70.000 80.000 Ambulance (siren) +dPcw4R5lczw_500.000_510.000.wav 500.000 510.000 Ambulance (siren) +epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Ambulance (siren) +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +iSnWMz4FUAg_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ke35yF1LHs4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +oPR7tUEUptk_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 
Ambulance (siren) +s0iddDFzL9s_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +tcKlq7_cOkw_8.000_18.000.wav 8.000 18.000 Ambulance (siren) +u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +zbiJEml563w_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Fire engine, fire truck (siren) +-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +0K1mroXg8bs_9.000_19.000.wav 9.000 19.000 Fire engine, fire truck (siren) +0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Fire engine, fire truck (siren) +3h3_IZWhX0g_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) +5fjy_2ajEkg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Fire engine, fire truck (siren) +ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Fire engine, fire truck (siren) +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Bs2KqqI9F_k_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +D4M3YT75ZrQ_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren) +DWXQ_cSUW98_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +DpagxUQwXDo_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +FFSI6Bg2M-Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +GbIuxmaiCOk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Fire engine, fire truck (siren) +VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Xggsbzzes3M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) +ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 
Fire engine, fire truck (siren) +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +k2a30--j37Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +pvYwIdGrS90_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +wD0P-doqkXo_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren) +xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +zpzJKMG5iGc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Civil defense siren +0CJFt950vOk_30.000_40.000.wav 30.000 40.000 Civil defense siren +0phl6nlC-n0_10.000_20.000.wav 10.000 20.000 Civil defense siren +1jhbNtCWC9w_50.000_60.000.wav 50.000 60.000 Civil defense siren +4Ukj2TTJxHM_30.000_40.000.wav 30.000 40.000 Civil defense siren +4XAVaSz_P7c_150.000_160.000.wav 150.000 160.000 Civil defense siren +69AIBPnJN5E_0.000_10.000.wav 0.000 10.000 Civil defense siren +8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Civil defense siren +8ILgvaJVPCI_30.000_40.000.wav 30.000 40.000 Civil defense siren +9MWHXCLAX8I_30.000_40.000.wav 30.000 40.000 Civil defense siren +A5y-aZc0CiM_30.000_40.000.wav 30.000 40.000 Civil defense siren +AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Civil defense siren +AVBUh6qeHrQ_30.000_40.000.wav 30.000 40.000 Civil defense siren +BhQPDafekdw_30.000_40.000.wav 30.000 40.000 Civil defense siren +CJXNdudcJrs_30.000_40.000.wav 30.000 40.000 Civil defense siren +CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Civil defense siren +DdZw0XDv0JI_30.000_40.000.wav 30.000 40.000 Civil defense siren +DgWHUawAGnI_30.000_40.000.wav 30.000 40.000 Civil defense siren +Do9Dffb6vHA_30.000_40.000.wav 30.000 40.000 Civil defense siren +GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Civil defense siren +GeRgy4of730_30.000_40.000.wav 30.000 40.000 Civil defense siren +IIypdzgZAaI_30.000_40.000.wav 30.000 40.000 Civil defense siren +JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Civil defense siren +JqHJ7015aWM_30.000_40.000.wav 30.000 40.000 Civil defense siren +K7a1P4RX_5w_30.000_40.000.wav 30.000 40.000 Civil defense siren +KrTocA-I550_190.000_200.000.wav 190.000 200.000 Civil defense siren +KumYcZVLOVU_350.000_360.000.wav 350.000 360.000 Civil defense siren +L60HS_jbZu0_30.000_40.000.wav 30.000 40.000 Civil defense siren +MZ1Yh6mRC-E_30.000_40.000.wav 30.000 40.000 Civil defense siren +R8XUrRCFkzs_30.000_40.000.wav 30.000 40.000 Civil defense siren 
+SyWbolNFst4_60.000_70.000.wav 60.000 70.000 Civil defense siren +TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Civil defense siren +Tx6eSkU2lKc_30.000_40.000.wav 30.000 40.000 Civil defense siren +VcflBZLflSU_130.000_140.000.wav 130.000 140.000 Civil defense siren +WXsTHg_DiYA_30.000_40.000.wav 30.000 40.000 Civil defense siren +Wz5ffJxCElQ_10.000_20.000.wav 10.000 20.000 Civil defense siren +X2MlmcY8UZU_30.000_40.000.wav 30.000 40.000 Civil defense siren +XYLheTmlEYI_30.000_40.000.wav 30.000 40.000 Civil defense siren +YyxlD_FwZXM_30.000_40.000.wav 30.000 40.000 Civil defense siren +adCuLs-4nmI_30.000_40.000.wav 30.000 40.000 Civil defense siren +cPjtrTq3F-I_30.000_40.000.wav 30.000 40.000 Civil defense siren +eHDm93tI4Ok_30.000_40.000.wav 30.000 40.000 Civil defense siren +etppP5Sdo14_30.000_40.000.wav 30.000 40.000 Civil defense siren +fRKxUc1gQBw_50.000_60.000.wav 50.000 60.000 Civil defense siren +feIue4LHzfM_30.000_40.000.wav 30.000 40.000 Civil defense siren +gr-Yen6Sj_Q_0.000_10.000.wav 0.000 10.000 Civil defense siren +hl3Kqi9Wi_g_30.000_40.000.wav 30.000 40.000 Civil defense siren +iKca2cbowd4_30.000_40.000.wav 30.000 40.000 Civil defense siren +kzFyGWdj6MI_30.000_40.000.wav 30.000 40.000 Civil defense siren +m3LGopSVju4_30.000_40.000.wav 30.000 40.000 Civil defense siren +ne4IMxs-hMk_30.000_40.000.wav 30.000 40.000 Civil defense siren +nuu2iNisoQc_6.000_16.000.wav 6.000 16.000 Civil defense siren +oYeql9xE19k_30.000_40.000.wav 30.000 40.000 Civil defense siren +rGUrM19BnJ8_110.000_120.000.wav 110.000 120.000 Civil defense siren +u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Civil defense siren +uCRAnDBXxgI_30.000_40.000.wav 30.000 40.000 Civil defense siren +vQG4HZR2KSk_30.000_40.000.wav 30.000 40.000 Civil defense siren +vjsG5b2yNzc_190.000_200.000.wav 190.000 200.000 Civil defense siren +yO7guxGY-_k_30.000_40.000.wav 30.000 40.000 Civil defense siren +-9GUUhB3QV0_30.000_40.000.wav 30.000 40.000 Police car (siren) +-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Police car (siren) +-UBVqmhbT50_30.000_40.000.wav 30.000 40.000 Police car (siren) +-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Police car (siren) +0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Police car (siren) +0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Police car (siren) +145N68nh4m0_120.000_130.000.wav 120.000 130.000 Police car (siren) +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Police car (siren) +31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Police car (siren) +5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Police car (siren) +6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Police car (siren) +8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Police car (siren) +8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Police car (siren) +8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Police car (siren) +9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Police car (siren) +9OFUd38sBNM_0.000_8.000.wav 0.000 8.000 Police car (siren) +AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Police car (siren) +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Police car (siren) +CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Police car (siren) +DK_6C29B2zs_14.000_24.000.wav 14.000 24.000 Police car (siren) +GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Police car (siren) +GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Police car (siren) +H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Police car (siren) +H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Police car (siren) +H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Police car (siren) +IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Police car 
(siren) +InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Police car (siren) +JgDuU9kpHpM_30.000_40.000.wav 30.000 40.000 Police car (siren) +JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Police car (siren) +LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Police car (siren) +PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Police car (siren) +QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Police car (siren) +Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Police car (siren) +YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Police car (siren) +Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Police car (siren) +ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Police car (siren) +ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Police car (siren) +ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Police car (siren) +bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Police car (siren) +eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Police car (siren) +epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Police car (siren) +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Police car (siren) +fNcrlqPrAqM_30.000_40.000.wav 30.000 40.000 Police car (siren) +g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Police car (siren) +gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Police car (siren) +iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Police car (siren) +iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Police car (siren) +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Police car (siren) +kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Police car (siren) +lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Police car (siren) +mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Police car (siren) +mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Police car (siren) +pzup58Eyhuo_30.000_40.000.wav 30.000 40.000 Police car (siren) +rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Police car (siren) +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Police car (siren) +u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Police car (siren) +u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Police car (siren) +u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Police car (siren) +uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Police car (siren) +vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Police car (siren) +xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Police car (siren) +z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Police car (siren) +-FKrYTj_eCU_0.000_10.000.wav 0.000 10.000 Screaming +0G50t4FlbIA_60.000_70.000.wav 60.000 70.000 Screaming +1LTxZ2aNytc_30.000_40.000.wav 30.000 40.000 Screaming +2FEhG1UXb_E_370.000_380.000.wav 370.000 380.000 Screaming +45vBbOhzS6g_50.000_60.000.wav 50.000 60.000 Screaming +4PYTtp78Ig0_60.000_70.000.wav 60.000 70.000 Screaming +5QNq0IEPICQ_30.000_40.000.wav 30.000 40.000 Screaming +5YcIJuYQECc_0.000_6.000.wav 0.000 6.000 Screaming +5kQF4r03yRI_0.000_6.000.wav 0.000 6.000 Screaming +7ARVgI_wx5Y_30.000_40.000.wav 30.000 40.000 Screaming +AIFvFuZPr68_30.000_40.000.wav 30.000 40.000 Screaming +Aw43FUCkIb8_20.000_30.000.wav 20.000 30.000 Screaming +AxM2BofYfPY_30.000_40.000.wav 30.000 40.000 Screaming +BFqHyCoypfM_16.000_26.000.wav 16.000 26.000 Screaming +Bk_xS_fKCpk_30.000_40.000.wav 30.000 40.000 Screaming +C4YMjmJ7tt4_90.000_100.000.wav 90.000 100.000 Screaming +CMWoAvgD0A0_9.000_19.000.wav 9.000 19.000 Screaming +DZfYFhywhRs_30.000_40.000.wav 30.000 40.000 Screaming +ElJFYwRtrH4_30.000_40.000.wav 30.000 40.000 Screaming +FcUVtXJMkJs_30.000_40.000.wav 30.000 40.000 Screaming +G--718JDmAQ_0.000_10.000.wav 0.000 10.000 Screaming +GPJ1uQwmNHk_30.000_40.000.wav 30.000 40.000 Screaming +H3vSRzkG82U_30.000_40.000.wav 30.000 40.000 
Screaming +HS28EUWt8dE_110.000_120.000.wav 110.000 120.000 Screaming +KkGTB8ESMCM_0.000_10.000.wav 0.000 10.000 Screaming +MQ0YasvMcuQ_1.000_11.000.wav 1.000 11.000 Screaming +Msl9dI5yweA_90.000_100.000.wav 90.000 100.000 Screaming +Ntn6YvZM3kA_0.000_10.000.wav 0.000 10.000 Screaming +NwTHlpXdk4M_30.000_40.000.wav 30.000 40.000 Screaming +OHjfSfqa804_0.000_10.000.wav 0.000 10.000 Screaming +OzWJuqG2F3Y_30.000_40.000.wav 30.000 40.000 Screaming +QDW_uCMnMMU_0.000_8.000.wav 0.000 8.000 Screaming +SxI3Lnzzmkw_110.000_120.000.wav 110.000 120.000 Screaming +TVvbfuGu9eM_70.000_80.000.wav 70.000 80.000 Screaming +YCk9F0Uq3BE_70.000_80.000.wav 70.000 80.000 Screaming +Z54pSnNw2iM_30.000_40.000.wav 30.000 40.000 Screaming +a59ivTlYoNk_310.000_320.000.wav 310.000 320.000 Screaming +auC_LgwFF8g_30.000_40.000.wav 30.000 40.000 Screaming +bi8R9JbF2cc_80.000_90.000.wav 80.000 90.000 Screaming +cdbYsoEasio_70.000_80.000.wav 70.000 80.000 Screaming +dfsvT5xImNg_80.000_90.000.wav 80.000 90.000 Screaming +e2AaF6siR1A_540.000_550.000.wav 540.000 550.000 Screaming +gB1ytjgpcW4_190.000_200.000.wav 190.000 200.000 Screaming +gE-0JxMtUh0_20.000_30.000.wav 20.000 30.000 Screaming +hWiGgsuGnzs_100.000_110.000.wav 100.000 110.000 Screaming +l-iIfi3SNpw_120.000_130.000.wav 120.000 130.000 Screaming +mT-f0lGk-JM_30.000_40.000.wav 30.000 40.000 Screaming +nApE_Biu13k_10.000_20.000.wav 10.000 20.000 Screaming +nRMmafPUAEU_80.000_90.000.wav 80.000 90.000 Screaming +nYAbLuyqPis_30.000_40.000.wav 30.000 40.000 Screaming +nlYlNF30bVg_30.000_40.000.wav 30.000 40.000 Screaming +sUp-UXzgmrA_0.000_10.000.wav 0.000 10.000 Screaming +syIwNMo2TUA_0.000_7.000.wav 0.000 7.000 Screaming +uTu0a1wd9-M_21.000_31.000.wav 21.000 31.000 Screaming +xVG7dfH5DL0_320.000_330.000.wav 320.000 330.000 Screaming +xvAQ44hx3_k_220.000_230.000.wav 220.000 230.000 Screaming +yNTkb2zgA_M_70.000_80.000.wav 70.000 80.000 Screaming +zCdOEvduBTo_30.000_40.000.wav 30.000 40.000 Screaming +zMICvbCJ6zc_550.000_560.000.wav 550.000 560.000 Screaming +-0RWZT-miFs_420.000_430.000.wav 420.000 430.000 Car +-1pRmoJIGQc_11.000_21.000.wav 11.000 21.000 Car +-7eDqv-6AKQ_30.000_40.000.wav 30.000 40.000 Car +-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car +-HWygXWSNRA_30.000_40.000.wav 30.000 40.000 Car +-PVEno65928_30.000_40.000.wav 30.000 40.000 Car +-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car +0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Car +0QwxnzHf_0E_30.000_40.000.wav 30.000 40.000 Car +0bg1nzEVdgY_0.000_10.000.wav 0.000 10.000 Car +0lpPdWvg7Eo_0.000_10.000.wav 0.000 10.000 Car +11Pn3yJifSQ_4.000_14.000.wav 4.000 14.000 Car +1BgqrhbyRFw_30.000_40.000.wav 30.000 40.000 Car +1F9zCsJyw6k_430.000_440.000.wav 430.000 440.000 Car +1HayoASR-54_80.000_90.000.wav 80.000 90.000 Car +1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Car +1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Car +27m49pmJ8Og_370.000_380.000.wav 370.000 380.000 Car +2E_N8lnoVKE_30.000_40.000.wav 30.000 40.000 Car +2Fdau5KTEls_30.000_40.000.wav 30.000 40.000 Car +2STASUlGAjs_30.000_40.000.wav 30.000 40.000 Car +2fi0m8ei_B4_30.000_40.000.wav 30.000 40.000 Car +2uMXfAIMeN0_180.000_190.000.wav 180.000 190.000 Car +32V2zsK7GME_110.000_120.000.wav 110.000 120.000 Car +3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car +3_OLj6XChvM_30.000_40.000.wav 30.000 40.000 Car +3hLxPQpmfQo_30.000_40.000.wav 30.000 40.000 Car +3mDPQ_CPopw_30.000_40.000.wav 30.000 40.000 Car +3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car +3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Car +40s88hEcn5I_170.000_180.000.wav 170.000 180.000 Car 
+42P93B_GzGA_30.000_40.000.wav 30.000 40.000 Car +4KZWpXlcpM4_60.000_70.000.wav 60.000 70.000 Car +4TshFWSsrn8_290.000_300.000.wav 290.000 300.000 Car +4WRgvRI06zc_30.000_40.000.wav 30.000 40.000 Car +4aJfQpHt9lY_160.000_170.000.wav 160.000 170.000 Car +4hd2CLrzCZs_30.000_40.000.wav 30.000 40.000 Car +4zCHl7pRsNY_30.000_40.000.wav 30.000 40.000 Car +5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Car +5oirFKi6Sfo_190.000_200.000.wav 190.000 200.000 Car +5vmxFp1r1ZM_30.000_40.000.wav 30.000 40.000 Car +5z1rE_l-0Ow_0.000_8.000.wav 0.000 8.000 Car +620GoTv5Ic8_30.000_40.000.wav 30.000 40.000 Car +6BitLl5Bnxw_30.000_40.000.wav 30.000 40.000 Car +6FVA4hqp1Ro_30.000_40.000.wav 30.000 40.000 Car +6U942AYlcXA_30.000_40.000.wav 30.000 40.000 Car +6b2ZMMrLTz8_5.000_15.000.wav 5.000 15.000 Car +6ibh38autyA_30.000_40.000.wav 30.000 40.000 Car +6kuESYFcEqw_30.000_40.000.wav 30.000 40.000 Car +73cuZZq-J3w_20.000_30.000.wav 20.000 30.000 Car +764IcMEMVUk_90.000_100.000.wav 90.000 100.000 Car +7NH1WJlSiYI_30.000_40.000.wav 30.000 40.000 Car +7lJu9wEsErY_220.000_230.000.wav 220.000 230.000 Car +8CqqK9CzuXM_30.000_40.000.wav 30.000 40.000 Car +8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car +8Wk-ZmlsUqY_28.000_38.000.wav 28.000 38.000 Car +8q8JrJNAa-Q_30.000_40.000.wav 30.000 40.000 Car +8rMlNbKlp_s_0.000_10.000.wav 0.000 10.000 Car +8sGJFPr2Nmc_30.000_40.000.wav 30.000 40.000 Car +8yRROnG0-lA_30.000_40.000.wav 30.000 40.000 Car +9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Car +9fzAWj5YJ9c_30.000_40.000.wav 30.000 40.000 Car +9rq8h4oMJ98_30.000_40.000.wav 30.000 40.000 Car +9ye2Fn62xDc_60.000_70.000.wav 60.000 70.000 Car +ACGuC6SH4V4_150.000_160.000.wav 150.000 160.000 Car +AFz5TIs_Gug_30.000_40.000.wav 30.000 40.000 Car +AedlWfHafgw_21.000_31.000.wav 21.000 31.000 Car +AlsDSDTiaWI_30.000_40.000.wav 30.000 40.000 Car +B3SkK0wuOhY_130.000_140.000.wav 130.000 140.000 Car +B9n4a5ciI48_16.000_26.000.wav 16.000 26.000 Car +BAekfGvUtFM_30.000_40.000.wav 30.000 40.000 Car +BNLOvQbrPdc_290.000_300.000.wav 290.000 300.000 Car +BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Car +Bqx_SZgCzZw_10.000_20.000.wav 10.000 20.000 Car +CZB6WXDuM1g_30.000_40.000.wav 30.000 40.000 Car +C_pnsyNXphA_30.000_40.000.wav 30.000 40.000 Car +Ck5ZjBf1nLM_30.000_40.000.wav 30.000 40.000 Car +CqNyeZeHb8Y_30.000_40.000.wav 30.000 40.000 Car +Cs1d7Ibk8CA_220.000_230.000.wav 220.000 230.000 Car +CuS-ok0xG9g_0.000_10.000.wav 0.000 10.000 Car +CuaBHNKycvI_30.000_40.000.wav 30.000 40.000 Car +Cwur_jvxMzY_360.000_370.000.wav 360.000 370.000 Car +DEGSyVygE98_110.000_120.000.wav 110.000 120.000 Car +DLxTYAUifjU_30.000_40.000.wav 30.000 40.000 Car +DkKpnvJk9u0_30.000_40.000.wav 30.000 40.000 Car +DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Car +Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car +E8NgxTz1d90_30.000_40.000.wav 30.000 40.000 Car +ExqedxdXuBc_70.000_80.000.wav 70.000 80.000 Car +FCxEMSNSEuI_160.000_170.000.wav 160.000 170.000 Car +FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Car +FFSWmryaZ60_30.000_40.000.wav 30.000 40.000 Car +FYk2paHPSdg_30.000_40.000.wav 30.000 40.000 Car +Fo_FDiZhzDo_30.000_40.000.wav 30.000 40.000 Car +GteozUDpJRc_30.000_40.000.wav 30.000 40.000 Car +GwBS2NzjAvA_30.000_40.000.wav 30.000 40.000 Car +H8d1mZOqb1c_110.000_120.000.wav 110.000 120.000 Car +HFF_PpqLQ9w_30.000_40.000.wav 30.000 40.000 Car +HHlb-h2Pc7o_30.000_40.000.wav 30.000 40.000 Car +Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car +I-HlrcP6Qg4_30.000_40.000.wav 30.000 40.000 Car +I7vs2H-Htt8_480.000_490.000.wav 480.000 490.000 Car 
+IblhEF_MiH8_400.000_410.000.wav 400.000 410.000 Car +JgXnbgS_XBk_480.000_490.000.wav 480.000 490.000 Car +Ju7Kg_H2iZQ_30.000_40.000.wav 30.000 40.000 Car +KiCB6pP6EEo_100.000_110.000.wav 100.000 110.000 Car +Kwpn3utYEHM_30.000_40.000.wav 30.000 40.000 Car +Ky9Kw-0XwAs_30.000_40.000.wav 30.000 40.000 Car +KzKDk-UgS54_30.000_40.000.wav 30.000 40.000 Car +L1qC8DicAZE_70.000_80.000.wav 70.000 80.000 Car +L4N0LOYZrFo_30.000_40.000.wav 30.000 40.000 Car +L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car +L9YtOeck3A0_0.000_10.000.wav 0.000 10.000 Car +LEtkHiZZugk_30.000_40.000.wav 30.000 40.000 Car +LLkNFGrrgUo_30.000_40.000.wav 30.000 40.000 Car +LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Car +M7NvD1WJQ7o_70.000_80.000.wav 70.000 80.000 Car +M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car +Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car +NMqSBlEq14Q_30.000_40.000.wav 30.000 40.000 Car +NoPbk9fy6uw_10.000_20.000.wav 10.000 20.000 Car +O36torHptH4_30.000_40.000.wav 30.000 40.000 Car +OBwh-KGukE8_30.000_40.000.wav 30.000 40.000 Car +Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car +PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Car +PfXdcsW8dJI_540.000_550.000.wav 540.000 550.000 Car +QAWuHvVCI6g_30.000_40.000.wav 30.000 40.000 Car +QBMDnMRwQCc_70.000_80.000.wav 70.000 80.000 Car +QzrS-S7OerE_370.000_380.000.wav 370.000 380.000 Car +R0BtkTm_CPI_30.000_40.000.wav 30.000 40.000 Car +SEHxfje9Eio_30.000_40.000.wav 30.000 40.000 Car +Sb3V17F8xU8_360.000_370.000.wav 360.000 370.000 Car +SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car +SqWkV-UQ6CI_30.000_40.000.wav 30.000 40.000 Car +TWDytzefXXc_10.000_20.000.wav 10.000 20.000 Car +Tv67JhZDAYs_30.000_40.000.wav 30.000 40.000 Car +VTwVF3xRSWg_12.000_22.000.wav 12.000 22.000 Car +VulCKZgWspc_570.000_580.000.wav 570.000 580.000 Car +Vx6mttDHWfo_30.000_40.000.wav 30.000 40.000 Car +W11cJ9HZNaY_30.000_40.000.wav 30.000 40.000 Car +WLXQgcx8qTI_30.000_40.000.wav 30.000 40.000 Car +WMbdMQ7rdFs_30.000_40.000.wav 30.000 40.000 Car +WZoQD6cInx8_360.000_370.000.wav 360.000 370.000 Car +WffmaOr2p8I_30.000_40.000.wav 30.000 40.000 Car +WoynilrteLU_30.000_40.000.wav 30.000 40.000 Car +WxrKq0aI0iM_130.000_140.000.wav 130.000 140.000 Car +X60eVxecY3I_30.000_40.000.wav 30.000 40.000 Car +X8fEzx-fA0U_80.000_90.000.wav 80.000 90.000 Car +XVxlZqwWcBI_10.000_20.000.wav 10.000 20.000 Car +Xnd8ERrynEo_120.000_130.000.wav 120.000 130.000 Car +XqXLI7bDb-I_0.000_7.000.wav 0.000 7.000 Car +XyCjByHuDIk_260.000_270.000.wav 260.000 270.000 Car +XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car +Y5e8BW513ww_20.000_30.000.wav 20.000 30.000 Car +YJdBwuIn4Ec_30.000_40.000.wav 30.000 40.000 Car +YTFJUFWcRns_30.000_40.000.wav 30.000 40.000 Car +YY9aConw2QE_0.000_10.000.wav 0.000 10.000 Car +Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car +Ys_rO2Ieg1U_30.000_40.000.wav 30.000 40.000 Car +Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Car +Z8cigemT5_g_210.000_220.000.wav 210.000 220.000 Car +ZJW7ymsioQc_16.000_26.000.wav 16.000 26.000 Car +ZY6A9ZDkudg_130.000_140.000.wav 130.000 140.000 Car +_Mw9lKigni4_30.000_40.000.wav 30.000 40.000 Car +_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Car +_yU0-fmspFY_210.000_220.000.wav 210.000 220.000 Car +a5vTn5286-A_80.000_90.000.wav 80.000 90.000 Car +aCX6vJhHO2c_30.000_40.000.wav 30.000 40.000 Car +aHEAK0iWqKk_180.000_190.000.wav 180.000 190.000 Car +aOVPHKqKjyQ_90.000_100.000.wav 90.000 100.000 Car +aUq4glO5ryE_30.000_40.000.wav 30.000 40.000 Car +aW3DY8XDrmw_22.000_32.000.wav 22.000 32.000 Car +aa4uhPvKviY_30.000_40.000.wav 30.000 40.000 Car 
+akgqVmFFDiY_30.000_40.000.wav 30.000 40.000 Car +buOEFwXhoe0_310.000_320.000.wav 310.000 320.000 Car +cHCIoXF7moA_30.000_40.000.wav 30.000 40.000 Car +cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car +cbYZQRz09bc_390.000_400.000.wav 390.000 400.000 Car +d-do1XZ8f_E_30.000_40.000.wav 30.000 40.000 Car +d3gMwtMK6Gs_30.000_40.000.wav 30.000 40.000 Car +d6AioJ8CkTc_30.000_40.000.wav 30.000 40.000 Car +dAud19zNZyw_190.000_200.000.wav 190.000 200.000 Car +dC1TVxwiitc_30.000_40.000.wav 30.000 40.000 Car +dFqOBLxhEl8_20.000_30.000.wav 20.000 30.000 Car +dSfcznv4KLo_30.000_40.000.wav 30.000 40.000 Car +dThSTe35jb0_50.000_60.000.wav 50.000 60.000 Car +dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car +dmJH84FnQa8_30.000_40.000.wav 30.000 40.000 Car +e9xPBfEJni8_230.000_240.000.wav 230.000 240.000 Car +eAl9WwRaWUE_30.000_40.000.wav 30.000 40.000 Car +eAt6si6k65c_30.000_40.000.wav 30.000 40.000 Car +eHiqCLHmoxI_0.000_8.000.wav 0.000 8.000 Car +eV5JX81GzqA_150.000_160.000.wav 150.000 160.000 Car +er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Car +eyFPHlybqDg_30.000_40.000.wav 30.000 40.000 Car +f70nsY7ThBA_220.000_230.000.wav 220.000 230.000 Car +fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car +fZMPDCNyQxE_30.000_40.000.wav 30.000 40.000 Car +f__6chtFRM0_30.000_40.000.wav 30.000 40.000 Car +fdDTuo_COG8_90.000_100.000.wav 90.000 100.000 Car +gFJjYWXeBn0_30.000_40.000.wav 30.000 40.000 Car +g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Car +gaFQgJLQHtU_90.000_100.000.wav 90.000 100.000 Car +gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car +hN1ykzC8kZM_30.000_40.000.wav 30.000 40.000 Car +hQ_yyPI46FI_11.000_21.000.wav 11.000 21.000 Car +haiMRJEH-Aw_0.000_9.000.wav 0.000 9.000 Car +hsC_sT0A4XM_30.000_40.000.wav 30.000 40.000 Car +ihQDd1CqFBw_70.000_80.000.wav 70.000 80.000 Car +ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car +j2R1zurR39E_30.000_40.000.wav 30.000 40.000 Car +j42ETHcp044_0.000_10.000.wav 0.000 10.000 Car +j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car +jCeUZwd8b2w_0.000_10.000.wav 0.000 10.000 Car +jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car +kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Car +l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car +lRrv5m9Xu4k_30.000_40.000.wav 30.000 40.000 Car +lb1awXgoyQE_0.000_10.000.wav 0.000 10.000 Car +llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car +lu5teS1j1RQ_0.000_10.000.wav 0.000 10.000 Car +mCmjh_EJtb4_30.000_40.000.wav 30.000 40.000 Car +nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Car +njodYtK0Hqg_30.000_40.000.wav 30.000 40.000 Car +noymXcxyxis_30.000_40.000.wav 30.000 40.000 Car +o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car +oPJVdi0cqNE_30.000_40.000.wav 30.000 40.000 Car +oxJYMzEmtk4_10.000_20.000.wav 10.000 20.000 Car +pPnLErF3GOY_30.000_40.000.wav 30.000 40.000 Car +pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car +qC5M7BAsKOA_0.000_10.000.wav 0.000 10.000 Car +qg4WxBm8h_w_510.000_520.000.wav 510.000 520.000 Car +qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car +rgeu0Gtf3Es_40.000_50.000.wav 40.000 50.000 Car +s3-i5eUpe6c_30.000_40.000.wav 30.000 40.000 Car +s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car +syCQldBsAtg_30.000_40.000.wav 30.000 40.000 Car +tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Car +teoER4j9H14_290.000_300.000.wav 290.000 300.000 Car +uFSkczD2i14_30.000_40.000.wav 30.000 40.000 Car +uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car +uYqlVTlSgbM_40.000_50.000.wav 40.000 50.000 Car +v8Kry1CbTkM_310.000_320.000.wav 310.000 320.000 Car +vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car +vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 
Car +vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car +vW1nk4o9u5g_30.000_40.000.wav 30.000 40.000 Car +vdFYBSlmsXw_30.000_40.000.wav 30.000 40.000 Car +vtE1J8HsCUs_30.000_40.000.wav 30.000 40.000 Car +w0vy1YvNcOg_30.000_40.000.wav 30.000 40.000 Car +wDKrcZ7xLY8_80.000_90.000.wav 80.000 90.000 Car +wM-sBzIDzok_30.000_40.000.wav 30.000 40.000 Car +wUY4eWJt17w_30.000_40.000.wav 30.000 40.000 Car +we66pU0MN1M_30.000_40.000.wav 30.000 40.000 Car +wjfMWiYLDWA_30.000_40.000.wav 30.000 40.000 Car +wu3-_VKULZU_30.000_40.000.wav 30.000 40.000 Car +wwNIm8bgzKc_30.000_40.000.wav 30.000 40.000 Car +xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car +xsT5ZJUnBg0_160.000_170.000.wav 160.000 170.000 Car +y9DFJEsiTLk_110.000_120.000.wav 110.000 120.000 Car +yESwp_fg0Po_70.000_80.000.wav 70.000 80.000 Car +yQg3eMb0QKU_30.000_40.000.wav 30.000 40.000 Car +yQjnNR7fXKo_50.000_60.000.wav 50.000 60.000 Car +zCuKYr_oMlE_60.000_70.000.wav 60.000 70.000 Car +zz35Va7tYmA_30.000_40.000.wav 30.000 40.000 Car +-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car passing by +-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car passing by +-iAAxJkoqcM_0.000_6.000.wav 0.000 6.000 Car passing by +0mQcGLpc8to_30.000_40.000.wav 30.000 40.000 Car passing by +1HtGgZnlKjU_30.000_40.000.wav 30.000 40.000 Car passing by +2IsAlhq0XFc_30.000_40.000.wav 30.000 40.000 Car passing by +2UvEmetE__I_30.000_40.000.wav 30.000 40.000 Car passing by +2oHGIzH_XzA_30.000_40.000.wav 30.000 40.000 Car passing by +3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car passing by +8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car passing by +8rzhhvS0tGc_30.000_40.000.wav 30.000 40.000 Car passing by +8v377AXrgac_30.000_40.000.wav 30.000 40.000 Car passing by +9lMtTDKyDEk_30.000_40.000.wav 30.000 40.000 Car passing by +BWoL8oKoTFI_30.000_40.000.wav 30.000 40.000 Car passing by +BsvD806qNM8_10.000_20.000.wav 10.000 20.000 Car passing by +C3LLtToB2zA_30.000_40.000.wav 30.000 40.000 Car passing by +Dk6b9dVD0i8_6.000_16.000.wav 6.000 16.000 Car passing by +Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car passing by +EqFuY_U0Yz0_30.000_40.000.wav 30.000 40.000 Car passing by +FjpOboRcrNc_10.000_20.000.wav 10.000 20.000 Car passing by +FjyZV8zIJ0k_30.000_40.000.wav 30.000 40.000 Car passing by +Fn7eSPVvgCQ_30.000_40.000.wav 30.000 40.000 Car passing by +G6A-sT2DOjY_30.000_40.000.wav 30.000 40.000 Car passing by +GBXRuYIvhfM_30.000_40.000.wav 30.000 40.000 Car passing by +HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Car passing by +HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Car passing by +If-V0XO-mpo_30.000_40.000.wav 30.000 40.000 Car passing by +JtuNiusRRLk_30.000_40.000.wav 30.000 40.000 Car passing by +M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car passing by +NKPAwhwZmqs_30.000_40.000.wav 30.000 40.000 Car passing by +Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car passing by +QcLfJE-YfJY_30.000_40.000.wav 30.000 40.000 Car passing by +SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car passing by +VAiH1LX8guk_17.000_27.000.wav 17.000 27.000 Car passing by +Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car passing by +Yd10enP9ykM_30.000_40.000.wav 30.000 40.000 Car passing by +_HGGCwtyNxM_30.000_40.000.wav 30.000 40.000 Car passing by +a2U10_mi5as_30.000_40.000.wav 30.000 40.000 Car passing by +aB6FDPKAPus_30.000_40.000.wav 30.000 40.000 Car passing by +bDFQWubN4x4_30.000_40.000.wav 30.000 40.000 Car passing by +cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car passing by +dDTvjXXFkDg_30.000_40.000.wav 30.000 40.000 Car passing by +dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car 
passing by +fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car passing by +gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car passing by +gd_KjDM4fi8_0.000_10.000.wav 0.000 10.000 Car passing by +j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car passing by +jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car passing by +llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car passing by +m_dCO5bBCic_26.000_36.000.wav 26.000 36.000 Car passing by +qDQX7Xi3GsQ_30.000_40.000.wav 30.000 40.000 Car passing by +qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car passing by +reP-OOWiLWU_30.000_40.000.wav 30.000 40.000 Car passing by +s4jG5ZJYCvQ_30.000_40.000.wav 30.000 40.000 Car passing by +s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car passing by +uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car passing by +vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car passing by +wD4QouhX8zo_30.000_40.000.wav 30.000 40.000 Car passing by +xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car passing by +zd67ihUZ1u4_25.000_35.000.wav 25.000 35.000 Car passing by +-3z5mFRgbxc_30.000_40.000.wav 30.000 40.000 Bus +0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Bus +0lPcHRhXlWk_30.000_40.000.wav 30.000 40.000 Bus +1E1evA4T_Tk_30.000_40.000.wav 30.000 40.000 Bus +1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Bus +6-yQsEH2WYA_30.000_40.000.wav 30.000 40.000 Bus +6Y8wSI1l-Lw_30.000_40.000.wav 30.000 40.000 Bus +7T04388Ijk8_30.000_40.000.wav 30.000 40.000 Bus +8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Bus +8oEdgb8iXYA_1.000_11.000.wav 1.000 11.000 Bus +AdpNSGX2_Pk_10.000_20.000.wav 10.000 20.000 Bus +AwJ8orGuOXg_2.000_12.000.wav 2.000 12.000 Bus +BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Bus +CoFbRc1OxFU_9.000_19.000.wav 9.000 19.000 Bus +DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Bus +DYcXvyBFc5w_30.000_40.000.wav 30.000 40.000 Bus +DYdalOQnx1Y_30.000_40.000.wav 30.000 40.000 Bus +DkwFXd5nYLE_40.000_50.000.wav 40.000 50.000 Bus +FBMR3pW9H9o_30.000_40.000.wav 30.000 40.000 Bus +FEGa4e6RAlw_30.000_40.000.wav 30.000 40.000 Bus +Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Bus +HxMoMMrA6Eo_30.000_40.000.wav 30.000 40.000 Bus +I7esm6vqqZ4_30.000_40.000.wav 30.000 40.000 Bus +JLj11umr1CE_0.000_10.000.wav 0.000 10.000 Bus +JwAhcHHF2qg_30.000_40.000.wav 30.000 40.000 Bus +LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Bus +LzZ_nxuZ8Co_30.000_40.000.wav 30.000 40.000 Bus +LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Bus +Nyi9_-u6-w0_30.000_40.000.wav 30.000 40.000 Bus +O_SKumO328I_30.000_40.000.wav 30.000 40.000 Bus +Owg_XU9XmRM_30.000_40.000.wav 30.000 40.000 Bus +P94rcZSuTT8_30.000_40.000.wav 30.000 40.000 Bus +PP741kd2vRM_30.000_40.000.wav 30.000 40.000 Bus +Qna9qrV8_go_30.000_40.000.wav 30.000 40.000 Bus +Qt7FJkuqWPE_30.000_40.000.wav 30.000 40.000 Bus +UcQ7cVukaxY_21.000_31.000.wav 21.000 31.000 Bus +W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Bus +WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Bus +WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bus +a9B_HA3y8WQ_30.000_40.000.wav 30.000 40.000 Bus +cEEoKQ38fHY_30.000_40.000.wav 30.000 40.000 Bus +er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Bus +fLvM4bbpg6w_0.000_10.000.wav 0.000 10.000 Bus +fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Bus +gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Bus +jaSK_t8QP1E_30.000_40.000.wav 30.000 40.000 Bus +ji_YCMygNHQ_8.000_18.000.wav 8.000 18.000 Bus +kNKfoDp0uUw_30.000_40.000.wav 30.000 40.000 Bus +kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Bus +lHP0q2sQzPQ_30.000_40.000.wav 30.000 40.000 Bus +mGG8rop4Jig_30.000_40.000.wav 30.000 40.000 Bus 
+oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Bus +tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Bus +tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Bus +ucICmff0K-Q_30.000_40.000.wav 30.000 40.000 Bus +x-2Abohj8VY_30.000_40.000.wav 30.000 40.000 Bus +xFr2xX6PulQ_70.000_80.000.wav 70.000 80.000 Bus +yfSBqp5IZSM_10.000_20.000.wav 10.000 20.000 Bus +-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Truck +-BY64_p-vtM_30.000_40.000.wav 30.000 40.000 Truck +-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Truck +-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Truck +-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Truck +01WuUBxFBp4_30.000_40.000.wav 30.000 40.000 Truck +077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Truck +0Ga7T-2e490_17.000_27.000.wav 17.000 27.000 Truck +0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Truck +10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Truck +2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Truck +2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Truck +2Tmi7EqpGZQ_0.000_10.000.wav 0.000 10.000 Truck +4DlKNmVcoek_20.000_30.000.wav 20.000 30.000 Truck +4MRzQbAIyV4_90.000_100.000.wav 90.000 100.000 Truck +4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Truck +4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Truck +5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Truck +5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Truck +5QP1Tc3XbDc_30.000_40.000.wav 30.000 40.000 Truck +5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Truck +5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Truck +6HL_DKWK-WA_10.000_20.000.wav 10.000 20.000 Truck +6VQGk8IrV-4_30.000_40.000.wav 30.000 40.000 Truck +6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Truck +6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Truck +6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Truck +7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Truck +7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Truck +81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Truck +84E9i9_ELBs_30.000_40.000.wav 30.000 40.000 Truck +8jblPMBafKE_30.000_40.000.wav 30.000 40.000 Truck +8k17D6qiuqI_30.000_40.000.wav 30.000 40.000 Truck +9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Truck +9LJnjmcRcb8_280.000_290.000.wav 280.000 290.000 Truck +9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Truck +A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Truck +ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Truck +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Truck +BQVXzH6YK8g_30.000_40.000.wav 30.000 40.000 Truck +CnYWJp2bknU_50.000_60.000.wav 50.000 60.000 Truck +DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Truck +DXlTakKvLzg_30.000_40.000.wav 30.000 40.000 Truck +DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Truck +Dmy4EjohxxU_60.000_70.000.wav 60.000 70.000 Truck +DvMFQ64YwcI_30.000_40.000.wav 30.000 40.000 Truck +FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Truck +GTk_6JDmtCY_230.000_240.000.wav 230.000 240.000 Truck +HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Truck +HQkLVac7z9Q_70.000_80.000.wav 70.000 80.000 Truck +I4VDcVTE4YA_30.000_40.000.wav 30.000 40.000 Truck +IxlvxvG8zOE_110.000_120.000.wav 110.000 120.000 Truck +JLzD44Im1Ec_30.000_40.000.wav 30.000 40.000 Truck +K4Hcb00hTTY_30.000_40.000.wav 30.000 40.000 Truck +L2M3xanqQP8_30.000_40.000.wav 30.000 40.000 Truck +LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Truck +LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Truck +MWTTe0M9vi4_30.000_40.000.wav 30.000 40.000 Truck +Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Truck +NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Truck +OPd0cz1hRqc_30.000_40.000.wav 30.000 40.000 Truck +PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Truck 
+PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Truck +PO1eaJ7tQOg_180.000_190.000.wav 180.000 190.000 Truck +PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Truck +Pef6g19i5iI_30.000_40.000.wav 30.000 40.000 Truck +Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Truck +SiBIYAiIajM_30.000_40.000.wav 30.000 40.000 Truck +T6oYCFRafPs_30.000_40.000.wav 30.000 40.000 Truck +WdubBeFntYQ_460.000_470.000.wav 460.000 470.000 Truck +_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Truck +_jfv_ziZWII_60.000_70.000.wav 60.000 70.000 Truck +acvV6yYNc7Y_30.000_40.000.wav 30.000 40.000 Truck +bQSaQ0iX_vk_30.000_40.000.wav 30.000 40.000 Truck +bhxN5w03yS0_30.000_40.000.wav 30.000 40.000 Truck +ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Truck +eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Truck +gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Truck +hDVNQOJCvOk_30.000_40.000.wav 30.000 40.000 Truck +ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Truck +ikmE_kRvDAc_30.000_40.000.wav 30.000 40.000 Truck +jwZTKNsbf58_70.000_80.000.wav 70.000 80.000 Truck +kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Truck +kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Truck +lp66EaEOOoU_30.000_40.000.wav 30.000 40.000 Truck +n4o1r8Ai66o_30.000_40.000.wav 30.000 40.000 Truck +nDtrUUc2J2U_0.000_10.000.wav 0.000 10.000 Truck +nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Truck +p70IcMwsW9M_30.000_40.000.wav 30.000 40.000 Truck +pJ1fore8JbQ_30.000_40.000.wav 30.000 40.000 Truck +pt-J_L-OFI8_0.000_10.000.wav 0.000 10.000 Truck +rdanJP7Usrg_30.000_40.000.wav 30.000 40.000 Truck +srTX18ikXkE_10.000_20.000.wav 10.000 20.000 Truck +tuplsUUDXKw_30.000_40.000.wav 30.000 40.000 Truck +x6vuWsdeS3s_30.000_40.000.wav 30.000 40.000 Truck +xMClk12ouB8_30.000_40.000.wav 30.000 40.000 Truck +ycqDMKTrvLY_30.000_40.000.wav 30.000 40.000 Truck +yk5LqHTtHLo_30.000_40.000.wav 30.000 40.000 Truck +yrscqyUOIlI_30.000_40.000.wav 30.000 40.000 Truck +zM3chsL-B7U_30.000_40.000.wav 30.000 40.000 Truck +06si40RVDco_30.000_40.000.wav 30.000 40.000 Motorcycle +0DzsPL-xElE_20.000_30.000.wav 20.000 30.000 Motorcycle +145N68nh4m0_120.000_130.000.wav 120.000 130.000 Motorcycle +16vw4K9qJnY_30.000_40.000.wav 30.000 40.000 Motorcycle +21QlKF17ipc_30.000_40.000.wav 30.000 40.000 Motorcycle +3LulQoOXNB0_30.000_40.000.wav 30.000 40.000 Motorcycle +45JHcLU57B8_20.000_30.000.wav 20.000 30.000 Motorcycle +4NZkW-XaIa4_30.000_40.000.wav 30.000 40.000 Motorcycle +506I6LfdDuk_50.000_60.000.wav 50.000 60.000 Motorcycle +6MCy1lh4qaw_20.000_30.000.wav 20.000 30.000 Motorcycle +6R8cO4ARzkY_30.000_40.000.wav 30.000 40.000 Motorcycle +6taAP7SFewI_30.000_40.000.wav 30.000 40.000 Motorcycle +7g6aZTBe2xE_30.000_40.000.wav 30.000 40.000 Motorcycle +9HcahqYUVoc_90.000_100.000.wav 90.000 100.000 Motorcycle +9N1iw5Vdim8_20.000_30.000.wav 20.000 30.000 Motorcycle +ANWU9Hiy_5k_40.000_50.000.wav 40.000 50.000 Motorcycle +BTNz6NftP34_30.000_40.000.wav 30.000 40.000 Motorcycle +BxnLAGsByCI_10.000_20.000.wav 10.000 20.000 Motorcycle +CZgx_6XaEkg_30.000_40.000.wav 30.000 40.000 Motorcycle +D3BJuOwltoI_10.000_20.000.wav 10.000 20.000 Motorcycle +FgN9v1jYqjA_30.000_40.000.wav 30.000 40.000 Motorcycle +HQ8eR2lvjSE_30.000_40.000.wav 30.000 40.000 Motorcycle +Mb-GyQEKoEc_30.000_40.000.wav 30.000 40.000 Motorcycle +Pair_NsHdTc_30.000_40.000.wav 30.000 40.000 Motorcycle +UFIBEBkm7ao_30.000_40.000.wav 30.000 40.000 Motorcycle +UWz5OIijWM4_30.000_40.000.wav 30.000 40.000 Motorcycle +WLX3Db60418_20.000_30.000.wav 20.000 30.000 Motorcycle +X5Xs8Y1cJK0_30.000_40.000.wav 30.000 40.000 Motorcycle +ZGf0vrZStwI_30.000_40.000.wav 30.000 
40.000 Motorcycle +ZfkO1HlI0zM_30.000_40.000.wav 30.000 40.000 Motorcycle +bhtB2Zgh9Q8_110.000_120.000.wav 110.000 120.000 Motorcycle +d-m8eXCpeDg_30.000_40.000.wav 30.000 40.000 Motorcycle +d21IwtH2oHI_30.000_40.000.wav 30.000 40.000 Motorcycle +dhaKGPCgtfw_30.000_40.000.wav 30.000 40.000 Motorcycle +ee-0JGvEIng_30.000_40.000.wav 30.000 40.000 Motorcycle +epGDNMrsQb8_40.000_50.000.wav 40.000 50.000 Motorcycle +ezUkPETm6cs_30.000_40.000.wav 30.000 40.000 Motorcycle +f724u5z_UDw_30.000_40.000.wav 30.000 40.000 Motorcycle +gGmWm1i6pVo_30.000_40.000.wav 30.000 40.000 Motorcycle +i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Motorcycle +iMp8nODaotA_580.000_590.000.wav 580.000 590.000 Motorcycle +lVW2CqsHJ4Y_30.000_40.000.wav 30.000 40.000 Motorcycle +lj7hzmz19-M_30.000_40.000.wav 30.000 40.000 Motorcycle +mX45CiTjf8I_30.000_40.000.wav 30.000 40.000 Motorcycle +mbLiZ_jpgeY_20.000_30.000.wav 20.000 30.000 Motorcycle +owZDBEq6WdU_30.000_40.000.wav 30.000 40.000 Motorcycle +pNMBIqvbyB4_30.000_40.000.wav 30.000 40.000 Motorcycle +po-tnKZAzdg_40.000_50.000.wav 40.000 50.000 Motorcycle +qAQuljp-atA_30.000_40.000.wav 30.000 40.000 Motorcycle +r0Oll28wmXs_30.000_40.000.wav 30.000 40.000 Motorcycle +sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Motorcycle +vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Motorcycle +wPfv8ifzzyg_30.000_40.000.wav 30.000 40.000 Motorcycle +wyhurCZbKQU_30.000_40.000.wav 30.000 40.000 Motorcycle +xQTPEQDb0Gg_30.000_40.000.wav 30.000 40.000 Motorcycle +xTPmoYwgKf4_30.000_40.000.wav 30.000 40.000 Motorcycle +xXGIKM4daMU_30.000_40.000.wav 30.000 40.000 Motorcycle +xZ8hQliZqhg_160.000_170.000.wav 160.000 170.000 Motorcycle +xuMBy2NoROI_30.000_40.000.wav 30.000 40.000 Motorcycle +z_8yGVO1qws_30.000_40.000.wav 30.000 40.000 Motorcycle +-BaVEk1zS2g_50.000_60.000.wav 50.000 60.000 Train +-Q4fBQ4egrs_0.000_10.000.wav 0.000 10.000 Train +-QxSFr1cYuQ_20.000_30.000.wav 20.000 30.000 Train +-ZdReI9dL6M_530.000_540.000.wav 530.000 540.000 Train +0YIyGEM0yG0_550.000_560.000.wav 550.000 560.000 Train +1Mk2MJDhLJQ_20.000_30.000.wav 20.000 30.000 Train +2nejPPEWqJ8_320.000_330.000.wav 320.000 330.000 Train +3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train +3RfrTU1p5SA_500.000_510.000.wav 500.000 510.000 Train +3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train +3ZZDuYU2HM4_150.000_160.000.wav 150.000 160.000 Train +3fPX1LaGwJo_60.000_70.000.wav 60.000 70.000 Train +4_gyCWuPxRg_170.000_180.000.wav 170.000 180.000 Train +4l4vGrMD4Tw_550.000_560.000.wav 550.000 560.000 Train +4oT0bxldS80_30.000_40.000.wav 30.000 40.000 Train +4t7Mi3pnSA4_210.000_220.000.wav 210.000 220.000 Train +53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train +6OgSNQOTw2U_30.000_40.000.wav 30.000 40.000 Train +6_TGlFO0DCk_10.000_20.000.wav 10.000 20.000 Train +7KdSGBzXvz8_420.000_430.000.wav 420.000 430.000 Train +7W_kcu0CJqI_310.000_320.000.wav 310.000 320.000 Train +8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train +8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train +9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train +9NT6gEiqpWA_30.000_40.000.wav 30.000 40.000 Train +AFhll08KM98_30.000_40.000.wav 30.000 40.000 Train +AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train +AK0kZUDk294_2.000_12.000.wav 2.000 12.000 Train +AKPC4rEGoyI_30.000_40.000.wav 30.000 40.000 Train +APsvUzw7bWA_60.000_70.000.wav 60.000 70.000 Train +AshwkKUV07s_23.000_33.000.wav 23.000 33.000 Train +BI2Tol64na0_30.000_40.000.wav 30.000 40.000 Train +BmS2NiuT2c0_160.000_170.000.wav 160.000 170.000 Train +CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train 
+D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train +F-JFxERdA2w_30.000_40.000.wav 30.000 40.000 Train +FoIBRxw0tyE_30.000_40.000.wav 30.000 40.000 Train +G958vjLYBcI_110.000_120.000.wav 110.000 120.000 Train +GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train +GKc8PCTen8Q_310.000_320.000.wav 310.000 320.000 Train +I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train +IIIxN_ziy_I_60.000_70.000.wav 60.000 70.000 Train +IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train +K-i81KrH8BQ_30.000_40.000.wav 30.000 40.000 Train +K9pSRLw6FNc_40.000_50.000.wav 40.000 50.000 Train +KPyYUly5xCc_90.000_100.000.wav 90.000 100.000 Train +L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train +LK4b2eJpy24_30.000_40.000.wav 30.000 40.000 Train +LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train +MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train +MDF2vsjm8jU_10.000_20.000.wav 10.000 20.000 Train +MMfiWJVftMA_60.000_70.000.wav 60.000 70.000 Train +MYzVHespZ-E_30.000_40.000.wav 30.000 40.000 Train +Mbe4rlNiM84_0.000_7.000.wav 0.000 7.000 Train +MczH_PWBNeI_360.000_370.000.wav 360.000 370.000 Train +Mfkif49LLc4_30.000_40.000.wav 30.000 40.000 Train +MwSbYICrYj8_290.000_300.000.wav 290.000 300.000 Train +PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train +QDTbchu0LrU_30.000_40.000.wav 30.000 40.000 Train +QZJ5WAYIUh8_70.000_80.000.wav 70.000 80.000 Train +QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train +RN-_agT8_Cg_0.000_10.000.wav 0.000 10.000 Train +R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train +Rhvy7V4F95Q_40.000_50.000.wav 40.000 50.000 Train +Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train +RrlgSfQrqQc_20.000_30.000.wav 20.000 30.000 Train +RwBKGPEg6uA_340.000_350.000.wav 340.000 350.000 Train +T73runykdnE_25.000_35.000.wav 25.000 35.000 Train +T8M6W4yOzI4_30.000_40.000.wav 30.000 40.000 Train +Tmm4H6alHCE_30.000_40.000.wav 30.000 40.000 Train +TyTORMEourg_270.000_280.000.wav 270.000 280.000 Train +UQx0EMXtLZA_60.000_70.000.wav 60.000 70.000 Train +UZx7OAgRMRY_90.000_100.000.wav 90.000 100.000 Train +UerX5Bv2hcs_70.000_80.000.wav 70.000 80.000 Train +UxSUGCvpskM_340.000_350.000.wav 340.000 350.000 Train +V2hln47cP78_130.000_140.000.wav 130.000 140.000 Train +VIe_Qkg5RJI_130.000_140.000.wav 130.000 140.000 Train +WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Train +WFdpQCtpBB4_30.000_40.000.wav 30.000 40.000 Train +XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train +XDTlBb3aYqo_30.000_40.000.wav 30.000 40.000 Train +XKvLkIM8dck_40.000_50.000.wav 40.000 50.000 Train +XQbeLJYzY9k_90.000_100.000.wav 90.000 100.000 Train +XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train +XeYiNanFS_M_120.000_130.000.wav 120.000 130.000 Train +Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train +YDGf-razgyU_250.000_260.000.wav 250.000 260.000 Train +YFD1Qrlskrg_60.000_70.000.wav 60.000 70.000 Train +Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train +Y_ynIwm3qm0_370.000_380.000.wav 370.000 380.000 Train +Zy0goYEHPHU_30.000_40.000.wav 30.000 40.000 Train +_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train +aNO2KEXBCOk_30.000_40.000.wav 30.000 40.000 Train +aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train +ahct5yzUtdE_20.000_30.000.wav 20.000 30.000 Train +arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train +bCGtzspNbNo_30.000_40.000.wav 30.000 40.000 Train +bI6wPI9kAm8_70.000_80.000.wav 70.000 80.000 Train +bpdCMWWiB_0_30.000_40.000.wav 30.000 40.000 Train +cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Train +d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train +dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train +eRclX9l0F_c_150.000_160.000.wav 
150.000 160.000 Train +fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Train +fWVfi9pAh_4_10.000_20.000.wav 10.000 20.000 Train +fztkF47lVQg_0.000_10.000.wav 0.000 10.000 Train +g0ICxHjC9Uc_30.000_40.000.wav 30.000 40.000 Train +g2scd3YVgwQ_30.000_40.000.wav 30.000 40.000 Train +g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train +g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train +gKMpowHeyKc_30.000_40.000.wav 30.000 40.000 Train +gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train +gU0mD2fSh4c_500.000_510.000.wav 500.000 510.000 Train +gkH_Zxasn8o_40.000_50.000.wav 40.000 50.000 Train +gvnM4kK4r70_10.000_20.000.wav 10.000 20.000 Train +hH_M56EnnDk_30.000_40.000.wav 30.000 40.000 Train +hVvtTC9AmNs_30.000_40.000.wav 30.000 40.000 Train +hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train +hdYQzH2E-e4_310.000_320.000.wav 310.000 320.000 Train +iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train +j9Z63H5hvrQ_0.000_10.000.wav 0.000 10.000 Train +jbW2ew8VMfU_50.000_60.000.wav 50.000 60.000 Train +jlz7r-NSUuA_50.000_60.000.wav 50.000 60.000 Train +k0vRZm7ZnQk_280.000_290.000.wav 280.000 290.000 Train +k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train +kbfkq3TuAe0_470.000_480.000.wav 470.000 480.000 Train +lf1Sblrda3A_560.000_570.000.wav 560.000 570.000 Train +m4DS9-5Gkds_30.000_40.000.wav 30.000 40.000 Train +m5HeCy87QYY_380.000_390.000.wav 380.000 390.000 Train +nKM4MUAsVzg_100.000_110.000.wav 100.000 110.000 Train +nY1gcEMzsWI_10.000_20.000.wav 10.000 20.000 Train +nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train +oogrnx-_LBA_60.000_70.000.wav 60.000 70.000 Train +pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train +pbOZLMrJy0A_0.000_10.000.wav 0.000 10.000 Train +pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train +q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train +qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train +r6mHSfFkY_8_30.000_40.000.wav 30.000 40.000 Train +rNNPQ9DD4no_30.000_40.000.wav 30.000 40.000 Train +rSrBDAgLUoI_460.000_470.000.wav 460.000 470.000 Train +stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train +t_lFhyZaZR0_150.000_160.000.wav 150.000 160.000 Train +txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train +uZfsEDo3elY_20.000_30.000.wav 20.000 30.000 Train +umcnfA9veOw_160.000_170.000.wav 160.000 170.000 Train +uysTr0SfhLI_10.000_20.000.wav 10.000 20.000 Train +wM9wNgY8d4g_150.000_160.000.wav 150.000 160.000 Train +xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train +xshKOSEF_6o_0.000_10.000.wav 0.000 10.000 Train +yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train +yH1r2Bblluw_240.000_250.000.wav 240.000 250.000 Train +yywGJu6jp8U_30.000_40.000.wav 30.000 40.000 Train +z5uKFGeTtNg_30.000_40.000.wav 30.000 40.000 Train diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv new file mode 100644 index 0000000000000000000000000000000000000000..d98569b2bb2a47882ab09081c204bc66823b5053 --- /dev/null +++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv @@ -0,0 +1,606 @@ +-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn +-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn +-GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn +-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn +-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn +-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn +-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn +-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn 
+-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train horn +-u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn +-zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn +02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn +0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn +0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn +0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn +0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn +0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn +10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn +1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn +1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn +1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn +1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn +1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn +1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn +1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn +26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn +2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn +2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn +2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn +2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn +-8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn +-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn +-v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn +04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn +0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn +0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn +26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn +2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn +2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn +2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn +35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn +3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn +3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn +4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm +-hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm +-rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm +0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm +0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm 
+0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car alarm +0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm +17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm +3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm +3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm +4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm +4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm +4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm +5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm +54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm +5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm +5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm +7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm +7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm +7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm +7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm +7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm +8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm +9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm +9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm +9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm +A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm +A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm +APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm +AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm +-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps +-6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps +-6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps +-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps +-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps +-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps +-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps +-AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps +-B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps +-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps +-Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps +-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps +-SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps +-h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps +-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps +-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps +-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps +-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps +-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps +-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps +-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps +03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps +0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps +0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps +0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps +0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps +0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps +0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps +0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps +0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps +-5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle +-D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle +-F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle +-FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle +-FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle +-Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle 
+-Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle +-Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle +-KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle +-NCcqKWiGus_30.000_40.000.wav 30.000 40.000 Bicycle +-NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle +-OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle +-RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle +-XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle +-XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle +-bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle +-e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle +-fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle +-fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle +-fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle +-ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle +-k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle +-k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle +-lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle +-mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle +-ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle +-nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle +-oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle +-p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle +-qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle +-0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard +-53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard +-5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard +-81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard +-9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard +-9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard +-AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard +-DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard +-DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard +-E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard +-E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard +-McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard +-N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard +-O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard +-Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard +-Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard +-aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard +-cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard +-esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard +-iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard +-lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard +-mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard +-nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard +-oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard +-s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard +-sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard +-saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard +-sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard +-tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard +-x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard +--ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +-LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren) +-Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren) 
+-d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren) +-dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren) +-fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren) +-fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +-jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren) +-lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren) +-xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren) +-yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren) +06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren) +-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren) +-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren) +-8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren) +-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren) +-QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren) +-RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren) +-USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren) +-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren) +-eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) +-hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren) +-nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren) +-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 
+-sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren +-3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren +-4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren +-6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren +-6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren +-6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren +-75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren +-FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren +-JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren +-S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren +-YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren +-c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren +-t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren +-uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren +-wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren +-zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren +03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren +0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren +06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren +0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren +0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren +0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren +0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren +0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren +0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren +0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren +-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren) +-1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren) +-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren) +-6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren) +-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren) +-AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren) +-F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren) +-GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren) +-Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren) +-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren) +-MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren) +-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren) +-UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren) +-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren) +-XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren) +-XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren) +-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren) +-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren) +-_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren) +-az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren) +-bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren) +-dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren) +-gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren) +-hA1yMrEXz0_10.000_20.000.wav 10.000 
20.000 Police car (siren) +-haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren) +-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren) +-lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren) +-lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren) +-rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren) +-rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren) +-20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming +-3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming +-4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming +-7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming +-7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming +-GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming +-MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming +-WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming +-YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming +-g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming +-gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming +-iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming +-nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming +-pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming +-pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming +-sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming +-uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming +-vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming +0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming +09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming +0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming +0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming +0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming +0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming +0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming +0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming +0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming +0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming +0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming +0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming +---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car +--330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car +--8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car +--9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car +--F70LWypIg_30.000_40.000.wav 30.000 40.000 Car +--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car +--QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car +--SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car +--Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car +--e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car +--iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car +--jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car +--jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car +--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car +--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car +--yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car +--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car +-0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car +-03ld83JliM_29.000_39.000.wav 29.000 39.000 Car +-0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car +-0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car +-0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car +-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car +-0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car +-0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car +-0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car +-0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car +-0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car +-0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car +-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car +---lTs1dxhU_30.000_40.000.wav 30.000 
40.000 Car passing by +--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by +--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by +--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by +--zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by +-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by +-0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by +-0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by +-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by +-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by +-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by +-15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by +-19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by +-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by +-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by +-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by +-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by +-2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by +-21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by +-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by +-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by +-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by +-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by +-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by +-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by +-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by +-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by +-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by +-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by +-3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by +--0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus +-0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus +-5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus +-5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus +-79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus +-7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus +-8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus +-93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus +-9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus +-9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus +-AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus +-AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus +-BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus +-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus +-CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus +-CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus +-DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus +-FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus +-Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus +-GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus +-Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus +-H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus +-HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus +-HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus +-I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus +-II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus +-LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus +-OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus +-P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus +-PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus +--12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck +--73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck +--J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck +--bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck +--ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck +--wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck +-0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck 
+-0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck +-0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck +-0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck +-0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck +-0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck +-14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck +-1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck +-1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck +-1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck +-1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck +-1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck +-26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck +-2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck +-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck +-2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck +-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck +-2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck +-2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck +-39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck +-3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck +-3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck +-3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck +-3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck +--p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle +-1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle +-1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle +-3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle +-4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle +-4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle +-5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle +-6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle +-6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle +-7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle +-7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle +-85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle +-9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle +-9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle +-BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle +-DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle +-HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle +-H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle +-HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle +-IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle +-Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle +-J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle +-KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle +-KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle +-P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle +-QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle +-S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle +-SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle +-US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle +-VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle +--H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train +-1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train +-1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train +-3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train +-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train +-6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train +-97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train +-9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train +-CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train +-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train +-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train +-JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train +-K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train 
+-KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train +-NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train +-PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train +-P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train +-Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train +-QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train +-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train +-RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train +-VH414svzI0_30.000_40.000.wav 30.000 40.000 Train +-WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train +-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train +-XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train +-Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train +-ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train +-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train +-aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train +-abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train +1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren) +-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren) +4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren +06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren) +0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren) +0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren) +17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren) +4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren) +-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car +-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car +-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car +-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car +-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car +-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car +-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car +-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car +-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car +-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car +-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car +-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car +-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car +-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car +-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car +-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car +-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car +-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car +-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car +-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car +-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car +-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car +06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car +0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car +0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car +4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car +5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car +7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car +9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car +9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car +-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by +9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by +-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus +-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck +-4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck +-60XojQWWoc_30.000_40.000.wav 30.000 
40.000 Truck +-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck +-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck +-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck +-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck +-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck +-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck +-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck +-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck +-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck +-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck +-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck +-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck +-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck +-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck +-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck +-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck +-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck +-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck +-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck +-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck +-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck +0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck +0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck +0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck +0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck +0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck +3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck +-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train +02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train +0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train +0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train +0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train +0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train +0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train +10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train +1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train +1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train +1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train +1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train +1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train +1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train +26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train +2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train +2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train +2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train +2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train +3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train diff --git a/audio_detection/audio_infer/metadata/class_labels_indices.csv b/audio_detection/audio_infer/metadata/class_labels_indices.csv new file mode 100644 index 0000000000000000000000000000000000000000..3a2767e81114adecde59992cf6607f31c1862f4c --- /dev/null +++ b/audio_detection/audio_infer/metadata/class_labels_indices.csv @@ -0,0 +1,528 @@ +index,mid,display_name +0,/m/09x0r,"Speech" +1,/m/05zppz,"Male speech, man speaking" +2,/m/02zsn,"Female speech, woman speaking" +3,/m/0ytgt,"Child speech, kid speaking" +4,/m/01h8n0,"Conversation" +5,/m/02qldy,"Narration, monologue" +6,/m/0261r1,"Babbling" +7,/m/0brhx,"Speech synthesizer" +8,/m/07p6fty,"Shout" +9,/m/07q4ntr,"Bellow" +10,/m/07rwj3x,"Whoop" +11,/m/07sr1lc,"Yell" +12,/m/04gy_2,"Battle cry" +13,/t/dd00135,"Children shouting" +14,/m/03qc9zr,"Screaming" +15,/m/02rtxlg,"Whispering" +16,/m/01j3sz,"Laughter" +17,/t/dd00001,"Baby laughter" +18,/m/07r660_,"Giggle" +19,/m/07s04w4,"Snicker" +20,/m/07sq110,"Belly laugh" +21,/m/07rgt08,"Chuckle, chortle" +22,/m/0463cq4,"Crying, sobbing" +23,/t/dd00002,"Baby cry, infant cry" 
+24,/m/07qz6j3,"Whimper" +25,/m/07qw_06,"Wail, moan" +26,/m/07plz5l,"Sigh" +27,/m/015lz1,"Singing" +28,/m/0l14jd,"Choir" +29,/m/01swy6,"Yodeling" +30,/m/02bk07,"Chant" +31,/m/01c194,"Mantra" +32,/t/dd00003,"Male singing" +33,/t/dd00004,"Female singing" +34,/t/dd00005,"Child singing" +35,/t/dd00006,"Synthetic singing" +36,/m/06bxc,"Rapping" +37,/m/02fxyj,"Humming" +38,/m/07s2xch,"Groan" +39,/m/07r4k75,"Grunt" +40,/m/01w250,"Whistling" +41,/m/0lyf6,"Breathing" +42,/m/07mzm6,"Wheeze" +43,/m/01d3sd,"Snoring" +44,/m/07s0dtb,"Gasp" +45,/m/07pyy8b,"Pant" +46,/m/07q0yl5,"Snort" +47,/m/01b_21,"Cough" +48,/m/0dl9sf8,"Throat clearing" +49,/m/01hsr_,"Sneeze" +50,/m/07ppn3j,"Sniff" +51,/m/06h7j,"Run" +52,/m/07qv_x_,"Shuffle" +53,/m/07pbtc8,"Walk, footsteps" +54,/m/03cczk,"Chewing, mastication" +55,/m/07pdhp0,"Biting" +56,/m/0939n_,"Gargling" +57,/m/01g90h,"Stomach rumble" +58,/m/03q5_w,"Burping, eructation" +59,/m/02p3nc,"Hiccup" +60,/m/02_nn,"Fart" +61,/m/0k65p,"Hands" +62,/m/025_jnm,"Finger snapping" +63,/m/0l15bq,"Clapping" +64,/m/01jg02,"Heart sounds, heartbeat" +65,/m/01jg1z,"Heart murmur" +66,/m/053hz1,"Cheering" +67,/m/028ght,"Applause" +68,/m/07rkbfh,"Chatter" +69,/m/03qtwd,"Crowd" +70,/m/07qfr4h,"Hubbub, speech noise, speech babble" +71,/t/dd00013,"Children playing" +72,/m/0jbk,"Animal" +73,/m/068hy,"Domestic animals, pets" +74,/m/0bt9lr,"Dog" +75,/m/05tny_,"Bark" +76,/m/07r_k2n,"Yip" +77,/m/07qf0zm,"Howl" +78,/m/07rc7d9,"Bow-wow" +79,/m/0ghcn6,"Growling" +80,/t/dd00136,"Whimper (dog)" +81,/m/01yrx,"Cat" +82,/m/02yds9,"Purr" +83,/m/07qrkrw,"Meow" +84,/m/07rjwbb,"Hiss" +85,/m/07r81j2,"Caterwaul" +86,/m/0ch8v,"Livestock, farm animals, working animals" +87,/m/03k3r,"Horse" +88,/m/07rv9rh,"Clip-clop" +89,/m/07q5rw0,"Neigh, whinny" +90,/m/01xq0k1,"Cattle, bovinae" +91,/m/07rpkh9,"Moo" +92,/m/0239kh,"Cowbell" +93,/m/068zj,"Pig" +94,/t/dd00018,"Oink" +95,/m/03fwl,"Goat" +96,/m/07q0h5t,"Bleat" +97,/m/07bgp,"Sheep" +98,/m/025rv6n,"Fowl" +99,/m/09b5t,"Chicken, rooster" +100,/m/07st89h,"Cluck" +101,/m/07qn5dc,"Crowing, cock-a-doodle-doo" +102,/m/01rd7k,"Turkey" +103,/m/07svc2k,"Gobble" +104,/m/09ddx,"Duck" +105,/m/07qdb04,"Quack" +106,/m/0dbvp,"Goose" +107,/m/07qwf61,"Honk" +108,/m/01280g,"Wild animals" +109,/m/0cdnk,"Roaring cats (lions, tigers)" +110,/m/04cvmfc,"Roar" +111,/m/015p6,"Bird" +112,/m/020bb7,"Bird vocalization, bird call, bird song" +113,/m/07pggtn,"Chirp, tweet" +114,/m/07sx8x_,"Squawk" +115,/m/0h0rv,"Pigeon, dove" +116,/m/07r_25d,"Coo" +117,/m/04s8yn,"Crow" +118,/m/07r5c2p,"Caw" +119,/m/09d5_,"Owl" +120,/m/07r_80w,"Hoot" +121,/m/05_wcq,"Bird flight, flapping wings" +122,/m/01z5f,"Canidae, dogs, wolves" +123,/m/06hps,"Rodents, rats, mice" +124,/m/04rmv,"Mouse" +125,/m/07r4gkf,"Patter" +126,/m/03vt0,"Insect" +127,/m/09xqv,"Cricket" +128,/m/09f96,"Mosquito" +129,/m/0h2mp,"Fly, housefly" +130,/m/07pjwq1,"Buzz" +131,/m/01h3n,"Bee, wasp, etc." 
+132,/m/09ld4,"Frog" +133,/m/07st88b,"Croak" +134,/m/078jl,"Snake" +135,/m/07qn4z3,"Rattle" +136,/m/032n05,"Whale vocalization" +137,/m/04rlf,"Music" +138,/m/04szw,"Musical instrument" +139,/m/0fx80y,"Plucked string instrument" +140,/m/0342h,"Guitar" +141,/m/02sgy,"Electric guitar" +142,/m/018vs,"Bass guitar" +143,/m/042v_gx,"Acoustic guitar" +144,/m/06w87,"Steel guitar, slide guitar" +145,/m/01glhc,"Tapping (guitar technique)" +146,/m/07s0s5r,"Strum" +147,/m/018j2,"Banjo" +148,/m/0jtg0,"Sitar" +149,/m/04rzd,"Mandolin" +150,/m/01bns_,"Zither" +151,/m/07xzm,"Ukulele" +152,/m/05148p4,"Keyboard (musical)" +153,/m/05r5c,"Piano" +154,/m/01s0ps,"Electric piano" +155,/m/013y1f,"Organ" +156,/m/03xq_f,"Electronic organ" +157,/m/03gvt,"Hammond organ" +158,/m/0l14qv,"Synthesizer" +159,/m/01v1d8,"Sampler" +160,/m/03q5t,"Harpsichord" +161,/m/0l14md,"Percussion" +162,/m/02hnl,"Drum kit" +163,/m/0cfdd,"Drum machine" +164,/m/026t6,"Drum" +165,/m/06rvn,"Snare drum" +166,/m/03t3fj,"Rimshot" +167,/m/02k_mr,"Drum roll" +168,/m/0bm02,"Bass drum" +169,/m/011k_j,"Timpani" +170,/m/01p970,"Tabla" +171,/m/01qbl,"Cymbal" +172,/m/03qtq,"Hi-hat" +173,/m/01sm1g,"Wood block" +174,/m/07brj,"Tambourine" +175,/m/05r5wn,"Rattle (instrument)" +176,/m/0xzly,"Maraca" +177,/m/0mbct,"Gong" +178,/m/016622,"Tubular bells" +179,/m/0j45pbj,"Mallet percussion" +180,/m/0dwsp,"Marimba, xylophone" +181,/m/0dwtp,"Glockenspiel" +182,/m/0dwt5,"Vibraphone" +183,/m/0l156b,"Steelpan" +184,/m/05pd6,"Orchestra" +185,/m/01kcd,"Brass instrument" +186,/m/0319l,"French horn" +187,/m/07gql,"Trumpet" +188,/m/07c6l,"Trombone" +189,/m/0l14_3,"Bowed string instrument" +190,/m/02qmj0d,"String section" +191,/m/07y_7,"Violin, fiddle" +192,/m/0d8_n,"Pizzicato" +193,/m/01xqw,"Cello" +194,/m/02fsn,"Double bass" +195,/m/085jw,"Wind instrument, woodwind instrument" +196,/m/0l14j_,"Flute" +197,/m/06ncr,"Saxophone" +198,/m/01wy6,"Clarinet" +199,/m/03m5k,"Harp" +200,/m/0395lw,"Bell" +201,/m/03w41f,"Church bell" +202,/m/027m70_,"Jingle bell" +203,/m/0gy1t2s,"Bicycle bell" +204,/m/07n_g,"Tuning fork" +205,/m/0f8s22,"Chime" +206,/m/026fgl,"Wind chime" +207,/m/0150b9,"Change ringing (campanology)" +208,/m/03qjg,"Harmonica" +209,/m/0mkg,"Accordion" +210,/m/0192l,"Bagpipes" +211,/m/02bxd,"Didgeridoo" +212,/m/0l14l2,"Shofar" +213,/m/07kc_,"Theremin" +214,/m/0l14t7,"Singing bowl" +215,/m/01hgjl,"Scratching (performance technique)" +216,/m/064t9,"Pop music" +217,/m/0glt670,"Hip hop music" +218,/m/02cz_7,"Beatboxing" +219,/m/06by7,"Rock music" +220,/m/03lty,"Heavy metal" +221,/m/05r6t,"Punk rock" +222,/m/0dls3,"Grunge" +223,/m/0dl5d,"Progressive rock" +224,/m/07sbbz2,"Rock and roll" +225,/m/05w3f,"Psychedelic rock" +226,/m/06j6l,"Rhythm and blues" +227,/m/0gywn,"Soul music" +228,/m/06cqb,"Reggae" +229,/m/01lyv,"Country" +230,/m/015y_n,"Swing music" +231,/m/0gg8l,"Bluegrass" +232,/m/02x8m,"Funk" +233,/m/02w4v,"Folk music" +234,/m/06j64v,"Middle Eastern music" +235,/m/03_d0,"Jazz" +236,/m/026z9,"Disco" +237,/m/0ggq0m,"Classical music" +238,/m/05lls,"Opera" +239,/m/02lkt,"Electronic music" +240,/m/03mb9,"House music" +241,/m/07gxw,"Techno" +242,/m/07s72n,"Dubstep" +243,/m/0283d,"Drum and bass" +244,/m/0m0jc,"Electronica" +245,/m/08cyft,"Electronic dance music" +246,/m/0fd3y,"Ambient music" +247,/m/07lnk,"Trance music" +248,/m/0g293,"Music of Latin America" +249,/m/0ln16,"Salsa music" +250,/m/0326g,"Flamenco" +251,/m/0155w,"Blues" +252,/m/05fw6t,"Music for children" +253,/m/02v2lh,"New-age music" +254,/m/0y4f8,"Vocal music" +255,/m/0z9c,"A capella" +256,/m/0164x2,"Music of 
Africa" +257,/m/0145m,"Afrobeat" +258,/m/02mscn,"Christian music" +259,/m/016cjb,"Gospel music" +260,/m/028sqc,"Music of Asia" +261,/m/015vgc,"Carnatic music" +262,/m/0dq0md,"Music of Bollywood" +263,/m/06rqw,"Ska" +264,/m/02p0sh1,"Traditional music" +265,/m/05rwpb,"Independent music" +266,/m/074ft,"Song" +267,/m/025td0t,"Background music" +268,/m/02cjck,"Theme music" +269,/m/03r5q_,"Jingle (music)" +270,/m/0l14gg,"Soundtrack music" +271,/m/07pkxdp,"Lullaby" +272,/m/01z7dr,"Video game music" +273,/m/0140xf,"Christmas music" +274,/m/0ggx5q,"Dance music" +275,/m/04wptg,"Wedding music" +276,/t/dd00031,"Happy music" +277,/t/dd00032,"Funny music" +278,/t/dd00033,"Sad music" +279,/t/dd00034,"Tender music" +280,/t/dd00035,"Exciting music" +281,/t/dd00036,"Angry music" +282,/t/dd00037,"Scary music" +283,/m/03m9d0z,"Wind" +284,/m/09t49,"Rustling leaves" +285,/t/dd00092,"Wind noise (microphone)" +286,/m/0jb2l,"Thunderstorm" +287,/m/0ngt1,"Thunder" +288,/m/0838f,"Water" +289,/m/06mb1,"Rain" +290,/m/07r10fb,"Raindrop" +291,/t/dd00038,"Rain on surface" +292,/m/0j6m2,"Stream" +293,/m/0j2kx,"Waterfall" +294,/m/05kq4,"Ocean" +295,/m/034srq,"Waves, surf" +296,/m/06wzb,"Steam" +297,/m/07swgks,"Gurgling" +298,/m/02_41,"Fire" +299,/m/07pzfmf,"Crackle" +300,/m/07yv9,"Vehicle" +301,/m/019jd,"Boat, Water vehicle" +302,/m/0hsrw,"Sailboat, sailing ship" +303,/m/056ks2,"Rowboat, canoe, kayak" +304,/m/02rlv9,"Motorboat, speedboat" +305,/m/06q74,"Ship" +306,/m/012f08,"Motor vehicle (road)" +307,/m/0k4j,"Car" +308,/m/0912c9,"Vehicle horn, car horn, honking" +309,/m/07qv_d5,"Toot" +310,/m/02mfyn,"Car alarm" +311,/m/04gxbd,"Power windows, electric windows" +312,/m/07rknqz,"Skidding" +313,/m/0h9mv,"Tire squeal" +314,/t/dd00134,"Car passing by" +315,/m/0ltv,"Race car, auto racing" +316,/m/07r04,"Truck" +317,/m/0gvgw0,"Air brake" +318,/m/05x_td,"Air horn, truck horn" +319,/m/02rhddq,"Reversing beeps" +320,/m/03cl9h,"Ice cream truck, ice cream van" +321,/m/01bjv,"Bus" +322,/m/03j1ly,"Emergency vehicle" +323,/m/04qvtq,"Police car (siren)" +324,/m/012n7d,"Ambulance (siren)" +325,/m/012ndj,"Fire engine, fire truck (siren)" +326,/m/04_sv,"Motorcycle" +327,/m/0btp2,"Traffic noise, roadway noise" +328,/m/06d_3,"Rail transport" +329,/m/07jdr,"Train" +330,/m/04zmvq,"Train whistle" +331,/m/0284vy3,"Train horn" +332,/m/01g50p,"Railroad car, train wagon" +333,/t/dd00048,"Train wheels squealing" +334,/m/0195fx,"Subway, metro, underground" +335,/m/0k5j,"Aircraft" +336,/m/014yck,"Aircraft engine" +337,/m/04229,"Jet engine" +338,/m/02l6bg,"Propeller, airscrew" +339,/m/09ct_,"Helicopter" +340,/m/0cmf2,"Fixed-wing aircraft, airplane" +341,/m/0199g,"Bicycle" +342,/m/06_fw,"Skateboard" +343,/m/02mk9,"Engine" +344,/t/dd00065,"Light engine (high frequency)" +345,/m/08j51y,"Dental drill, dentist's drill" +346,/m/01yg9g,"Lawn mower" +347,/m/01j4z9,"Chainsaw" +348,/t/dd00066,"Medium engine (mid frequency)" +349,/t/dd00067,"Heavy engine (low frequency)" +350,/m/01h82_,"Engine knocking" +351,/t/dd00130,"Engine starting" +352,/m/07pb8fc,"Idling" +353,/m/07q2z82,"Accelerating, revving, vroom" +354,/m/02dgv,"Door" +355,/m/03wwcy,"Doorbell" +356,/m/07r67yg,"Ding-dong" +357,/m/02y_763,"Sliding door" +358,/m/07rjzl8,"Slam" +359,/m/07r4wb8,"Knock" +360,/m/07qcpgn,"Tap" +361,/m/07q6cd_,"Squeak" +362,/m/0642b4,"Cupboard open or close" +363,/m/0fqfqc,"Drawer open or close" +364,/m/04brg2,"Dishes, pots, and pans" +365,/m/023pjk,"Cutlery, silverware" +366,/m/07pn_8q,"Chopping (food)" +367,/m/0dxrf,"Frying (food)" +368,/m/0fx9l,"Microwave oven" 
+369,/m/02pjr4,"Blender" +370,/m/02jz0l,"Water tap, faucet" +371,/m/0130jx,"Sink (filling or washing)" +372,/m/03dnzn,"Bathtub (filling or washing)" +373,/m/03wvsk,"Hair dryer" +374,/m/01jt3m,"Toilet flush" +375,/m/012xff,"Toothbrush" +376,/m/04fgwm,"Electric toothbrush" +377,/m/0d31p,"Vacuum cleaner" +378,/m/01s0vc,"Zipper (clothing)" +379,/m/03v3yw,"Keys jangling" +380,/m/0242l,"Coin (dropping)" +381,/m/01lsmm,"Scissors" +382,/m/02g901,"Electric shaver, electric razor" +383,/m/05rj2,"Shuffling cards" +384,/m/0316dw,"Typing" +385,/m/0c2wf,"Typewriter" +386,/m/01m2v,"Computer keyboard" +387,/m/081rb,"Writing" +388,/m/07pp_mv,"Alarm" +389,/m/07cx4,"Telephone" +390,/m/07pp8cl,"Telephone bell ringing" +391,/m/01hnzm,"Ringtone" +392,/m/02c8p,"Telephone dialing, DTMF" +393,/m/015jpf,"Dial tone" +394,/m/01z47d,"Busy signal" +395,/m/046dlr,"Alarm clock" +396,/m/03kmc9,"Siren" +397,/m/0dgbq,"Civil defense siren" +398,/m/030rvx,"Buzzer" +399,/m/01y3hg,"Smoke detector, smoke alarm" +400,/m/0c3f7m,"Fire alarm" +401,/m/04fq5q,"Foghorn" +402,/m/0l156k,"Whistle" +403,/m/06hck5,"Steam whistle" +404,/t/dd00077,"Mechanisms" +405,/m/02bm9n,"Ratchet, pawl" +406,/m/01x3z,"Clock" +407,/m/07qjznt,"Tick" +408,/m/07qjznl,"Tick-tock" +409,/m/0l7xg,"Gears" +410,/m/05zc1,"Pulleys" +411,/m/0llzx,"Sewing machine" +412,/m/02x984l,"Mechanical fan" +413,/m/025wky1,"Air conditioning" +414,/m/024dl,"Cash register" +415,/m/01m4t,"Printer" +416,/m/0dv5r,"Camera" +417,/m/07bjf,"Single-lens reflex camera" +418,/m/07k1x,"Tools" +419,/m/03l9g,"Hammer" +420,/m/03p19w,"Jackhammer" +421,/m/01b82r,"Sawing" +422,/m/02p01q,"Filing (rasp)" +423,/m/023vsd,"Sanding" +424,/m/0_ksk,"Power tool" +425,/m/01d380,"Drill" +426,/m/014zdl,"Explosion" +427,/m/032s66,"Gunshot, gunfire" +428,/m/04zjc,"Machine gun" +429,/m/02z32qm,"Fusillade" +430,/m/0_1c,"Artillery fire" +431,/m/073cg4,"Cap gun" +432,/m/0g6b5,"Fireworks" +433,/g/122z_qxw,"Firecracker" +434,/m/07qsvvw,"Burst, pop" +435,/m/07pxg6y,"Eruption" +436,/m/07qqyl4,"Boom" +437,/m/083vt,"Wood" +438,/m/07pczhz,"Chop" +439,/m/07pl1bw,"Splinter" +440,/m/07qs1cx,"Crack" +441,/m/039jq,"Glass" +442,/m/07q7njn,"Chink, clink" +443,/m/07rn7sz,"Shatter" +444,/m/04k94,"Liquid" +445,/m/07rrlb6,"Splash, splatter" +446,/m/07p6mqd,"Slosh" +447,/m/07qlwh6,"Squish" +448,/m/07r5v4s,"Drip" +449,/m/07prgkl,"Pour" +450,/m/07pqc89,"Trickle, dribble" +451,/t/dd00088,"Gush" +452,/m/07p7b8y,"Fill (with liquid)" +453,/m/07qlf79,"Spray" +454,/m/07ptzwd,"Pump (liquid)" +455,/m/07ptfmf,"Stir" +456,/m/0dv3j,"Boiling" +457,/m/0790c,"Sonar" +458,/m/0dl83,"Arrow" +459,/m/07rqsjt,"Whoosh, swoosh, swish" +460,/m/07qnq_y,"Thump, thud" +461,/m/07rrh0c,"Thunk" +462,/m/0b_fwt,"Electronic tuner" +463,/m/02rr_,"Effects unit" +464,/m/07m2kt,"Chorus effect" +465,/m/018w8,"Basketball bounce" +466,/m/07pws3f,"Bang" +467,/m/07ryjzk,"Slap, smack" +468,/m/07rdhzs,"Whack, thwack" +469,/m/07pjjrj,"Smash, crash" +470,/m/07pc8lb,"Breaking" +471,/m/07pqn27,"Bouncing" +472,/m/07rbp7_,"Whip" +473,/m/07pyf11,"Flap" +474,/m/07qb_dv,"Scratch" +475,/m/07qv4k0,"Scrape" +476,/m/07pdjhy,"Rub" +477,/m/07s8j8t,"Roll" +478,/m/07plct2,"Crushing" +479,/t/dd00112,"Crumpling, crinkling" +480,/m/07qcx4z,"Tearing" +481,/m/02fs_r,"Beep, bleep" +482,/m/07qwdck,"Ping" +483,/m/07phxs1,"Ding" +484,/m/07rv4dm,"Clang" +485,/m/07s02z0,"Squeal" +486,/m/07qh7jl,"Creak" +487,/m/07qwyj0,"Rustle" +488,/m/07s34ls,"Whir" +489,/m/07qmpdm,"Clatter" +490,/m/07p9k1k,"Sizzle" +491,/m/07qc9xj,"Clicking" +492,/m/07rwm0c,"Clickety-clack" +493,/m/07phhsh,"Rumble" +494,/m/07qyrcz,"Plop" 
+495,/m/07qfgpx,"Jingle, tinkle" +496,/m/07rcgpl,"Hum" +497,/m/07p78v5,"Zing" +498,/t/dd00121,"Boing" +499,/m/07s12q4,"Crunch" +500,/m/028v0c,"Silence" +501,/m/01v_m0,"Sine wave" +502,/m/0b9m1,"Harmonic" +503,/m/0hdsk,"Chirp tone" +504,/m/0c1dj,"Sound effect" +505,/m/07pt_g0,"Pulse" +506,/t/dd00125,"Inside, small room" +507,/t/dd00126,"Inside, large room or hall" +508,/t/dd00127,"Inside, public space" +509,/t/dd00128,"Outside, urban or manmade" +510,/t/dd00129,"Outside, rural or natural" +511,/m/01b9nn,"Reverberation" +512,/m/01jnbd,"Echo" +513,/m/096m7z,"Noise" +514,/m/06_y0by,"Environmental noise" +515,/m/07rgkc5,"Static" +516,/m/06xkwv,"Mains hum" +517,/m/0g12c5,"Distortion" +518,/m/08p9q4,"Sidetone" +519,/m/07szfh9,"Cacophony" +520,/m/0chx_,"White noise" +521,/m/0cj0r,"Pink noise" +522,/m/07p_0gm,"Throbbing" +523,/m/01jwx6,"Vibration" +524,/m/07c52,"Television" +525,/m/06bz3,"Radio" +526,/m/07hvw1,"Field recording" diff --git a/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc b/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4699888fb861c2ffee9c8575b4116eba8e7a41b6 Binary files /dev/null and b/audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc differ diff --git a/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc b/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b4489e8b01c6cced77a08735295746c01e8f831 Binary files /dev/null and b/audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc differ diff --git a/audio_detection/audio_infer/pytorch/evaluate.py b/audio_detection/audio_infer/pytorch/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1fa38eedd9e9cd2580143ceb92aba8f81becf3 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/evaluate.py @@ -0,0 +1,42 @@ +from sklearn import metrics + +from pytorch_utils import forward + + +class Evaluator(object): + def __init__(self, model): + """Evaluator. + + Args: + model: object + """ + self.model = model + + def evaluate(self, data_loader): + """Forward evaluation data and calculate statistics. 
+ + Args: + data_loader: object + + Returns: + statistics: dict, + {'average_precision': (classes_num,), 'auc': (classes_num,)} + """ + + # Forward + output_dict = forward( + model=self.model, + generator=data_loader, + return_target=True) + + clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num) + target = output_dict['target'] # (audios_num, classes_num) + + average_precision = metrics.average_precision_score( + target, clipwise_output, average=None) + + auc = metrics.roc_auc_score(target, clipwise_output, average=None) + + statistics = {'average_precision': average_precision, 'auc': auc} + + return statistics \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/finetune_template.py b/audio_detection/audio_infer/pytorch/finetune_template.py new file mode 100644 index 0000000000000000000000000000000000000000..dd43e462c47857f805b1ef4d345711354a1cff3d --- /dev/null +++ b/audio_detection/audio_infer/pytorch/finetune_template.py @@ -0,0 +1,127 @@ +import os +import sys +sys.path.insert(1, os.path.join(sys.path[0], '../utils')) +import numpy as np +import argparse +import h5py +import math +import time +import logging +import matplotlib.pyplot as plt + +import torch +torch.backends.cudnn.benchmark=True +torch.manual_seed(0) +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data + +from utilities import get_filename +from models import * +import config + + +class Transfer_Cnn14(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num, freeze_base): + """Classifier for a new task using pretrained Cnn14 as a sub module. + """ + super(Transfer_Cnn14, self).__init__() + audioset_classes_num = 527 + + self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, audioset_classes_num) + + # Transfer to another task layer + self.fc_transfer = nn.Linear(2048, classes_num, bias=True) + + if freeze_base: + # Freeze AudioSet pretrained layers + for param in self.base.parameters(): + param.requires_grad = False + + self.init_weights() + + def init_weights(self): + init_layer(self.fc_transfer) + + def load_from_pretrain(self, pretrained_checkpoint_path): + checkpoint = torch.load(pretrained_checkpoint_path) + self.base.load_state_dict(checkpoint['model']) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, data_length) + """ + output_dict = self.base(input, mixup_lambda) + embedding = output_dict['embedding'] + + clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1) + output_dict['clipwise_output'] = clipwise_output + + return output_dict + + +def train(args): + + # Arugments & parameters + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + pretrained_checkpoint_path = args.pretrained_checkpoint_path + freeze_base = args.freeze_base + device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu' + + classes_num = config.classes_num + pretrain = True if pretrained_checkpoint_path else False + + # Model + Model = eval(model_type) + model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax, + classes_num, freeze_base) + + # Load pretrained model + if pretrain: + logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path)) + model.load_from_pretrain(pretrained_checkpoint_path) + + # Parallel + print('GPU number: 
{}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + + if 'cuda' in device: + model.to(device) + + print('Load pretrained model successfully!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Example of parser. ') + subparsers = parser.add_subparsers(dest='mode') + + # Train + parser_train = subparsers.add_parser('train') + parser_train.add_argument('--sample_rate', type=int, required=True) + parser_train.add_argument('--window_size', type=int, required=True) + parser_train.add_argument('--hop_size', type=int, required=True) + parser_train.add_argument('--mel_bins', type=int, required=True) + parser_train.add_argument('--fmin', type=int, required=True) + parser_train.add_argument('--fmax', type=int, required=True) + parser_train.add_argument('--model_type', type=str, required=True) + parser_train.add_argument('--pretrained_checkpoint_path', type=str) + parser_train.add_argument('--freeze_base', action='store_true', default=False) + parser_train.add_argument('--cuda', action='store_true', default=False) + + # Parse arguments + args = parser.parse_args() + args.filename = get_filename(__file__) + + if args.mode == 'train': + train(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/inference.py b/audio_detection/audio_infer/pytorch/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..49dc75f740aec7be287eab70bae1f7677ccc4662 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/inference.py @@ -0,0 +1,206 @@ +import os +import sys +sys.path.insert(1, os.path.join(sys.path[0], '../utils')) +import numpy as np +import argparse +import librosa +import matplotlib.pyplot as plt +import torch + +from utilities import create_folder, get_filename +from models import * +from pytorch_utils import move_data_to_device +import config + +def audio_tagging(args): + """Inference audio tagging result of an audio clip. 
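+    Loads the checkpoint, runs a forward pass on the clip, prints the top-10 class probabilities and returns (clipwise_output, labels).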
+ """ + + # Arugments & parameters + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + checkpoint_path = args.checkpoint_path + audio_path = args.audio_path + device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu') + + classes_num = config.classes_num + labels = config.labels + + # Model + Model = eval(model_type) + model = Model(sample_rate=sample_rate, window_size=window_size, + hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax, + classes_num=classes_num) + + checkpoint = torch.load(checkpoint_path, map_location=device) + model.load_state_dict(checkpoint['model']) + + # Parallel + if 'cuda' in str(device): + model.to(device) + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + else: + print('Using CPU.') + + # Load audio + (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True) + + waveform = waveform[None, :] # (1, audio_length) + waveform = move_data_to_device(waveform, device) + + # Forward + with torch.no_grad(): + model.eval() + batch_output_dict = model(waveform, None) + + clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0] + """(classes_num,)""" + + sorted_indexes = np.argsort(clipwise_output)[::-1] + + # Print audio tagging top probabilities + for k in range(10): + print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]], + clipwise_output[sorted_indexes[k]])) + + # Print embedding + if 'embedding' in batch_output_dict.keys(): + embedding = batch_output_dict['embedding'].data.cpu().numpy()[0] + print('embedding: {}'.format(embedding.shape)) + + return clipwise_output, labels + + +def sound_event_detection(args): + """Inference sound event detection result of an audio clip. 
+ """ + + # Arugments & parameters + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + checkpoint_path = args.checkpoint_path + audio_path = args.audio_path + device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu') + + classes_num = config.classes_num + labels = config.labels + frames_per_second = sample_rate // hop_size + + # Paths + fig_path = os.path.join('results', '{}.png'.format(get_filename(audio_path))) + create_folder(os.path.dirname(fig_path)) + + # Model + Model = eval(model_type) + model = Model(sample_rate=sample_rate, window_size=window_size, + hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax, + classes_num=classes_num) + + checkpoint = torch.load(checkpoint_path, map_location=device) + model.load_state_dict(checkpoint['model']) + + # Parallel + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + + if 'cuda' in str(device): + model.to(device) + + # Load audio + (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True) + + waveform = waveform[None, :] # (1, audio_length) + waveform = move_data_to_device(waveform, device) + + # Forward + with torch.no_grad(): + model.eval() + batch_output_dict = model(waveform, None) + + framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0] + """(time_steps, classes_num)""" + + print('Sound event detection result (time_steps x classes_num): {}'.format( + framewise_output.shape)) + + sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1] + + top_k = 10 # Show top results + top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]] + """(time_steps, top_k)""" + + # Plot result + stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=window_size, + hop_length=hop_size, window='hann', center=True) + frames_num = stft.shape[-1] + + fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4)) + axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet') + axs[0].set_ylabel('Frequency bins') + axs[0].set_title('Log spectrogram') + axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1) + axs[1].xaxis.set_ticks(np.arange(0, frames_num, frames_per_second)) + axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / frames_per_second)) + axs[1].yaxis.set_ticks(np.arange(0, top_k)) + axs[1].yaxis.set_ticklabels(np.array(labels)[sorted_indexes[0 : top_k]]) + axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3) + axs[1].set_xlabel('Seconds') + axs[1].xaxis.set_ticks_position('bottom') + + plt.tight_layout() + plt.savefig(fig_path) + print('Save sound event detection visualization to {}'.format(fig_path)) + + return framewise_output, labels + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Example of parser. 
') + subparsers = parser.add_subparsers(dest='mode') + + parser_at = subparsers.add_parser('audio_tagging') + parser_at.add_argument('--sample_rate', type=int, default=32000) + parser_at.add_argument('--window_size', type=int, default=1024) + parser_at.add_argument('--hop_size', type=int, default=320) + parser_at.add_argument('--mel_bins', type=int, default=64) + parser_at.add_argument('--fmin', type=int, default=50) + parser_at.add_argument('--fmax', type=int, default=14000) + parser_at.add_argument('--model_type', type=str, required=True) + parser_at.add_argument('--checkpoint_path', type=str, required=True) + parser_at.add_argument('--audio_path', type=str, required=True) + parser_at.add_argument('--cuda', action='store_true', default=False) + + parser_sed = subparsers.add_parser('sound_event_detection') + parser_sed.add_argument('--sample_rate', type=int, default=32000) + parser_sed.add_argument('--window_size', type=int, default=1024) + parser_sed.add_argument('--hop_size', type=int, default=320) + parser_sed.add_argument('--mel_bins', type=int, default=64) + parser_sed.add_argument('--fmin', type=int, default=50) + parser_sed.add_argument('--fmax', type=int, default=14000) + parser_sed.add_argument('--model_type', type=str, required=True) + parser_sed.add_argument('--checkpoint_path', type=str, required=True) + parser_sed.add_argument('--audio_path', type=str, required=True) + parser_sed.add_argument('--cuda', action='store_true', default=False) + + args = parser.parse_args() + + if args.mode == 'audio_tagging': + audio_tagging(args) + + elif args.mode == 'sound_event_detection': + sound_event_detection(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/losses.py b/audio_detection/audio_infer/pytorch/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..587e8a64f2593e4a72c1a29cf374c1e24e20c366 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/losses.py @@ -0,0 +1,14 @@ +import torch +import torch.nn.functional as F + + +def clip_bce(output_dict, target_dict): + """Binary crossentropy loss. 
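+    Computed between the clipwise output probabilities and the multi-label targets.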
+ """ + return F.binary_cross_entropy( + output_dict['clipwise_output'], target_dict['target']) + + +def get_loss_func(loss_type): + if loss_type == 'clip_bce': + return clip_bce \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/main.py b/audio_detection/audio_infer/pytorch/main.py new file mode 100644 index 0000000000000000000000000000000000000000..358293521706ff525f6f1b1274085a08236394ff --- /dev/null +++ b/audio_detection/audio_infer/pytorch/main.py @@ -0,0 +1,378 @@ +import os +import sys +sys.path.insert(1, os.path.join(sys.path[0], '../utils')) +import numpy as np +import argparse +import time +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data + +from utilities import (create_folder, get_filename, create_logging, Mixup, + StatisticsContainer) +from models import (PVT, PVT2, PVT_lr, PVT_nopretrain, PVT_2layer, Cnn14, Cnn14_no_specaug, Cnn14_no_dropout, + Cnn6, Cnn10, ResNet22, ResNet38, ResNet54, Cnn14_emb512, Cnn14_emb128, + Cnn14_emb32, MobileNetV1, MobileNetV2, LeeNet11, LeeNet24, DaiNet19, + Res1dNet31, Res1dNet51, Wavegram_Cnn14, Wavegram_Logmel_Cnn14, + Wavegram_Logmel128_Cnn14, Cnn14_16k, Cnn14_8k, Cnn14_mel32, Cnn14_mel128, + Cnn14_mixup_time_domain, Cnn14_DecisionLevelMax, Cnn14_DecisionLevelAtt, Cnn6_Transformer, GLAM, GLAM2, GLAM3, Cnn4, EAT) +#from models_test import (PVT_test) +#from models1 import (PVT1) +#from models_vig import (VIG, VIG2) +#from models_vvt import (VVT) +#from models2 import (MPVIT, MPVIT2) +#from models_reshape import (PVT_reshape, PVT_tscam) +#from models_swin import (Swin, Swin_nopretrain) +#from models_swin2 import (Swin2) +#from models_van import (Van, Van_tiny) +#from models_focal import (Focal) +#from models_cross import (Cross) +#from models_cov import (Cov) +#from models_cnn import (Cnn_light) +#from models_twins import (Twins) +#from models_cmt import (Cmt, Cmt1) +#from models_shunted import (Shunted) +#from models_quadtree import (Quadtree, Quadtree2, Quadtree_nopretrain) +#from models_davit import (Davit_tscam, Davit, Davit_nopretrain) +from pytorch_utils import (move_data_to_device, count_parameters, count_flops, + do_mixup) +from data_generator import (AudioSetDataset, TrainSampler, BalancedTrainSampler, + AlternateTrainSampler, EvaluateSampler, collate_fn) +from evaluate import Evaluator +import config +from losses import get_loss_func + + +def train(args): + """Train AudioSet tagging model. 
+ + Args: + dataset_dir: str + workspace: str + data_type: 'balanced_train' | 'full_train' + window_size: int + hop_size: int + mel_bins: int + model_type: str + loss_type: 'clip_bce' + balanced: 'none' | 'balanced' | 'alternate' + augmentation: 'none' | 'mixup' + batch_size: int + learning_rate: float + resume_iteration: int + early_stop: int + accumulation_steps: int + cuda: bool + """ + + # Arugments & parameters + workspace = args.workspace + data_type = args.data_type + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + loss_type = args.loss_type + balanced = args.balanced + augmentation = args.augmentation + batch_size = args.batch_size + learning_rate = args.learning_rate + resume_iteration = args.resume_iteration + early_stop = args.early_stop + device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu') + filename = args.filename + + num_workers = 8 + clip_samples = config.clip_samples + classes_num = config.classes_num + loss_func = get_loss_func(loss_type) + + # Paths + black_list_csv = None + + train_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes', + '{}.h5'.format(data_type)) + + eval_bal_indexes_hdf5_path = os.path.join(workspace, + 'hdf5s', 'indexes', 'balanced_train.h5') + + eval_test_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes', + 'eval.h5') + + checkpoints_dir = os.path.join(workspace, 'checkpoints', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size)) + create_folder(checkpoints_dir) + + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + create_folder(os.path.dirname(statistics_path)) + + logs_dir = os.path.join(workspace, 'logs', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size)) + + create_logging(logs_dir, filemode='w') + logging.info(args) + + if 'cuda' in str(device): + logging.info('Using GPU.') + device = 'cuda' + else: + logging.info('Using CPU. 
Set --cuda flag to use GPU.') + device = 'cpu' + + # Model + Model = eval(model_type) + model = Model(sample_rate=sample_rate, window_size=window_size, + hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax, + classes_num=classes_num) + total = sum(p.numel() for p in model.parameters()) + print("Total params: %.2fM" % (total/1e6)) + logging.info("Total params: %.2fM" % (total/1e6)) + #params_num = count_parameters(model) + # flops_num = count_flops(model, clip_samples) + #logging.info('Parameters num: {}'.format(params_num)) + # logging.info('Flops num: {:.3f} G'.format(flops_num / 1e9)) + + # Dataset will be used by DataLoader later. Dataset takes a meta as input + # and return a waveform and a target. + dataset = AudioSetDataset(sample_rate=sample_rate) + + # Train sampler + if balanced == 'none': + Sampler = TrainSampler + elif balanced == 'balanced': + Sampler = BalancedTrainSampler + elif balanced == 'alternate': + Sampler = AlternateTrainSampler + + train_sampler = Sampler( + indexes_hdf5_path=train_indexes_hdf5_path, + batch_size=batch_size * 2 if 'mixup' in augmentation else batch_size, + black_list_csv=black_list_csv) + + # Evaluate sampler + eval_bal_sampler = EvaluateSampler( + indexes_hdf5_path=eval_bal_indexes_hdf5_path, batch_size=batch_size) + + eval_test_sampler = EvaluateSampler( + indexes_hdf5_path=eval_test_indexes_hdf5_path, batch_size=batch_size) + + # Data loader + train_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_sampler=train_sampler, collate_fn=collate_fn, + num_workers=num_workers, pin_memory=True) + + eval_bal_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_sampler=eval_bal_sampler, collate_fn=collate_fn, + num_workers=num_workers, pin_memory=True) + + eval_test_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_sampler=eval_test_sampler, collate_fn=collate_fn, + num_workers=num_workers, pin_memory=True) + mix=0.5 + if 'mixup' in augmentation: + mixup_augmenter = Mixup(mixup_alpha=mix) + print(mix) + logging.info(mix) + + # Evaluator + evaluator = Evaluator(model=model) + + # Statistics + statistics_container = StatisticsContainer(statistics_path) + + # Optimizer + optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.05, amsgrad=True) + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, min_lr=1e-06, verbose=True) + train_bgn_time = time.time() + + # Resume training + if resume_iteration > 0: + resume_checkpoint_path = os.path.join(workspace, 'checkpoints', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + '{}_iterations.pth'.format(resume_iteration)) + + logging.info('Loading checkpoint {}'.format(resume_checkpoint_path)) + checkpoint = torch.load(resume_checkpoint_path) + model.load_state_dict(checkpoint['model']) + train_sampler.load_state_dict(checkpoint['sampler']) + statistics_container.load_state_dict(resume_iteration) + iteration = checkpoint['iteration'] + + else: + iteration = 0 + + # Parallel + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + + if 'cuda' in str(device): + model.to(device) + + if resume_iteration: + 
optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print(optimizer.state_dict()['param_groups'][0]['lr']) + + time1 = time.time() + + for batch_data_dict in train_loader: + """batch_data_dict: { + 'audio_name': (batch_size [*2 if mixup],), + 'waveform': (batch_size [*2 if mixup], clip_samples), + 'target': (batch_size [*2 if mixup], classes_num), + (ifexist) 'mixup_lambda': (batch_size * 2,)} + """ + + # Evaluate + if (iteration % 2000 == 0 and iteration >= resume_iteration) or (iteration == 0): + train_fin_time = time.time() + + bal_statistics = evaluator.evaluate(eval_bal_loader) + test_statistics = evaluator.evaluate(eval_test_loader) + + logging.info('Validate bal mAP: {:.3f}'.format( + np.mean(bal_statistics['average_precision']))) + + logging.info('Validate test mAP: {:.3f}'.format( + np.mean(test_statistics['average_precision']))) + + statistics_container.append(iteration, bal_statistics, data_type='bal') + statistics_container.append(iteration, test_statistics, data_type='test') + statistics_container.dump() + + train_time = train_fin_time - train_bgn_time + validate_time = time.time() - train_fin_time + + logging.info( + 'iteration: {}, train time: {:.3f} s, validate time: {:.3f} s' + ''.format(iteration, train_time, validate_time)) + + logging.info('------------------------------------') + + train_bgn_time = time.time() + + # Save model + if iteration % 2000 == 0: + checkpoint = { + 'iteration': iteration, + 'model': model.module.state_dict(), + 'sampler': train_sampler.state_dict(), + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict()} + + checkpoint_path = os.path.join( + checkpoints_dir, '{}_iterations.pth'.format(iteration)) + + torch.save(checkpoint, checkpoint_path) + logging.info('Model saved to {}'.format(checkpoint_path)) + + # Mixup lambda + if 'mixup' in augmentation: + batch_data_dict['mixup_lambda'] = mixup_augmenter.get_lambda( + batch_size=len(batch_data_dict['waveform'])) + + # Move data to device + for key in batch_data_dict.keys(): + batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device) + + # Forward + model.train() + + if 'mixup' in augmentation: + batch_output_dict = model(batch_data_dict['waveform'], + batch_data_dict['mixup_lambda']) + """{'clipwise_output': (batch_size, classes_num), ...}""" + + batch_target_dict = {'target': do_mixup(batch_data_dict['target'], + batch_data_dict['mixup_lambda'])} + """{'target': (batch_size, classes_num)}""" + else: + batch_output_dict = model(batch_data_dict['waveform'], None) + """{'clipwise_output': (batch_size, classes_num), ...}""" + + batch_target_dict = {'target': batch_data_dict['target']} + """{'target': (batch_size, classes_num)}""" + + # Loss + loss = loss_func(batch_output_dict, batch_target_dict) + # Backward + loss.backward() + + optimizer.step() + optimizer.zero_grad() + + if iteration % 10 == 0: + print(iteration, loss) + #print('--- Iteration: {}, train time: {:.3f} s / 10 iterations ---'\ + # .format(iteration, time.time() - time1)) + #time1 = time.time() + + if iteration % 2000 == 0: + scheduler.step(np.mean(test_statistics['average_precision'])) + print(optimizer.state_dict()['param_groups'][0]['lr']) + logging.info(optimizer.state_dict()['param_groups'][0]['lr']) + + # Stop learning + if iteration == early_stop: + break + + iteration += 1 + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Example of parser. 
') + subparsers = parser.add_subparsers(dest='mode') + + parser_train = subparsers.add_parser('train') + parser_train.add_argument('--workspace', type=str, required=True) + parser_train.add_argument('--data_type', type=str, default='full_train', choices=['balanced_train', 'full_train']) + parser_train.add_argument('--sample_rate', type=int, default=32000) + parser_train.add_argument('--window_size', type=int, default=1024) + parser_train.add_argument('--hop_size', type=int, default=320) + parser_train.add_argument('--mel_bins', type=int, default=64) + parser_train.add_argument('--fmin', type=int, default=50) + parser_train.add_argument('--fmax', type=int, default=14000) + parser_train.add_argument('--model_type', type=str, required=True) + parser_train.add_argument('--loss_type', type=str, default='clip_bce', choices=['clip_bce']) + parser_train.add_argument('--balanced', type=str, default='balanced', choices=['none', 'balanced', 'alternate']) + parser_train.add_argument('--augmentation', type=str, default='mixup', choices=['none', 'mixup']) + parser_train.add_argument('--batch_size', type=int, default=32) + parser_train.add_argument('--learning_rate', type=float, default=1e-3) + parser_train.add_argument('--resume_iteration', type=int, default=0) + parser_train.add_argument('--early_stop', type=int, default=1000000) + parser_train.add_argument('--cuda', action='store_true', default=False) + + args = parser.parse_args() + args.filename = get_filename(__file__) + + if args.mode == 'train': + train(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/models.py b/audio_detection/audio_infer/pytorch/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf5456d1ee9a26a4afe58cea2b11ad78033e01e --- /dev/null +++ b/audio_detection/audio_infer/pytorch/models.py @@ -0,0 +1,951 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +from torchlibrosa.augmentation import SpecAugmentation + +from audio_infer.pytorch.pytorch_utils import do_mixup, interpolate, pad_framewise_output +import os +import sys +import math +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.parameter import Parameter +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +from torchlibrosa.augmentation import SpecAugmentation +from audio_infer.pytorch.pytorch_utils import do_mixup +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import warnings +from functools import partial +#from mmdet.models.builder import BACKBONES +from mmdet.utils import get_root_logger +from mmcv.runner import load_checkpoint +os.environ['TORCH_HOME'] = '../pretrained_models' +from copy import deepcopy +from timm.models.helpers import load_pretrained +from torch.cuda.amp import autocast +from collections import OrderedDict +import io +import re +from mmcv.runner import _load_checkpoint, load_state_dict +import mmcv.runner +import copy +import random +from einops import rearrange +from einops.layers.torch import Rearrange, Reduce +from torch import nn, einsum + + +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. 
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Default: strip + the prefix 'module.' by [(r'^module\\.', '')]. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + checkpoint = _load_checkpoint(filename, map_location, logger) + new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + +def init_layer(layer): + """Initialize a Linear or Convolutional layer. """ + nn.init.xavier_uniform_(layer.weight) + + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) + + +def init_bn(bn): + """Initialize a Batchnorm layer. """ + bn.bias.data.fill_(0.) + bn.weight.data.fill_(1.) + + + + +class TimeShift(nn.Module): + def __init__(self, mean, std): + super().__init__() + self.mean = mean + self.std = std + + def forward(self, x): + if self.training: + shift = torch.empty(1).normal_(self.mean, self.std).int().item() + x = torch.roll(x, shift, dims=2) + return x + +class LinearSoftPool(nn.Module): + """LinearSoftPool + Linear softmax, takes logits and returns a probability, near to the actual maximum value. 
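+    Computes (y ** 2).sum(dim) / y.sum(dim) along the pooling dimension (time by default).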
+ Taken from the paper: + A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling + https://arxiv.org/abs/1810.09050 + """ + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, time_decision): + return (time_decision**2).sum(self.pooldim) / time_decision.sum( + self.pooldim) + +class PVT(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + #self.temp_pool = LinearSoftPool() + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + #clipwise_output = torch.mean(framewise_output, dim=1) + #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + x = framewise_output.transpose(1, 2).contiguous() + x = self.avgpool(x) + clipwise_output = torch.flatten(x, 1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + +class PVT2(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, 
classes_num): + + super(PVT2, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + #self.temp_pool = LinearSoftPool() + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + #x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + clipwise_output = torch.mean(framewise_output, dim=1) + #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + +class PVT_2layer(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT_2layer, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + 
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128], + depths=[3, 4], + num_heads=[1, 2], + mlp_ratios=[8, 8], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=2, + pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + #self.temp_pool = LinearSoftPool() + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.fc_audioset = nn.Linear(128, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 8 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + #clipwise_output = torch.mean(framewise_output, dim=1) + #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + x = framewise_output.transpose(1, 2).contiguous() + x = self.avgpool(x) + clipwise_output = torch.flatten(x, 1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + +class PVT_lr(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT_lr, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + 
norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + self.temp_pool = LinearSoftPool() + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + + +class PVT_nopretrain(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT_nopretrain, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + self.temp_pool = LinearSoftPool() + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = 
self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.linear = linear + if self.linear: + self.relu = nn.ReLU() + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
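+        # Spatial-reduction attention: when sr_ratio > 1 (or in linear mode via adaptive pooling), keys and values are computed on a downsampled feature map to reduce attention cost.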
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.pool = nn.AdaptiveAvgPool2d(7) + self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Pooling(nn.Module): + """ + Implementation of pooling for PoolFormer + --pool_size: pooling size + """ + def __init__(self, pool_size=3): + super().__init__() + self.pool = nn.AvgPool2d( + pool_size, stride=1, padding=pool_size//2, count_include_pad=False) + + def forward(self, x): + return self.pool(x) - x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear) + #self.norm3 = norm_layer(dim) + #self.token_mixer = Pooling(pool_size=3) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, embed_dim=768): + super().__init__() + img_size = (tdim, fdim) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // stride, img_size[1] // stride + self.num_patches = self.H * self.W + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, + padding=(patch_size[0] // 3, patch_size[1] // 3)) + self.norm = nn.LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, H, W + + +class PyramidVisionTransformerV2(nn.Module): + def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., + attn_drop_rate=0., drop_path_rate=0.1, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], num_stages=2, linear=False, pretrained=None): + super().__init__() + # self.num_classes = num_classes + self.depths = depths + self.num_stages = num_stages + self.linear = linear + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed(tdim=tdim if i == 0 else tdim // (2 ** (i + 1)), + fdim=fdim if i == 0 else tdim // (2 ** (i + 1)), + patch_size=7 if i == 0 else 3, + stride=stride if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + block = nn.ModuleList([Block( + dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer, + sr_ratio=sr_ratios[i], linear=linear) + for j in range(depths[i])]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, 
f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + #self.n = nn.Linear(125, 250, bias=True) + # classification head + # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity() + self.apply(self._init_weights) + self.init_weights(pretrained) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + #print(x.shape) + for blk in block: + x = blk(x, H, W) + #print(x.shape) + x = norm(x) + #if i != self.num_stages - 1: + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + #print(x.shape) + return x + + def forward(self, x): + x = self.forward_features(x) + # x = self.head(x) + + return x + +class DWConv(nn.Module): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + + +def _conv_filter(state_dict, patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = v.reshape((v.shape[0], 3, patch_size, patch_size)) + out_dict[k] = v + + return out_dict diff --git a/audio_detection/audio_infer/pytorch/pytorch_utils.py b/audio_detection/audio_infer/pytorch/pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a135b336866acc61e834e42e5aa9e9db3f7998ff --- /dev/null +++ b/audio_detection/audio_infer/pytorch/pytorch_utils.py @@ -0,0 +1,251 @@ +import numpy as np +import time +import torch +import torch.nn as nn + + +def move_data_to_device(x, device): + if 'float' in str(x.dtype): + x = torch.Tensor(x) + elif 'int' in str(x.dtype): + x = torch.LongTensor(x) + else: + return x + + return x.to(device) + + +def do_mixup(x, mixup_lambda): + """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes + (1, 3, 5, ...). + + Args: + x: (batch_size * 2, ...) + mixup_lambda: (batch_size * 2,) + + Returns: + out: (batch_size, ...) 
+ """ + out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \ + x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1) + return out + + +def append_to_dict(dict, key, value): + if key in dict.keys(): + dict[key].append(value) + else: + dict[key] = [value] + + +def forward(model, generator, return_input=False, + return_target=False): + """Forward data to a model. + + Args: + model: object + generator: object + return_input: bool + return_target: bool + + Returns: + audio_name: (audios_num,) + clipwise_output: (audios_num, classes_num) + (ifexist) segmentwise_output: (audios_num, segments_num, classes_num) + (ifexist) framewise_output: (audios_num, frames_num, classes_num) + (optional) return_input: (audios_num, segment_samples) + (optional) return_target: (audios_num, classes_num) + """ + output_dict = {} + device = next(model.parameters()).device + time1 = time.time() + + # Forward data to a model in mini-batches + for n, batch_data_dict in enumerate(generator): + print(n) + batch_waveform = move_data_to_device(batch_data_dict['waveform'], device) + + with torch.no_grad(): + model.eval() + batch_output = model(batch_waveform) + + append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name']) + + append_to_dict(output_dict, 'clipwise_output', + batch_output['clipwise_output'].data.cpu().numpy()) + + if 'segmentwise_output' in batch_output.keys(): + append_to_dict(output_dict, 'segmentwise_output', + batch_output['segmentwise_output'].data.cpu().numpy()) + + if 'framewise_output' in batch_output.keys(): + append_to_dict(output_dict, 'framewise_output', + batch_output['framewise_output'].data.cpu().numpy()) + + if return_input: + append_to_dict(output_dict, 'waveform', batch_data_dict['waveform']) + + if return_target: + if 'target' in batch_data_dict.keys(): + append_to_dict(output_dict, 'target', batch_data_dict['target']) + + if n % 10 == 0: + print(' --- Inference time: {:.3f} s / 10 iterations ---'.format( + time.time() - time1)) + time1 = time.time() + + for key in output_dict.keys(): + output_dict[key] = np.concatenate(output_dict[key], axis=0) + + return output_dict + + +def interpolate(x, ratio): + """Interpolate data in time domain. This is used to compensate the + resolution reduction in downsampling of a CNN. + + Args: + x: (batch_size, time_steps, classes_num) + ratio: int, ratio to interpolate + + Returns: + upsampled: (batch_size, time_steps * ratio, classes_num) + """ + (batch_size, time_steps, classes_num) = x.shape + upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1) + upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) + return upsampled + + +def pad_framewise_output(framewise_output, frames_num): + """Pad framewise_output to the same length as input frames. The pad value + is the same as the value of the last frame. + + Args: + framewise_output: (batch_size, frames_num, classes_num) + frames_num: int, number of frames to pad + + Outputs: + output: (batch_size, frames_num, classes_num) + """ + pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1) + """tensor for padding""" + + output = torch.cat((framewise_output, pad), dim=1) + """(batch_size, frames_num, classes_num)""" + + return output + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def count_flops(model, audio_length): + """Count flops. Code modified from others' implementation. 
+ """ + multiply_adds = True + list_conv2d=[] + def conv2d_hook(self, input, output): + batch_size, input_channels, input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + + kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1) + bias_ops = 1 if self.bias is not None else 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_height * output_width + + list_conv2d.append(flops) + + list_conv1d=[] + def conv1d_hook(self, input, output): + batch_size, input_channels, input_length = input[0].size() + output_channels, output_length = output[0].size() + + kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1) + bias_ops = 1 if self.bias is not None else 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_length + + list_conv1d.append(flops) + + list_linear=[] + def linear_hook(self, input, output): + batch_size = input[0].size(0) if input[0].dim() == 2 else 1 + + weight_ops = self.weight.nelement() * (2 if multiply_adds else 1) + bias_ops = self.bias.nelement() + + flops = batch_size * (weight_ops + bias_ops) + list_linear.append(flops) + + list_bn=[] + def bn_hook(self, input, output): + list_bn.append(input[0].nelement() * 2) + + list_relu=[] + def relu_hook(self, input, output): + list_relu.append(input[0].nelement() * 2) + + list_pooling2d=[] + def pooling2d_hook(self, input, output): + batch_size, input_channels, input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + + kernel_ops = self.kernel_size * self.kernel_size + bias_ops = 0 + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_height * output_width + + list_pooling2d.append(flops) + + list_pooling1d=[] + def pooling1d_hook(self, input, output): + batch_size, input_channels, input_length = input[0].size() + output_channels, output_length = output[0].size() + + kernel_ops = self.kernel_size[0] + bias_ops = 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_length + + list_pooling2d.append(flops) + + def foo(net): + childrens = list(net.children()) + if not childrens: + if isinstance(net, nn.Conv2d): + net.register_forward_hook(conv2d_hook) + elif isinstance(net, nn.Conv1d): + net.register_forward_hook(conv1d_hook) + elif isinstance(net, nn.Linear): + net.register_forward_hook(linear_hook) + elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d): + net.register_forward_hook(bn_hook) + elif isinstance(net, nn.ReLU): + net.register_forward_hook(relu_hook) + elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d): + net.register_forward_hook(pooling2d_hook) + elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d): + net.register_forward_hook(pooling1d_hook) + else: + print('Warning: flop of module {} is not counted!'.format(net)) + return + for c in childrens: + foo(c) + + # Register hook + foo(model) + + device = device = next(model.parameters()).device + input = torch.rand(1, audio_length).to(device) + + out = model(input) + + total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \ + sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d) + + return total_flops \ No newline at end of file diff --git a/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png 
b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png new file mode 100644 index 0000000000000000000000000000000000000000..3c2b5d8cceac7f40a4bdba8bd1a75d590b4382ee Binary files /dev/null and b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png differ diff --git a/audio_detection/audio_infer/useful_ckpts/audio_detection.pth b/audio_detection/audio_infer/useful_ckpts/audio_detection.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bc6c65802de022080d76fc07bb68a563c6d87bf --- /dev/null +++ b/audio_detection/audio_infer/useful_ckpts/audio_detection.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f909808f17d424dc29063a21953ff2be103489518a4f60a6c649d2e3e7d3e81 +size 441042195 diff --git a/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc b/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..724d543c0b401c546e16e5db5c7be6d7b1b78c8a Binary files /dev/null and b/audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc differ diff --git a/audio_detection/audio_infer/utils/config.py b/audio_detection/audio_infer/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..934be1c68f4e1562e5fcef81d2f8db131cb39b9f --- /dev/null +++ b/audio_detection/audio_infer/utils/config.py @@ -0,0 +1,94 @@ +import numpy as np +import csv + +sample_rate = 32000 +clip_samples = sample_rate * 10 # Audio clips are 10-second + +# Load label +with open('./audio_detection/audio_infer/metadata/class_labels_indices.csv', 'r') as f: + reader = csv.reader(f, delimiter=',') + lines = list(reader) + +labels = [] +ids = [] # Each label has a unique id such as "/m/068hy" +for i1 in range(1, len(lines)): + id = lines[i1][1] + label = lines[i1][2] + ids.append(id) + labels.append(label) + +classes_num = len(labels) + +lb_to_ix = {label : i for i, label in enumerate(labels)} +ix_to_lb = {i : label for i, label in enumerate(labels)} + +id_to_ix = {id : i for i, id in enumerate(ids)} +ix_to_id = {i : id for i, id in enumerate(ids)} + +full_samples_per_class = np.array([ + 937432, 16344, 7822, 10271, 2043, 14420, 733, 1511, + 1258, 424, 1751, 704, 369, 590, 1063, 1375, + 5026, 743, 853, 1648, 714, 1497, 1251, 2139, + 1093, 133, 224, 39469, 6423, 407, 1559, 4546, + 6826, 7464, 2468, 549, 4063, 334, 587, 238, + 1766, 691, 114, 2153, 236, 209, 421, 740, + 269, 959, 137, 4192, 485, 1515, 655, 274, + 69, 157, 1128, 807, 1022, 346, 98, 680, + 890, 352, 4169, 2061, 1753, 9883, 1339, 708, + 37857, 18504, 12864, 2475, 2182, 757, 3624, 677, + 1683, 3583, 444, 1780, 2364, 409, 4060, 3097, + 3143, 502, 723, 600, 230, 852, 1498, 1865, + 1879, 2429, 5498, 5430, 2139, 1761, 1051, 831, + 2401, 2258, 1672, 1711, 987, 646, 794, 25061, + 5792, 4256, 96, 8126, 2740, 752, 513, 554, + 106, 254, 1592, 556, 331, 615, 2841, 737, + 265, 1349, 358, 1731, 1115, 295, 1070, 972, + 174, 937780, 112337, 42509, 49200, 11415, 6092, 13851, + 2665, 1678, 13344, 2329, 1415, 2244, 1099, 5024, + 9872, 10948, 4409, 2732, 1211, 1289, 4807, 5136, + 1867, 16134, 14519, 3086, 19261, 6499, 4273, 2790, + 8820, 1228, 1575, 4420, 3685, 2019, 664, 324, + 513, 411, 436, 2997, 5162, 3806, 1389, 899, + 8088, 7004, 1105, 3633, 2621, 9753, 1082, 26854, + 3415, 4991, 2129, 5546, 4489, 2850, 1977, 1908, + 1719, 1106, 1049, 152, 136, 802, 488, 592, + 2081, 2712, 1665, 1128, 250, 544, 789, 2715, + 8063, 7056, 2267, 8034, 6092, 3815, 1833, 3277, + 8813, 2111, 4662, 2678, 2954, 5227, 1472, 2591, + 3714, 
1974, 1795, 4680, 3751, 6585, 2109, 36617, + 6083, 16264, 17351, 3449, 5034, 3931, 2599, 4134, + 3892, 2334, 2211, 4516, 2766, 2862, 3422, 1788, + 2544, 2403, 2892, 4042, 3460, 1516, 1972, 1563, + 1579, 2776, 1647, 4535, 3921, 1261, 6074, 2922, + 3068, 1948, 4407, 712, 1294, 1019, 1572, 3764, + 5218, 975, 1539, 6376, 1606, 6091, 1138, 1169, + 7925, 3136, 1108, 2677, 2680, 1383, 3144, 2653, + 1986, 1800, 1308, 1344, 122231, 12977, 2552, 2678, + 7824, 768, 8587, 39503, 3474, 661, 430, 193, + 1405, 1442, 3588, 6280, 10515, 785, 710, 305, + 206, 4990, 5329, 3398, 1771, 3022, 6907, 1523, + 8588, 12203, 666, 2113, 7916, 434, 1636, 5185, + 1062, 664, 952, 3490, 2811, 2749, 2848, 15555, + 363, 117, 1494, 1647, 5886, 4021, 633, 1013, + 5951, 11343, 2324, 243, 372, 943, 734, 242, + 3161, 122, 127, 201, 1654, 768, 134, 1467, + 642, 1148, 2156, 1368, 1176, 302, 1909, 61, + 223, 1812, 287, 422, 311, 228, 748, 230, + 1876, 539, 1814, 737, 689, 1140, 591, 943, + 353, 289, 198, 490, 7938, 1841, 850, 457, + 814, 146, 551, 728, 1627, 620, 648, 1621, + 2731, 535, 88, 1736, 736, 328, 293, 3170, + 344, 384, 7640, 433, 215, 715, 626, 128, + 3059, 1833, 2069, 3732, 1640, 1508, 836, 567, + 2837, 1151, 2068, 695, 1494, 3173, 364, 88, + 188, 740, 677, 273, 1533, 821, 1091, 293, + 647, 318, 1202, 328, 532, 2847, 526, 721, + 370, 258, 956, 1269, 1641, 339, 1322, 4485, + 286, 1874, 277, 757, 1393, 1330, 380, 146, + 377, 394, 318, 339, 1477, 1886, 101, 1435, + 284, 1425, 686, 621, 221, 117, 87, 1340, + 201, 1243, 1222, 651, 1899, 421, 712, 1016, + 1279, 124, 351, 258, 7043, 368, 666, 162, + 7664, 137, 70159, 26179, 6321, 32236, 33320, 771, + 1169, 269, 1103, 444, 364, 2710, 121, 751, + 1609, 855, 1141, 2287, 1940, 3943, 289]) diff --git a/audio_detection/audio_infer/utils/crash.py b/audio_detection/audio_infer/utils/crash.py new file mode 100644 index 0000000000000000000000000000000000000000..98a06e20bc793687ec259e23c8b9e503887b34f5 --- /dev/null +++ b/audio_detection/audio_infer/utils/crash.py @@ -0,0 +1,12 @@ +import sys + +class ExceptionHook: + instance = None + def __call__(self, *args, **kwargs): + if self.instance is None: + from IPython.core import ultratb + self.instance = ultratb.FormattedTB(mode='Plain', + color_scheme='Linux', call_pdb=1) + return self.instance(*args, **kwargs) + +sys.excepthook = ExceptionHook() diff --git a/audio_detection/audio_infer/utils/create_black_list.py b/audio_detection/audio_infer/utils/create_black_list.py new file mode 100644 index 0000000000000000000000000000000000000000..fadbe94599997e3476f37f8c4cdd30ca86a8720e --- /dev/null +++ b/audio_detection/audio_infer/utils/create_black_list.py @@ -0,0 +1,64 @@ +import argparse +import csv +import os + +from utilities import create_folder + + +def dcase2017task4(args): + """Create black list. Black list is a list of audio ids that will be + skipped in training. 
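+
+    Usage (illustrative):
+        python utils/create_black_list.py dcase2017task4 --workspace=<workspace>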
+ """ + + # Augments & parameters + workspace = args.workspace + + # Black list from DCASE 2017 Task 4 + test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv' + evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv' + + black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv') + create_folder(os.path.dirname(black_list_csv)) + + def get_id_sets(csv_path): + with open(csv_path, 'r') as fr: + reader = csv.reader(fr, delimiter='\t') + lines = list(reader) + + ids_set = [] + + for line in lines: + """line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']""" + ids_set.append(line[0][0 : 11]) + + ids_set = list(set(ids_set)) + return ids_set + + test_ids_set = get_id_sets(test_weak_csv) + evaluation_ids_set = get_id_sets(evaluation_weak_csv) + + full_ids_set = test_ids_set + evaluation_ids_set + + # Write black list + fw = open(black_list_csv, 'w') + + for id in full_ids_set: + fw.write('{}\n'.format(id)) + + print('Write black list to {}'.format(black_list_csv)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + subparsers = parser.add_subparsers(dest='mode') + + parser_dcase2017task4 = subparsers.add_parser('dcase2017task4') + parser_dcase2017task4.add_argument('--workspace', type=str, required=True) + + args = parser.parse_args() + + if args.mode == 'dcase2017task4': + dcase2017task4(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/create_indexes.py b/audio_detection/audio_infer/utils/create_indexes.py new file mode 100644 index 0000000000000000000000000000000000000000..78be38cb3c693fa9ef7b44c52c407640e9e32aab --- /dev/null +++ b/audio_detection/audio_infer/utils/create_indexes.py @@ -0,0 +1,126 @@ +import numpy as np +import argparse +import csv +import os +import glob +import datetime +import time +import logging +import h5py +import librosa + +from utilities import create_folder, get_sub_filepaths +import config + + +def create_indexes(args): + """Create indexes a for dataloader to read for training. When users have + a new task and their own data, they need to create similar indexes. The + indexes contain meta information of "where to find the data for training". + """ + + # Arguments & parameters + waveforms_hdf5_path = args.waveforms_hdf5_path + indexes_hdf5_path = args.indexes_hdf5_path + + # Paths + create_folder(os.path.dirname(indexes_hdf5_path)) + + with h5py.File(waveforms_hdf5_path, 'r') as hr: + with h5py.File(indexes_hdf5_path, 'w') as hw: + audios_num = len(hr['audio_name']) + hw.create_dataset('audio_name', data=hr['audio_name'][:], dtype='S20') + hw.create_dataset('target', data=hr['target'][:], dtype=np.bool) + hw.create_dataset('hdf5_path', data=[waveforms_hdf5_path.encode()] * audios_num, dtype='S200') + hw.create_dataset('index_in_hdf5', data=np.arange(audios_num), dtype=np.int32) + + print('Write to {}'.format(indexes_hdf5_path)) + + +def combine_full_indexes(args): + """Combine all balanced and unbalanced indexes hdf5s to a single hdf5. This + combined indexes hdf5 is used for training with full data (~20k balanced + audio clips + ~1.9m unbalanced audio clips). 
+ """ + + # Arguments & parameters + indexes_hdf5s_dir = args.indexes_hdf5s_dir + full_indexes_hdf5_path = args.full_indexes_hdf5_path + + classes_num = config.classes_num + + # Paths + paths = get_sub_filepaths(indexes_hdf5s_dir) + paths = [path for path in paths if ( + 'train' in path and 'full_train' not in path and 'mini' not in path)] + + print('Total {} hdf5 to combine.'.format(len(paths))) + + with h5py.File(full_indexes_hdf5_path, 'w') as full_hf: + full_hf.create_dataset( + name='audio_name', + shape=(0,), + maxshape=(None,), + dtype='S20') + + full_hf.create_dataset( + name='target', + shape=(0, classes_num), + maxshape=(None, classes_num), + dtype=np.bool) + + full_hf.create_dataset( + name='hdf5_path', + shape=(0,), + maxshape=(None,), + dtype='S200') + + full_hf.create_dataset( + name='index_in_hdf5', + shape=(0,), + maxshape=(None,), + dtype=np.int32) + + for path in paths: + with h5py.File(path, 'r') as part_hf: + print(path) + n = len(full_hf['audio_name'][:]) + new_n = n + len(part_hf['audio_name'][:]) + + full_hf['audio_name'].resize((new_n,)) + full_hf['audio_name'][n : new_n] = part_hf['audio_name'][:] + + full_hf['target'].resize((new_n, classes_num)) + full_hf['target'][n : new_n] = part_hf['target'][:] + + full_hf['hdf5_path'].resize((new_n,)) + full_hf['hdf5_path'][n : new_n] = part_hf['hdf5_path'][:] + + full_hf['index_in_hdf5'].resize((new_n,)) + full_hf['index_in_hdf5'][n : new_n] = part_hf['index_in_hdf5'][:] + + print('Write combined full hdf5 to {}'.format(full_indexes_hdf5_path)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest='mode') + + parser_create_indexes = subparsers.add_parser('create_indexes') + parser_create_indexes.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path of packed waveforms hdf5.') + parser_create_indexes.add_argument('--indexes_hdf5_path', type=str, required=True, help='Path to write out indexes hdf5.') + + parser_combine_full_indexes = subparsers.add_parser('combine_full_indexes') + parser_combine_full_indexes.add_argument('--indexes_hdf5s_dir', type=str, required=True, help='Directory containing indexes hdf5s to be combined.') + parser_combine_full_indexes.add_argument('--full_indexes_hdf5_path', type=str, required=True, help='Path to write out full indexes hdf5 file.') + + args = parser.parse_args() + + if args.mode == 'create_indexes': + create_indexes(args) + + elif args.mode == 'combine_full_indexes': + combine_full_indexes(args) + + else: + raise Exception('Incorrect arguments!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/data_generator.py b/audio_detection/audio_infer/utils/data_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..b94b6d990b6726c791cbb4cb660abdb93233f965 --- /dev/null +++ b/audio_detection/audio_infer/utils/data_generator.py @@ -0,0 +1,421 @@ +import numpy as np +import h5py +import csv +import time +import logging + +from utilities import int16_to_float32 + + +def read_black_list(black_list_csv): + """Read audio names from black list. + """ + with open(black_list_csv, 'r') as fr: + reader = csv.reader(fr) + lines = list(reader) + + black_list_names = ['Y{}.wav'.format(line[0]) for line in lines] + return black_list_names + + +class AudioSetDataset(object): + def __init__(self, sample_rate=32000): + """This class takes the meta of an audio clip as input, and return + the waveform and target of the audio clip. This class is used by DataLoader. 
+ """ + self.sample_rate = sample_rate + + def __getitem__(self, meta): + """Load waveform and target of an audio clip. + + Args: + meta: { + 'hdf5_path': str, + 'index_in_hdf5': int} + + Returns: + data_dict: { + 'audio_name': str, + 'waveform': (clip_samples,), + 'target': (classes_num,)} + """ + hdf5_path = meta['hdf5_path'] + index_in_hdf5 = meta['index_in_hdf5'] + with h5py.File(hdf5_path, 'r') as hf: + audio_name = hf['audio_name'][index_in_hdf5].decode() + waveform = int16_to_float32(hf['waveform'][index_in_hdf5]) + waveform = self.resample(waveform) + target = hf['target'][index_in_hdf5].astype(np.float32) + + data_dict = { + 'audio_name': audio_name, 'waveform': waveform, 'target': target} + + return data_dict + + def resample(self, waveform): + """Resample. + + Args: + waveform: (clip_samples,) + + Returns: + (resampled_clip_samples,) + """ + if self.sample_rate == 32000: + return waveform + elif self.sample_rate == 16000: + return waveform[0 :: 2] + elif self.sample_rate == 8000: + return waveform[0 :: 4] + else: + raise Exception('Incorrect sample rate!') + + +class Base(object): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, random_seed): + """Base class of train sampler. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + self.batch_size = batch_size + self.random_state = np.random.RandomState(random_seed) + + # Black list + if black_list_csv: + self.black_list_names = read_black_list(black_list_csv) + else: + self.black_list_names = [] + + logging.info('Black list samples: {}'.format(len(self.black_list_names))) + + # Load target + load_time = time.time() + + with h5py.File(indexes_hdf5_path, 'r') as hf: + self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]] + self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]] + self.indexes_in_hdf5 = hf['index_in_hdf5'][:] + self.targets = hf['target'][:].astype(np.float32) + + (self.audios_num, self.classes_num) = self.targets.shape + logging.info('Training number: {}'.format(self.audios_num)) + logging.info('Load target time: {:.3f} s'.format(time.time() - load_time)) + + +class TrainSampler(Base): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None, + random_seed=1234): + """Balanced sampler. Generate batch meta for training. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + super(TrainSampler, self).__init__(indexes_hdf5_path, batch_size, + black_list_csv, random_seed) + + self.indexes = np.arange(self.audios_num) + + # Shuffle indexes + self.random_state.shuffle(self.indexes) + + self.pointer = 0 + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, 'index_in_hdf5': int}, + ...] 
+ """ + batch_size = self.batch_size + + while True: + batch_meta = [] + i = 0 + while i < batch_size: + index = self.indexes[self.pointer] + self.pointer += 1 + + # Shuffle indexes and reset pointer + if self.pointer >= self.audios_num: + self.pointer = 0 + self.random_state.shuffle(self.indexes) + + # If audio in black list then continue + if self.audio_names[index] in self.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.hdf5_paths[index], + 'index_in_hdf5': self.indexes_in_hdf5[index]}) + i += 1 + + yield batch_meta + + def state_dict(self): + state = { + 'indexes': self.indexes, + 'pointer': self.pointer} + return state + + def load_state_dict(self, state): + self.indexes = state['indexes'] + self.pointer = state['pointer'] + + +class BalancedTrainSampler(Base): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None, + random_seed=1234): + """Balanced sampler. Generate batch meta for training. Data are equally + sampled from different sound classes. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + super(BalancedTrainSampler, self).__init__(indexes_hdf5_path, + batch_size, black_list_csv, random_seed) + + self.samples_num_per_class = np.sum(self.targets, axis=0) + logging.info('samples_num_per_class: {}'.format( + self.samples_num_per_class.astype(np.int32))) + + # Training indexes of all sound classes. E.g.: + # [[0, 11, 12, ...], [3, 4, 15, 16, ...], [7, 8, ...], ...] + self.indexes_per_class = [] + + for k in range(self.classes_num): + self.indexes_per_class.append( + np.where(self.targets[:, k] == 1)[0]) + + # Shuffle indexes + for k in range(self.classes_num): + self.random_state.shuffle(self.indexes_per_class[k]) + + self.queue = [] + self.pointers_of_classes = [0] * self.classes_num + + def expand_queue(self, queue): + classes_set = np.arange(self.classes_num).tolist() + self.random_state.shuffle(classes_set) + queue += classes_set + return queue + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, 'index_in_hdf5': int}, + ...] 
+ """ + batch_size = self.batch_size + + while True: + batch_meta = [] + i = 0 + while i < batch_size: + if len(self.queue) == 0: + self.queue = self.expand_queue(self.queue) + + class_id = self.queue.pop(0) + pointer = self.pointers_of_classes[class_id] + self.pointers_of_classes[class_id] += 1 + index = self.indexes_per_class[class_id][pointer] + + # When finish one epoch of a sound class, then shuffle its indexes and reset pointer + if self.pointers_of_classes[class_id] >= self.samples_num_per_class[class_id]: + self.pointers_of_classes[class_id] = 0 + self.random_state.shuffle(self.indexes_per_class[class_id]) + + # If audio in black list then continue + if self.audio_names[index] in self.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.hdf5_paths[index], + 'index_in_hdf5': self.indexes_in_hdf5[index]}) + i += 1 + + yield batch_meta + + def state_dict(self): + state = { + 'indexes_per_class': self.indexes_per_class, + 'queue': self.queue, + 'pointers_of_classes': self.pointers_of_classes} + return state + + def load_state_dict(self, state): + self.indexes_per_class = state['indexes_per_class'] + self.queue = state['queue'] + self.pointers_of_classes = state['pointers_of_classes'] + + +class AlternateTrainSampler(Base): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None, + random_seed=1234): + """AlternateSampler is a combination of Sampler and Balanced Sampler. + AlternateSampler alternately sample data from Sampler and Blanced Sampler. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + self.sampler1 = TrainSampler(indexes_hdf5_path, batch_size, + black_list_csv, random_seed) + + self.sampler2 = BalancedTrainSampler(indexes_hdf5_path, batch_size, + black_list_csv, random_seed) + + self.batch_size = batch_size + self.count = 0 + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, 'index_in_hdf5': int}, + ...] 
+ """ + batch_size = self.batch_size + + while True: + self.count += 1 + + if self.count % 2 == 0: + batch_meta = [] + i = 0 + while i < batch_size: + index = self.sampler1.indexes[self.sampler1.pointer] + self.sampler1.pointer += 1 + + # Shuffle indexes and reset pointer + if self.sampler1.pointer >= self.sampler1.audios_num: + self.sampler1.pointer = 0 + self.sampler1.random_state.shuffle(self.sampler1.indexes) + + # If audio in black list then continue + if self.sampler1.audio_names[index] in self.sampler1.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.sampler1.hdf5_paths[index], + 'index_in_hdf5': self.sampler1.indexes_in_hdf5[index]}) + i += 1 + + elif self.count % 2 == 1: + batch_meta = [] + i = 0 + while i < batch_size: + if len(self.sampler2.queue) == 0: + self.sampler2.queue = self.sampler2.expand_queue(self.sampler2.queue) + + class_id = self.sampler2.queue.pop(0) + pointer = self.sampler2.pointers_of_classes[class_id] + self.sampler2.pointers_of_classes[class_id] += 1 + index = self.sampler2.indexes_per_class[class_id][pointer] + + # When finish one epoch of a sound class, then shuffle its indexes and reset pointer + if self.sampler2.pointers_of_classes[class_id] >= self.sampler2.samples_num_per_class[class_id]: + self.sampler2.pointers_of_classes[class_id] = 0 + self.sampler2.random_state.shuffle(self.sampler2.indexes_per_class[class_id]) + + # If audio in black list then continue + if self.sampler2.audio_names[index] in self.sampler2.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.sampler2.hdf5_paths[index], + 'index_in_hdf5': self.sampler2.indexes_in_hdf5[index]}) + i += 1 + + yield batch_meta + + def state_dict(self): + state = { + 'sampler1': self.sampler1.state_dict(), + 'sampler2': self.sampler2.state_dict()} + return state + + def load_state_dict(self, state): + self.sampler1.load_state_dict(state['sampler1']) + self.sampler2.load_state_dict(state['sampler2']) + + +class EvaluateSampler(object): + def __init__(self, indexes_hdf5_path, batch_size): + """Evaluate sampler. Generate batch meta for evaluation. + + Args: + indexes_hdf5_path: string + batch_size: int + """ + self.batch_size = batch_size + + with h5py.File(indexes_hdf5_path, 'r') as hf: + self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]] + self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]] + self.indexes_in_hdf5 = hf['index_in_hdf5'][:] + self.targets = hf['target'][:].astype(np.float32) + + self.audios_num = len(self.audio_names) + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, + 'index_in_hdf5': int} + ...] + """ + batch_size = self.batch_size + pointer = 0 + + while pointer < self.audios_num: + batch_indexes = np.arange(pointer, + min(pointer + batch_size, self.audios_num)) + + batch_meta = [] + + for index in batch_indexes: + batch_meta.append({ + 'audio_name': self.audio_names[index], + 'hdf5_path': self.hdf5_paths[index], + 'index_in_hdf5': self.indexes_in_hdf5[index], + 'target': self.targets[index]}) + + pointer += batch_size + yield batch_meta + + +def collate_fn(list_data_dict): + """Collate data. + Args: + list_data_dict, e.g., [{'audio_name': str, 'waveform': (clip_samples,), ...}, + {'audio_name': str, 'waveform': (clip_samples,), ...}, + ...] 
+ Returns: + np_data_dict, dict, e.g., + {'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...} + """ + np_data_dict = {} + + for key in list_data_dict[0].keys(): + np_data_dict[key] = np.array([data_dict[key] for data_dict in list_data_dict]) + + return np_data_dict \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/dataset.py b/audio_detection/audio_infer/utils/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f11755a027de97d236e447e576c4b1ed4e8a36 --- /dev/null +++ b/audio_detection/audio_infer/utils/dataset.py @@ -0,0 +1,224 @@ +import numpy as np +import argparse +import csv +import os +import glob +import datetime +import time +import logging +import h5py +import librosa + +from utilities import (create_folder, get_filename, create_logging, + float32_to_int16, pad_or_truncate, read_metadata) +import config + + +def split_unbalanced_csv_to_partial_csvs(args): + """Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids. + """ + + unbalanced_csv_path = args.unbalanced_csv + unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir + + create_folder(unbalanced_partial_csvs_dir) + + with open(unbalanced_csv_path, 'r') as f: + lines = f.readlines() + + lines = lines[3:] # Remove head info + audios_num_per_file = 50000 + + files_num = int(np.ceil(len(lines) / float(audios_num_per_file))) + + for r in range(files_num): + lines_per_file = lines[r * audios_num_per_file : + (r + 1) * audios_num_per_file] + + out_csv_path = os.path.join(unbalanced_partial_csvs_dir, + 'unbalanced_train_segments_part{:02d}.csv'.format(r)) + + with open(out_csv_path, 'w') as f: + f.write('empty\n') + f.write('empty\n') + f.write('empty\n') + for line in lines_per_file: + f.write(line) + + print('Write out csv to {}'.format(out_csv_path)) + + +def download_wavs(args): + """Download videos and extract audio in wav format. 
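+
+    Usage (illustrative; requires youtube-dl and ffmpeg on the PATH):
+        python utils/dataset.py download_wavs --csv_path=<segments csv> \
+            --audios_dir=<output wav dir>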
+ """ + + # Paths + csv_path = args.csv_path + audios_dir = args.audios_dir + mini_data = args.mini_data + + if mini_data: + logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path)) + else: + logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path)) + + create_folder(audios_dir) + create_folder(logs_dir) + create_logging(logs_dir, filemode='w') + logging.info('Download log is saved to {}'.format(logs_dir)) + + # Read csv + with open(csv_path, 'r') as f: + lines = f.readlines() + + lines = lines[3:] # Remove csv head info + + if mini_data: + lines = lines[0 : 10] # Download partial data for debug + + download_time = time.time() + + # Download + for (n, line) in enumerate(lines): + + items = line.split(', ') + audio_id = items[0] + start_time = float(items[1]) + end_time = float(items[2]) + duration = end_time - start_time + + logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format( + n, audio_id, start_time, end_time)) + + # Download full video of whatever format + video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id)) + os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\ + .format(video_name, audio_id)) + + video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*')) + + # If download successful + if len(video_paths) > 0: + video_path = video_paths[0] # Choose one video + + # Add 'Y' to the head because some video ids are started with '-' + # which will cause problem + audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav') + + # Extract audio in wav format + os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\ + .format(video_path, + str(datetime.timedelta(seconds=start_time)), duration, + audio_path)) + + # Remove downloaded video + os.system("rm {}".format(video_path)) + + logging.info("Download and convert to {}".format(audio_path)) + + logging.info('Download finished! Time spent: {:.3f} s'.format( + time.time() - download_time)) + + logging.info('Logs can be viewed in {}'.format(logs_dir)) + + +def pack_waveforms_to_hdf5(args): + """Pack waveform and target of several audio clips to a single hdf5 file. + This can speed up loading and training. 
+ """ + + # Arguments & parameters + audios_dir = args.audios_dir + csv_path = args.csv_path + waveforms_hdf5_path = args.waveforms_hdf5_path + mini_data = args.mini_data + + clip_samples = config.clip_samples + classes_num = config.classes_num + sample_rate = config.sample_rate + id_to_ix = config.id_to_ix + + # Paths + if mini_data: + prefix = 'mini_' + waveforms_hdf5_path += '.mini' + else: + prefix = '' + + create_folder(os.path.dirname(waveforms_hdf5_path)) + + logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path)) + create_folder(logs_dir) + create_logging(logs_dir, filemode='w') + logging.info('Write logs to {}'.format(logs_dir)) + + # Read csv file + meta_dict = read_metadata(csv_path, classes_num, id_to_ix) + + if mini_data: + mini_num = 10 + for key in meta_dict.keys(): + meta_dict[key] = meta_dict[key][0 : mini_num] + + audios_num = len(meta_dict['audio_name']) + + # Pack waveform to hdf5 + total_time = time.time() + + with h5py.File(waveforms_hdf5_path, 'w') as hf: + hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20') + hf.create_dataset('waveform', shape=((audios_num, clip_samples)), dtype=np.int16) + hf.create_dataset('target', shape=((audios_num, classes_num)), dtype=np.bool) + hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32) + + # Pack waveform & target of several audio clips to a single hdf5 file + for n in range(audios_num): + audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n]) + + if os.path.isfile(audio_path): + logging.info('{} {}'.format(n, audio_path)) + (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True) + audio = pad_or_truncate(audio, clip_samples) + + hf['audio_name'][n] = meta_dict['audio_name'][n].encode() + hf['waveform'][n] = float32_to_int16(audio) + hf['target'][n] = meta_dict['target'][n] + else: + logging.info('{} File does not exist! 
{}'.format(n, audio_path)) + + logging.info('Write to {}'.format(waveforms_hdf5_path)) + logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest='mode') + + parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs') + parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.') + parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.') + + parser_download_wavs = subparsers.add_parser('download_wavs') + parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.') + parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.') + parser_download_wavs.add_argument('--mini_data', action='store_true', default=True, help='Set true to only download 10 audios for debugging.') + + parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5') + parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.') + parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.') + parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.') + parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.') + + args = parser.parse_args() + + if args.mode == 'split_unbalanced_csv_to_partial_csvs': + split_unbalanced_csv_to_partial_csvs(args) + + elif args.mode == 'download_wavs': + download_wavs(args) + + elif args.mode == 'pack_waveforms_to_hdf5': + pack_waveforms_to_hdf5(args) + + else: + raise Exception('Incorrect arguments!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/plot_for_paper.py b/audio_detection/audio_infer/utils/plot_for_paper.py new file mode 100644 index 0000000000000000000000000000000000000000..25e799a7e7eea9ffc5bced214a8beb0a558842eb --- /dev/null +++ b/audio_detection/audio_infer/utils/plot_for_paper.py @@ -0,0 +1,565 @@ +import os +import sys +import numpy as np +import argparse +import h5py +import time +import pickle +import matplotlib.pyplot as plt +import csv +from sklearn import metrics + +from utilities import (create_folder, get_filename, d_prime) +import config + + +def load_statistics(statistics_path): + statistics_dict = pickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + + return bal_map, test_map + + +def crop_label(label): + max_len = 16 + if len(label) <= max_len: + return label + else: + words = label.split(' ') + cropped_label = '' + for w in words: + if len(cropped_label + ' ' + w) > max_len: + break + else: + cropped_label += ' {}'.format(w) + return cropped_label + + +def add_comma(integer): + """E.g., 1234567 -> 1,234,567 + """ + integer = int(integer) + if integer >= 1000: + return str(integer // 1000) + ',' + str(integer % 1000) + else: + return str(integer) + + 
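+# e.g. add_comma(937432) -> '937,432'; crop_label shortens long class names to
+# roughly 16 characters at a word boundary. Both are used for legend labels below.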
+def plot_classwise_iteration_map(args): + + # Paths + save_out_path = 'results/classwise_iteration_map.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Load statistics + statistics_dict = pickle.load(open('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl', 'rb')) + + mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']]) + mAP_mat = mAP_mat[0 : 300, :] # 300 * 2000 = 600k iterations + sorted_indexes = np.argsort(config.full_samples_per_class)[::-1] + + fig, axs = plt.subplots(1, 3, figsize=(20, 5)) + ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)] + axs[0].set_ylabel('AP') + + for col in range(0, 3): + axs[col].set_ylim(0, 1.) + axs[col].set_xlim(0, 301) + axs[col].set_xlabel('Iterations') + axs[col].set_ylabel('AP') + axs[col].xaxis.set_ticks(np.arange(0, 301, 100)) + axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k']) + lines = [] + for _ix in ranges[col]: + _label = crop_label(config.labels[sorted_indexes[_ix]]) + \ + ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]])) + line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label) + lines.append(line) + box = axs[col].get_position() + axs[col].set_position([box.x0, box.y0, box.width * 1., box.height]) + axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.)) + axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(pad=4, w_pad=1, h_pad=1) + plt.savefig(save_out_path) + print(save_out_path) + + +def plot_six_figures(args): + + # Arguments & parameters + classes_num = config.classes_num + labels = config.labels + max_plot_iteration = 540000 + iterations = np.arange(0, max_plot_iteration, 2000) + + # Paths + class_labels_indices_path = os.path.join('metadata', 'class_labels_indices.csv') + save_out_path = 'results/six_figures.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Plot + fig, ax = plt.subplots(2, 3, figsize=(14, 7)) + bal_alpha = 0.3 + test_alpha = 1.0 + linewidth = 1. 
+ + # (a) Comparison of architectures + if True: + lines = [] + + # Wavegram-Logmel-CNN + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl') + line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # MobileNetV1 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_MobileNetV1_balanced_mixup_bs32.pkl') + line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 0].legend(handles=lines, loc=2) + ax[0, 0].set_title('(a) Comparison of architectures') + + # (b) Comparison of training data and augmentation' + if True: + lines = [] + + # Full data + balanced sampler + mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Full data + balanced sampler + mixup in time domain + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_timedomain_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Full data + balanced sampler + no mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_nomixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Full data + uniform sampler + no mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_nobalanced_nomixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Balanced data + balanced sampler + mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, 
linewidth=linewidth) + lines.append(line) + + # Balanced data + balanced sampler + no mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_nomixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 1].legend(handles=lines, loc=2, fontsize=8) + ax[0, 1].set_title('(b) Comparison of training data and augmentation') + + # (c) Comparison of embedding size + if True: + lines = [] + + # Embedding size 2048 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Embedding size 128 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb128_balanced_mixup_bs32.pkl') + line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Embedding size 32 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb32_balanced_mixup_bs32.pkl') + line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 2].legend(handles=lines, loc=2) + ax[0, 2].set_title('(c) Comparison of embedding size') + + # (d) Comparison of amount of training data + if True: + lines = [] + + # 100% of full training data + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # 80% of full training data + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.8full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # 50% of full training data + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.5full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='cnn14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 0].legend(handles=lines, loc=2) + ax[1, 0].set_title('(d) Comparison of amount of training data') + + # (e) Comparison of sampling rate + if True: + lines = [] + + # Cnn14 + 32 kHz + (bal_map, test_map) = 
load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 16 kHz + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_16k_balanced_mixup_bs32.pkl') + line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 8 kHz + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_8k_balanced_mixup_bs32.pkl') + line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 1].legend(handles=lines, loc=2) + ax[1, 1].set_title('(e) Comparison of sampling rate') + + # (f) Comparison of mel bins number + if True: + lines = [] + + # Cnn14 + 128 mel bins + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel128_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 64 mel bins + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 32 mel bins + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel32_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 2].legend(handles=lines, loc=2) + ax[1, 2].set_title('(f) Comparison of mel bins number') + + for i in range(2): + for j in range(3): + ax[i, j].set_ylim(0, 0.8) + ax[i, j].set_xlim(0, len(iterations)) + ax[i, j].set_xlabel('Iterations') + ax[i, j].set_ylabel('mAP') + ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50)) + ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k']) + ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05)) + ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', + '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8']) + ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(0, 1, 0) + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def plot_complexity_map(args): + + # Paths + save_out_path = 'results/complexity_mAP.pdf' + create_folder(os.path.dirname(save_out_path)) + + plt.figure(figsize=(5, 5)) + fig, ax = plt.subplots(1, 1) + + model_types = 
np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
+        'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
+        'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
+    flops = np.array([21.986, 28.166, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
+        30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
+    mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
+        0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
+
+    ax.scatter(flops, mAPs)
+
+    shift = [[-5.5, -0.004], [1, -0.004], [-1, -0.014], [-2, 0.006], [-7, 0.006],
+        [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
+        [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
+
+    for i, model_type in enumerate(model_types):
+        ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
+
+    ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
+    ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
+    ax.plot(flops[[6, 7]], mAPs[[6, 7]])
+    ax.plot(flops[[9, 10]], mAPs[[9, 10]])
+    ax.plot(flops[[11, 12]], mAPs[[11, 12]])
+    ax.plot(flops[[13, 14]], mAPs[[13, 14]])
+
+    ax.set_xlim(0, 70)
+    ax.set_ylim(0.2, 0.5)
+    ax.set_xlabel('Multi-adds (million)', fontsize=15)
+    ax.set_ylabel('mAP', fontsize=15)
+    ax.tick_params(axis='x', labelsize=12)
+    ax.tick_params(axis='y', labelsize=12)
+
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+
+    plt.savefig(save_out_path)
+    print('Write out figure to {}'.format(save_out_path))
+
+
+def plot_long_fig(args):
+
+    # Paths
+    stats = pickle.load(open('paper_statistics/stats_for_long_fig.pkl', 'rb'))
+
+    save_out_path = 'results/long_fig.pdf'
+    create_folder(os.path.dirname(save_out_path))
+
+    # Load meta
+    N = len(config.labels)
+    sorted_indexes = stats['sorted_indexes_for_plot']
+    sorted_labels = np.array(config.labels)[sorted_indexes]
+    audio_clips_per_class = stats['official_balanced_training_samples'] + stats['official_unbalanced_training_samples']
+    audio_clips_per_class = audio_clips_per_class[sorted_indexes]
+
+    # Prepare axes for plot
+    (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
+
+    # plot the number of training samples
+    ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+    ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+    ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+    ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+
+    # Load mAP of different systems
+    """Average instance system of [1] with an mAP of 0.317.
+    [1] Kong, Qiuqiang, Changsong Yu, Yong Xu, Turab Iqbal, Wenwu Wang, and
+    Mark D. Plumbley. "Weakly labelled audioset tagging with attention neural
+    networks." IEEE/ACM Transactions on Audio, Speech, and Language Processing
+    27, no. 
11 (2019): 1791-1802.""" + maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision'] + maps_avg_instances = maps_avg_instances[sorted_indexes] + + # PANNs Cnn14 + maps_panns_cnn14 = stats['panns_cnn14']['eval']['average_precision'] + maps_panns_cnn14 = maps_panns_cnn14[sorted_indexes] + + # PANNs MobileNetV1 + maps_panns_mobilenetv1 = stats['panns_mobilenetv1']['eval']['average_precision'] + maps_panns_mobilenetv1 = maps_panns_mobilenetv1[sorted_indexes] + + # PANNs Wavegram-Logmel-Cnn14 + maps_panns_wavegram_logmel_cnn14 = stats['panns_wavegram_logmel_cnn14']['eval']['average_precision'] + maps_panns_wavegram_logmel_cnn14 = maps_panns_wavegram_logmel_cnn14[sorted_indexes] + + # Plot mAPs + _scatter_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='g') + _scatter_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='r') + _scatter_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b') + _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k') + + linewidth = 0.7 + line0te = _plot_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, + c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN') + line1te = _plot_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, c='r', + linewidth=linewidth, label='AP with CNN14') + line2te = _plot_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b', + linewidth=linewidth, label='AP with MobileNetV1') + line3te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k', + linewidth=linewidth, label='AP with averaging instances (baseline)') + + # Plot label quality + label_quality = stats['label_quality'] + sorted_label_quality = np.array(label_quality)[sorted_indexes] + for k in range(len(sorted_label_quality)): + if sorted_label_quality[k] and sorted_label_quality[k] == 1: + sorted_label_quality[k] = 0.99 + + ax1b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+') + ax2b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+') + ax3b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+') + line_label_quality = ax4b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality') + ax1b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + ax2b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + ax3b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + ax4b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + + plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1) + plt.tight_layout(0, 0, 0) + plt.savefig(save_out_path) + print('Save fig to {}'.format(save_out_path)) + + +def prepare_plot_long_4_rows(sorted_lbs): + N = len(sorted_lbs) + + f,(ax1a, ax2a, ax3a, ax4a) = 
plt.subplots(4, 1, sharey=False, facecolor='w', figsize=(10, 10.5)) + + fontsize = 5 + + K = 132 + ax1a.set_xlim(0, K) + ax2a.set_xlim(K, 2 * K) + ax3a.set_xlim(2 * K, 3 * K) + ax4a.set_xlim(3 * K, N) + + truncated_sorted_lbs = [] + for lb in sorted_lbs: + lb = lb[0 : 25] + words = lb.split(' ') + if len(words[-1]) < 3: + lb = ' '.join(words[0:-1]) + truncated_sorted_lbs.append(lb) + + ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + + ax1a.set_yscale('log') + ax2a.set_yscale('log') + ax3a.set_yscale('log') + ax4a.set_yscale('log') + + ax1b = ax1a.twinx() + ax2b = ax2a.twinx() + ax3b = ax3a.twinx() + ax4b = ax4a.twinx() + ax1b.set_ylim(0., 1.) + ax2b.set_ylim(0., 1.) + ax3b.set_ylim(0., 1.) + ax4b.set_ylim(0., 1.) + ax1b.set_ylabel('Average precision') + ax2b.set_ylabel('Average precision') + ax3b.set_ylabel('Average precision') + ax4b.set_ylabel('Average precision') + + ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + + ax1a.xaxis.set_ticks(np.arange(K)) + ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize) + ax1a.xaxis.tick_bottom() + ax1a.set_ylabel("Number of audio clips") + + ax2a.xaxis.set_ticks(np.arange(K, 2*K)) + ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize) + ax2a.xaxis.tick_bottom() + ax2a.set_ylabel("Number of audio clips") + + ax3a.xaxis.set_ticks(np.arange(2*K, 3*K)) + ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize) + ax3a.xaxis.tick_bottom() + ax3a.set_ylabel("Number of audio clips") + + ax4a.xaxis.set_ticks(np.arange(3*K, N)) + ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize) + ax4a.xaxis.tick_bottom() + ax4a.set_ylabel("Number of audio clips") + + ax1a.spines['right'].set_visible(False) + ax1b.spines['right'].set_visible(False) + ax2a.spines['left'].set_visible(False) + ax2b.spines['left'].set_visible(False) + ax2a.spines['right'].set_visible(False) + ax2b.spines['right'].set_visible(False) + ax3a.spines['left'].set_visible(False) + ax3b.spines['left'].set_visible(False) + ax3a.spines['right'].set_visible(False) + ax3b.spines['right'].set_visible(False) + ax4a.spines['left'].set_visible(False) + ax4b.spines['left'].set_visible(False) + + plt.subplots_adjust(hspace = 0.8) + + return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b + + +def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.): + N = len(x) + ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + + +def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""): + N = len(x) + ax.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha) + line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label) + return line + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='') + subparsers = 
parser.add_subparsers(dest='mode') + + parser_classwise_iteration_map = subparsers.add_parser('plot_classwise_iteration_map') + parser_six_figures = subparsers.add_parser('plot_six_figures') + parser_complexity_map = subparsers.add_parser('plot_complexity_map') + parser_long_fig = subparsers.add_parser('plot_long_fig') + + args = parser.parse_args() + + if args.mode == 'plot_classwise_iteration_map': + plot_classwise_iteration_map(args) + + elif args.mode == 'plot_six_figures': + plot_six_figures(args) + + elif args.mode == 'plot_complexity_map': + plot_complexity_map(args) + + elif args.mode == 'plot_long_fig': + plot_long_fig(args) + + else: + raise Exception('Incorrect argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/plot_statistics.py b/audio_detection/audio_infer/utils/plot_statistics.py new file mode 100644 index 0000000000000000000000000000000000000000..bebb28af3e3468e8422c6901e1aba9600270ef89 --- /dev/null +++ b/audio_detection/audio_infer/utils/plot_statistics.py @@ -0,0 +1,2034 @@ +import os +import sys +import numpy as np +import argparse +import h5py +import time +import _pickle as cPickle +import _pickle +import matplotlib.pyplot as plt +import csv +from sklearn import metrics + +from utilities import (create_folder, get_filename, d_prime) +import config + + +def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + +def _load_metrics0_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + return statistics_dict['test'][300]['average_precision'] + + +def _load_metrics0_classwise2(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, 
data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + k = 270 + mAP = np.mean(statistics_dict['test'][k]['average_precision']) + mAUC = np.mean(statistics_dict['test'][k]['auc']) + dprime = d_prime(mAUC) + return mAP, mAUC, dprime + + +def _load_metrics_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace = '/mnt/cephfs_new_wj/speechsv/kongqiuqiang/workspaces/cvssp/pub_audioset_tagging_cnn' + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + k = 300 + mAP = np.mean(statistics_dict['test'][k]['average_precision']) + mAUC = np.mean(statistics_dict['test'][k]['auc']) + dprime = d_prime(mAUC) + return mAP, mAUC, dprime + + +def plot(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + select = args.select + + classes_num = config.classes_num + max_plot_iteration = 1000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + class_labels_indices_path = os.path.join(dataset_dir, 'metadata', + 'class_labels_indices.csv') + + save_out_path = 'results/{}.pdf'.format(select) + create_folder(os.path.dirname(save_out_path)) + + # Read labels + labels = config.labels + + # Plot + fig, ax = plt.subplots(1, 1, figsize=(15, 8)) + lines = [] + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + 
bal_alpha = 0.3 + test_alpha = 1.0 + lines = [] + + if select == '1_cnn13': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_no_dropout', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_no_specaug', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_no_dropout', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_no_mixup', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_mixup_in_wave', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_mixup_in_wave', color='c', alpha=test_alpha) + lines.append(line) + + elif select == '1_pooling': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_gwrp', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_gmpgapgwrp', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_att', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_gmpgapatt', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_resnet': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='ResNet18', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='resnet34', color='k', alpha=test_alpha) + lines.append(line) + + 
(bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='resnet50', color='c', alpha=test_alpha) + lines.append(line) + + elif select == '1_densenet': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'DenseNet121', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='densenet121', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'DenseNet201', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='densenet201', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_cnn9': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn5', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn9', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_hop': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_hop500', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_hop640', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_hop1000', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_emb': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 
'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_emb32', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_emb128', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_emb512', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_mobilenet': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='mobilenetv1', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='mobilenetv2', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_waveform': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_LeeNet', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_LeeNet18', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_DaiNet', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='c', 
alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='c', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_ResNet50', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_waveform_cnn2d': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_decision_level': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_DecisionLevelMax', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_DecisionLevelAvg', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_DecisionLevelAtt', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_transformer': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_Transformer1', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer3', 'clip_bce', 'balanced', 
'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_Transformer3', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer6', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_Transformer6', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_aug': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_bal_train_aug': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_sr': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 
'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_16k', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_8k', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '1_time_domain': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_time_domain', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '1_partial_full': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.8', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.5', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_window': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 2048, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_win2048', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '1_melbins': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 
'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_mel32', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_mel128', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_alternate': + max_plot_iteration = 2000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'alternate', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_alternate', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '2_all': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='MobileNetV1', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='ResNet34', color='grey', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 
320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='m', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='orange', alpha=test_alpha) + lines.append(line) + + elif select == '2_emb': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_emb32', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_128', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_512', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '2_aug': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,none,none', color='c', alpha=test_alpha) + lines.append(line) + + + + ax.set_ylim(0, 1.) 
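+    # The branches above all follow the same pattern: read a statistics.pkl
+    # written during training, take the per-class 'average_precision' arrays of
+    # the 'bal' and 'test' splits at every evaluation checkpoint, and average
+    # over classes to obtain one mAP value per checkpoint. The helper below is
+    # a minimal, self-contained sketch of that reduction only; it is not called
+    # anywhere, and the path in the example call is a placeholder.
+    def _example_map_curves(statistics_path):
+        """Return (bal_mAP, test_mAP) curves from one statistics.pkl file."""
+        import _pickle as cPickle
+        import numpy as np
+
+        with open(statistics_path, 'rb') as f:
+            statistics_dict = cPickle.load(f)
+
+        # Each split is a list over checkpoints; every entry holds an array of
+        # per-class average precision, so the class axis is the last one.
+        bal_map = np.mean(
+            [stat['average_precision'] for stat in statistics_dict['bal']], axis=-1)
+        test_map = np.mean(
+            [stat['average_precision'] for stat in statistics_dict['test']], axis=-1)
+        return bal_map, test_map
+    # Example (placeholder path):
+    # bal_map_curve, test_map_curve = _example_map_curves('statistics.pkl')
+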
+ ax.set_xlim(0, len(iterations)) + ax.xaxis.set_ticks(np.arange(0, len(iterations), 25)) + ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000)) + ax.yaxis.set_ticks(np.arange(0, 1.01, 0.05)) + ax.yaxis.set_ticklabels(np.around(np.arange(0, 1.01, 0.05), decimals=2)) + ax.grid(color='b', linestyle='solid', linewidth=0.3) + plt.legend(handles=lines, loc=2) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0)) + + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def plot_for_paper(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + select = args.select + + classes_num = config.classes_num + max_plot_iteration = 1000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + class_labels_indices_path = os.path.join(dataset_dir, 'metadata', + 'class_labels_indices.csv') + + save_out_path = 'results/paper_{}.pdf'.format(select) + create_folder(os.path.dirname(save_out_path)) + + # Read labels + labels = config.labels + + # Plot + fig, ax = plt.subplots(1, 1, figsize=(6, 4)) + lines = [] + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + bal_alpha = 0.3 + test_alpha = 1.0 + lines = [] + linewidth = 1. 
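+    # A short sketch of the x-axis convention used for these figures: curves are
+    # indexed by evaluation checkpoint (one point every 2000 training iterations),
+    # so placing a tick every 50 points corresponds to steps of 100k iterations.
+    # The helper is illustrative only and is not called; the names `step_iters`
+    # and `tick_every` are assumptions, not part of the original script.
+    def _example_iteration_ticks(axis, num_points, step_iters=2000, tick_every=50):
+        import numpy as np
+        positions = np.arange(0, num_points, tick_every)   # checkpoint indices
+        labels = ['{}k'.format(int(p) * step_iters // 1000) for p in positions]
+        axis.set_ticks(positions)        # e.g. 0, 50, 100, ...
+        axis.set_ticklabels(labels)      # e.g. '0k', '100k', '200k', ...
+    # e.g. _example_iteration_ticks(ax.xaxis, len(iterations)) would label the
+    # x-axis in 100k steps, which mirrors the hard-coded tick labels below.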
+ + max_plot_iteration = 540000 + + if select == '2_all': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_emb': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, 
label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + # line, = ax.plot(test_map, label='Cnn13_512', color='g', alpha=test_alpha) + # lines.append(line) + + elif select == '2_bal': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_sr': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 
'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_partial': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + # 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + # 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='cnn14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_melbins': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax.set_ylim(0, 0.8) + ax.set_xlim(0, len(iterations)) + ax.set_xlabel('Iterations') + ax.set_ylabel('mAP') + ax.xaxis.set_ticks(np.arange(0, len(iterations), 50)) + # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000)) + 
ax.xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k']) + ax.yaxis.set_ticks(np.arange(0, 0.81, 0.05)) + ax.yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8']) + # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2)) + ax.yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + ax.xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + plt.legend(handles=lines, loc=2) + plt.tight_layout(0, 0, 0) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0)) + + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def plot_for_paper2(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + + classes_num = config.classes_num + max_plot_iteration = 1000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + class_labels_indices_path = os.path.join(dataset_dir, 'metadata', + 'class_labels_indices.csv') + + save_out_path = 'results/paper2.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Read labels + labels = config.labels + + # Plot + fig, ax = plt.subplots(2, 3, figsize=(14, 7)) + lines = [] + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, 
axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + bal_alpha = 0.3 + test_alpha = 1.0 + lines = [] + linewidth = 1. + + max_plot_iteration = 540000 + + if True: + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax[0, 0].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + # line, = ax[0, 0].plot(test_map, label='ResNet38', color='k', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 0].legend(handles=lines, loc=2) + ax[0, 0].set_title('(a) Comparison of architectures') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 1].plot(bal_map, color='r', 
alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 1].legend(handles=lines, loc=2, fontsize=8) + + ax[0, 1].set_title('(b) Comparison of training data and augmentation') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 2].legend(handles=lines, loc=2) + ax[0, 2].set_title('(c) Comparison of embedding size') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + 
(bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='cnn14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 0].legend(handles=lines, loc=2) + ax[1, 0].set_title('(d) Comparison of amount of training data') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 1].legend(handles=lines, loc=2) + ax[1, 1].set_title('(e) Comparison of sampling rate') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, 
label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 2].legend(handles=lines, loc=2) + ax[1, 2].set_title('(f) Comparison of mel bins number') + + for i in range(2): + for j in range(3): + ax[i, j].set_ylim(0, 0.8) + ax[i, j].set_xlim(0, len(iterations)) + ax[i, j].set_xlabel('Iterations') + ax[i, j].set_ylabel('mAP') + ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50)) + # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000)) + ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k']) + ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05)) + ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8']) + # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2)) + ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(0, 1, 0) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0)) + + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def table_values(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + select = args.select + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + idx = iteration // 2000 + mAP = np.mean(statistics_dict['test'][idx]['average_precision']) + mAUC = np.mean(statistics_dict['test'][idx]['auc']) + dprime = d_prime(mAUC) + + print('mAP: {:.3f}'.format(mAP)) + print('mAUC: {:.3f}'.format(mAUC)) + print('dprime: {:.3f}'.format(dprime)) + + + if select == 'cnn13': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn5': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn9': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_decisionlevelmax': + iteration = 400000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_decisionlevelavg': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_decisionlevelatt': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 
'cnn13_emb32': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_emb128': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_emb512': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_hop500': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_hop640': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_hop1000': + iteration = 540000 + _load_metrics('main', 32000, 1024, + 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'mobilenetv1': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'mobilenetv2': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet18': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet34': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet50': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'dainet': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'leenet': + iteration = 540000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'leenet18': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet34_1d': + iteration = 500000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet50_1d': + iteration = 500000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'waveform_cnn2d': + iteration = 660000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'waveform_spandwav': + iteration = 700000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + +def crop_label(label): + max_len = 16 + if len(label) <= max_len: + return label + else: + words = label.split(' ') + cropped_label = '' + for w 
in words: + if len(cropped_label + ' ' + w) > max_len: + break + else: + cropped_label += ' {}'.format(w) + return cropped_label + +def add_comma(integer): + integer = int(integer) + if integer >= 1000: + return str(integer // 1000) + ',' + str(integer % 1000) + else: + return str(integer) + + +def plot_class_iteration(args): + + # Arguments & parameters + workspace = args.workspace + select = args.select + + save_out_path = 'results_map/class_iteration_map.pdf' + create_folder(os.path.dirname(save_out_path)) + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + return statistics_dict + + iteration = 600000 + statistics_dict = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']]) + mAP_mat = mAP_mat[0 : 300, :] + sorted_indexes = np.argsort(config.full_samples_per_class)[::-1] + + + fig, axs = plt.subplots(1, 3, figsize=(20, 5)) + ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)] + axs[0].set_ylabel('AP') + + for col in range(0, 3): + axs[col].set_ylim(0, 1.) + axs[col].set_xlim(0, 301) + axs[col].set_xlabel('Iterations') + axs[col].set_ylabel('AP') + axs[col].xaxis.set_ticks(np.arange(0, 301, 100)) + axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k']) + lines = [] + for _ix in ranges[col]: + _label = crop_label(config.labels[sorted_indexes[_ix]]) + \ + ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]])) + line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label) + lines.append(line) + box = axs[col].get_position() + axs[col].set_position([box.x0, box.y0, box.width * 1., box.height]) + axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.)) + axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(pad=4, w_pad=1, h_pad=1) + plt.savefig(save_out_path) + print(save_out_path) + + +def _load_old_metrics(workspace, filename, iteration, data_type): + + assert data_type in ['train', 'test'] + + stat_name = "stat_{}_iters.p".format(iteration) + + # Load stats + stat_path = os.path.join(workspace, "stats", filename, data_type, stat_name) + try: + stats = cPickle.load(open(stat_path, 'rb')) + except: + stats = cPickle.load(open(stat_path, 'rb'), encoding='latin1') + + precisions = [stat['precisions'] for stat in stats] + recalls = [stat['recalls'] for stat in stats] + maps = np.array([stat['AP'] for stat in stats]) + aucs = np.array([stat['auc'] for stat in stats]) + + return {'average_precision': maps, 'AUC': aucs} + +def _sort(ys): + sorted_idxes = np.argsort(ys) + sorted_idxes = sorted_idxes[::-1] + sorted_ys = ys[sorted_idxes] + sorted_lbs = [config.labels[e] for e in sorted_idxes] + return sorted_ys, sorted_idxes, sorted_lbs + +def load_data(hdf5_path): + with h5py.File(hdf5_path, 'r') as hf: + x = hf['x'][:] + y = hf['y'][:] + 
video_id_list = list(hf['video_id_list'][:]) + return x, y, video_id_list + +def get_avg_stats(workspace, bgn_iter, fin_iter, interval_iter, filename, data_type): + + assert data_type in ['train', 'test'] + bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5" + eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5" + unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5" + + t1 = time.time() + if data_type == 'test': + (te_x, te_y, te_id_list) = load_data(eval_hdf5) + elif data_type == 'train': + (te_x, te_y, te_id_list) = load_data(bal_train_hdf5) + y = te_y + + prob_dir = os.path.join(workspace, "probs", filename, data_type) + names = os.listdir(prob_dir) + + probs = [] + iters = range(bgn_iter, fin_iter, interval_iter) + for iter in iters: + pickle_path = os.path.join(prob_dir, "prob_%d_iters.p" % iter) + try: + prob = cPickle.load(open(pickle_path, 'rb')) + except: + prob = cPickle.load(open(pickle_path, 'rb'), encoding='latin1') + probs.append(prob) + + avg_prob = np.mean(np.array(probs), axis=0) + + n_out = y.shape[1] + stats = [] + for k in range(n_out): # around 7 seconds + (precisions, recalls, thresholds) = metrics.precision_recall_curve(y[:, k], avg_prob[:, k]) + avg_precision = metrics.average_precision_score(y[:, k], avg_prob[:, k], average=None) + (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], avg_prob[:, k]) + auc = metrics.roc_auc_score(y[:, k], avg_prob[:, k], average=None) + # eer = pp_data.eer(avg_prob[:, k], y[:, k]) + + skip = 1000 + dict = {'precisions': precisions[0::skip], 'recalls': recalls[0::skip], 'AP': avg_precision, + 'fpr': fpr[0::skip], 'fnr': 1. - tpr[0::skip], 'auc': auc} + + stats.append(dict) + + mAPs = np.array([e['AP'] for e in stats]) + aucs = np.array([e['auc'] for e in stats]) + + print("Get avg time: {}".format(time.time() - t1)) + + return {'average_precision': mAPs, 'auc': aucs} + + +def _samples_num_per_class(): + bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5" + eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5" + unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5" + + (x, y, id_list) = load_data(eval_hdf5) + eval_num = np.sum(y, axis=0) + + (x, y, id_list) = load_data(bal_train_hdf5) + bal_num = np.sum(y, axis=0) + + (x, y, id_list) = load_data(unbal_train_hdf5) + unbal_num = np.sum(y, axis=0) + + return bal_num, unbal_num, eval_num + + +def get_label_quality(): + + rate_csv = '/vol/vssp/msos/qk/workspaces/pub_audioset_tagging_cnn_transfer/metadata/qa_true_counts.csv' + + with open(rate_csv, 'r') as f: + reader = csv.reader(f, delimiter=',') + lis = list(reader) + + rates = [] + + for n in range(1, len(lis)): + li = lis[n] + if float(li[1]) == 0: + rate = None + else: + rate = float(li[2]) / float(li[1]) + rates.append(rate) + + return rates + + +def summary_stats(args): + # Arguments & parameters + workspace = args.workspace + + out_stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl') + create_folder(os.path.dirname(out_stat_path)) + + # Old workspace + old_workspace = '/vol/vssp/msos/qk/workspaces/audioset_classification' + + # bal_train_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'train') + # eval_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'test') + + bal_train_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, interval_iter=5000, filename='tmp127_re', data_type='train') + eval_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, 
interval_iter=5000, filename='tmp127_re', data_type='test') + + maps0te = eval_metrics['average_precision'] + (maps0te, sorted_idxes, sorted_lbs) = _sort(maps0te) + + bal_num, unbal_num, eval_num = _samples_num_per_class() + + output_dict = { + 'labels': config.labels, + 'label_quality': get_label_quality(), + 'sorted_indexes_for_plot': sorted_idxes, + 'official_balanced_trainig_samples': bal_num, + 'official_unbalanced_training_samples': unbal_num, + 'official_eval_samples': eval_num, + 'downloaded_full_training_samples': config.full_samples_per_class, + 'averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations': + {'bal_train': bal_train_metrics, 'eval': eval_metrics} + } + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration): + _workspace = '/vol/vssp/msos/qk/bytedance/workspaces_important/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(_workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + _idx = iteration // 2000 + _dict = {'bal_train': {'average_precision': statistics_dict['bal'][_idx]['average_precision'], + 'auc': statistics_dict['bal'][_idx]['auc']}, + 'eval': {'average_precision': statistics_dict['test'][_idx]['average_precision'], + 'auc': statistics_dict['test'][_idx]['auc']}} + return _dict + + iteration = 600000 + output_dict['cnn13_system_iteration60k'] = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + iteration = 560000 + output_dict['mobilenetv1_system_iteration56k'] = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + cPickle.dump(output_dict, open(out_stat_path, 'wb')) + print('Write stats for paper to {}'.format(out_stat_path)) + + +def prepare_plot_long_4_rows(sorted_lbs): + N = len(sorted_lbs) + + f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1,sharey=False, facecolor='w', figsize=(10, 12)) + + fontsize = 5 + + K = 132 + ax1a.set_xlim(0, K) + ax2a.set_xlim(K, 2 * K) + ax3a.set_xlim(2 * K, 3 * K) + ax4a.set_xlim(3 * K, N) + + truncated_sorted_lbs = [] + for lb in sorted_lbs: + lb = lb[0 : 25] + words = lb.split(' ') + if len(words[-1]) < 3: + lb = ' '.join(words[0:-1]) + truncated_sorted_lbs.append(lb) + + ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + + ax1a.set_yscale('log') + ax2a.set_yscale('log') + ax3a.set_yscale('log') + ax4a.set_yscale('log') + + ax1b = ax1a.twinx() + ax2b = ax2a.twinx() + ax3b = ax3a.twinx() + ax4b = ax4a.twinx() + ax1b.set_ylim(0., 1.) + ax2b.set_ylim(0., 1.) + ax3b.set_ylim(0., 1.) + ax4b.set_ylim(0., 1.) 
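+    # Clarifying note (added, inferred from the surrounding code): the four "a" axes
+    # above carry log-scale bars of the number of audio clips per class, while their
+    # twinned "b" axes overlay class-wise average precision on a linear [0, 1] scale;
+    # the sorted class labels are split across the four rows in windows of K = 132 ticks.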
+ ax1b.set_ylabel('Average precision') + ax2b.set_ylabel('Average precision') + ax3b.set_ylabel('Average precision') + ax4b.set_ylabel('Average precision') + + ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + + ax1a.xaxis.set_ticks(np.arange(K)) + ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize) + ax1a.xaxis.tick_bottom() + ax1a.set_ylabel("Number of audio clips") + + ax2a.xaxis.set_ticks(np.arange(K, 2*K)) + ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize) + ax2a.xaxis.tick_bottom() + # ax2a.tick_params(left='off', which='both') + ax2a.set_ylabel("Number of audio clips") + + ax3a.xaxis.set_ticks(np.arange(2*K, 3*K)) + ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize) + ax3a.xaxis.tick_bottom() + ax3a.set_ylabel("Number of audio clips") + + ax4a.xaxis.set_ticks(np.arange(3*K, N)) + ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize) + ax4a.xaxis.tick_bottom() + # ax4a.tick_params(left='off', which='both') + ax4a.set_ylabel("Number of audio clips") + + ax1a.spines['right'].set_visible(False) + ax1b.spines['right'].set_visible(False) + ax2a.spines['left'].set_visible(False) + ax2b.spines['left'].set_visible(False) + ax2a.spines['right'].set_visible(False) + ax2b.spines['right'].set_visible(False) + ax3a.spines['left'].set_visible(False) + ax3b.spines['left'].set_visible(False) + ax3a.spines['right'].set_visible(False) + ax3b.spines['right'].set_visible(False) + ax4a.spines['left'].set_visible(False) + ax4b.spines['left'].set_visible(False) + + plt.subplots_adjust(hspace = 0.8) + + return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b + +def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.): + N = len(x) + ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + +def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""): + N = len(x) + ax.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha) + line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label) + return line + +def plot_long_fig(args): + # Arguments & parameters + workspace = args.workspace + + # Paths + stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl') + save_out_path = 'results/long_fig.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Stats + stats = cPickle.load(open(stat_path, 'rb')) + + N = len(config.labels) + sorted_indexes = stats['sorted_indexes_for_plot'] + sorted_labels = np.array(config.labels)[sorted_indexes] + audio_clips_per_class = stats['official_balanced_trainig_samples'] + stats['official_unbalanced_training_samples'] + audio_clips_per_class = audio_clips_per_class[sorted_indexes] + + (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels) + + # plot the same data on both axes + ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + 
ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + + maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision'] + maps_avg_instances = maps_avg_instances[sorted_indexes] + + maps_cnn13 = stats['cnn13_system_iteration60k']['eval']['average_precision'] + maps_cnn13 = maps_cnn13[sorted_indexes] + + maps_mobilenetv1 = stats['mobilenetv1_system_iteration56k']['eval']['average_precision'] + maps_mobilenetv1 = maps_mobilenetv1[sorted_indexes] + + maps_logmel_wavegram_cnn = _load_metrics0_classwise('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + maps_logmel_wavegram_cnn = maps_logmel_wavegram_cnn[sorted_indexes] + + _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k') + _scatter_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, s=5, c='r') + _scatter_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b') + _scatter_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, s=5, c='g') + + linewidth = 0.7 + line0te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k', linewidth=linewidth, label='AP with averaging instances (baseline)') + line1te = _plot_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, c='r', linewidth=linewidth, label='AP with CNN14') + line2te = _plot_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b', linewidth=linewidth, label='AP with MobileNetV1') + line3te = _plot_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN') + + label_quality = stats['label_quality'] + sorted_rate = np.array(label_quality)[sorted_indexes] + for k in range(len(sorted_rate)): + if sorted_rate[k] and sorted_rate[k] == 1: + sorted_rate[k] = 0.99 + + ax1b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+') + ax2b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+') + ax3b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+') + line_label_quality = ax4b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality') + ax1b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + ax2b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + ax3b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + ax4b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + + plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1) + + plt.savefig(save_out_path) + print('Save fig to {}'.format(save_out_path)) + +def plot_flops(args): + + # Arguments & parameters + workspace = args.workspace + + # Paths + save_out_path = 'results_map/flops.pdf' + create_folder(os.path.dirname(save_out_path)) + + plt.figure(figsize=(5, 5)) + fig, ax = plt.subplots(1, 1) + + model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54', + 'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18', + 'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 
'Wavegram-\nLogmel-CNN']) + flops = np.array([21.986, 21.986, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810, + 30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510]) + mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295, + 0.266, 0.336, 0.365, 0.355, 0.389, 0.439]) + + sorted_indexes = np.sort(flops) + ax.scatter(flops, mAPs) + + shift = [[1, 0.002], [1, -0.006], [-1, -0.014], [-2, 0.006], [-7, 0.006], + [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008], + [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]] + + for i, model_type in enumerate(model_types): + ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1])) + + ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]]) + ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]]) + ax.plot(flops[[6, 7]], mAPs[[6, 7]]) + ax.plot(flops[[9, 10]], mAPs[[9, 10]]) + ax.plot(flops[[11, 12]], mAPs[[11, 12]]) + ax.plot(flops[[13, 14]], mAPs[[13, 14]]) + + ax.set_xlim(0, 70) + ax.set_ylim(0.2, 0.5) + ax.set_xlabel('Multi-adds (million)') + ax.set_ylabel('mAP') + + plt.tight_layout(0, 0, 0) + + plt.savefig(save_out_path) + print('Write out figure to {}'.format(save_out_path)) + + +def spearman(args): + + # Arguments & parameters + workspace = args.workspace + + # Paths + stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl') + + # Stats + stats = cPickle.load(open(stat_path, 'rb')) + + label_quality = np.array([qu if qu else 0.5 for qu in stats['label_quality']]) + training_samples = np.array(stats['official_balanced_trainig_samples']) + \ + np.array(stats['official_unbalanced_training_samples']) + mAP = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision'] + + import scipy + samples_spearman = scipy.stats.spearmanr(training_samples, mAP)[0] + quality_spearman = scipy.stats.spearmanr(label_quality, mAP)[0] + + print('Training samples spearman: {:.3f}'.format(samples_spearman)) + print('Quality spearman: {:.3f}'.format(quality_spearman)) + + +def print_results(args): + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + # + (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + + # partial + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 
'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + # Sample rate + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + + # Mel bins + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + + import crash + asdf + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='') + subparsers = parser.add_subparsers(dest='mode') + + parser_plot = subparsers.add_parser('plot') + parser_plot.add_argument('--dataset_dir', type=str, required=True) + parser_plot.add_argument('--workspace', type=str, required=True) + parser_plot.add_argument('--select', type=str, required=True) + + parser_plot = subparsers.add_parser('plot_for_paper') + parser_plot.add_argument('--dataset_dir', type=str, required=True) + parser_plot.add_argument('--workspace', type=str, required=True) + parser_plot.add_argument('--select', type=str, required=True) + + parser_plot = subparsers.add_parser('plot_for_paper2') + parser_plot.add_argument('--dataset_dir', type=str, required=True) + parser_plot.add_argument('--workspace', type=str, required=True) + + parser_values = subparsers.add_parser('plot_class_iteration') + parser_values.add_argument('--workspace', type=str, required=True) + parser_values.add_argument('--select', type=str, required=True) + + parser_summary_stats = subparsers.add_parser('summary_stats') + parser_summary_stats.add_argument('--workspace', type=str, required=True) + + parser_plot_long = subparsers.add_parser('plot_long_fig') + parser_plot_long.add_argument('--workspace', type=str, required=True) + + parser_plot_flops = subparsers.add_parser('plot_flops') + parser_plot_flops.add_argument('--workspace', type=str, required=True) + + parser_spearman = subparsers.add_parser('spearman') + parser_spearman.add_argument('--workspace', type=str, required=True) + + parser_print = subparsers.add_parser('print') + parser_print.add_argument('--workspace', type=str, required=True) + + args = parser.parse_args() + + if args.mode == 'plot': + plot(args) + + elif args.mode == 'plot_for_paper': + plot_for_paper(args) + + elif args.mode == 'plot_for_paper2': + plot_for_paper2(args) + + elif args.mode == 'table_values': + table_values(args) + + elif args.mode == 'plot_class_iteration': + plot_class_iteration(args) + + elif args.mode == 'summary_stats': + summary_stats(args) + + elif args.mode == 'plot_long_fig': + plot_long_fig(args) + + elif args.mode == 'plot_flops': + plot_flops(args) + + elif args.mode == 'spearman': + spearman(args) + + elif args.mode == 'print': + print_results(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/utilities.py b/audio_detection/audio_infer/utils/utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1604579b88e7e1e79f6350376f89d9c1c85f44 --- /dev/null +++ 
b/audio_detection/audio_infer/utils/utilities.py @@ -0,0 +1,172 @@ +import os +import logging +import h5py +import soundfile +import librosa +import numpy as np +import pandas as pd +from scipy import stats +import datetime +import pickle + + +def create_folder(fd): + if not os.path.exists(fd): + os.makedirs(fd) + + +def get_filename(path): + path = os.path.realpath(path) + na_ext = path.split('/')[-1] + na = os.path.splitext(na_ext)[0] + return na + + +def get_sub_filepaths(folder): + paths = [] + for root, dirs, files in os.walk(folder): + for name in files: + path = os.path.join(root, name) + paths.append(path) + return paths + + +def create_logging(log_dir, filemode): + create_folder(log_dir) + i1 = 0 + + while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))): + i1 += 1 + + log_path = os.path.join(log_dir, '{:04d}.log'.format(i1)) + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', + datefmt='%a, %d %b %Y %H:%M:%S', + filename=log_path, + filemode=filemode) + + # Print to console + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + + return logging + + +def read_metadata(csv_path, classes_num, id_to_ix): + """Read metadata of AudioSet from a csv file. + + Args: + csv_path: str + + Returns: + meta_dict: {'audio_name': (audios_num,), 'target': (audios_num, classes_num)} + """ + + with open(csv_path, 'r') as fr: + lines = fr.readlines() + lines = lines[3:] # Remove heads + + audios_num = len(lines) + targets = np.zeros((audios_num, classes_num), dtype=np.bool) + audio_names = [] + + for n, line in enumerate(lines): + items = line.split(', ') + """items: ['--4gqARaEJE', '0.000', '10.000', '"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"\n']""" + + audio_name = 'Y{}.wav'.format(items[0]) # Audios are started with an extra 'Y' when downloading + label_ids = items[3].split('"')[1].split(',') + + audio_names.append(audio_name) + + # Target + for id in label_ids: + ix = id_to_ix[id] + targets[n, ix] = 1 + + meta_dict = {'audio_name': np.array(audio_names), 'target': targets} + return meta_dict + + +def float32_to_int16(x): + assert np.max(np.abs(x)) <= 1.2 + x = np.clip(x, -1, 1) + return (x * 32767.).astype(np.int16) + +def int16_to_float32(x): + return (x / 32767.).astype(np.float32) + + +def pad_or_truncate(x, audio_length): + """Pad all audio to specific length.""" + if len(x) <= audio_length: + return np.concatenate((x, np.zeros(audio_length - len(x))), axis=0) + else: + return x[0 : audio_length] + + +def d_prime(auc): + d_prime = stats.norm().ppf(auc) * np.sqrt(2.0) + return d_prime + + +class Mixup(object): + def __init__(self, mixup_alpha, random_seed=1234): + """Mixup coefficient generator. + """ + self.mixup_alpha = mixup_alpha + self.random_state = np.random.RandomState(random_seed) + + def get_lambda(self, batch_size): + """Get mixup random coefficients. + Args: + batch_size: int + Returns: + mixup_lambdas: (batch_size,) + """ + mixup_lambdas = [] + for n in range(0, batch_size, 2): + lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0] + mixup_lambdas.append(lam) + mixup_lambdas.append(1. - lam) + + return np.array(mixup_lambdas) + + +class StatisticsContainer(object): + def __init__(self, statistics_path): + """Contain statistics of different training iterations. 
+ """ + self.statistics_path = statistics_path + + self.backup_statistics_path = '{}_{}.pkl'.format( + os.path.splitext(self.statistics_path)[0], + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + + self.statistics_dict = {'bal': [], 'test': []} + + def append(self, iteration, statistics, data_type): + statistics['iteration'] = iteration + self.statistics_dict[data_type].append(statistics) + + def dump(self): + pickle.dump(self.statistics_dict, open(self.statistics_path, 'wb')) + pickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb')) + logging.info(' Dump statistics to {}'.format(self.statistics_path)) + logging.info(' Dump statistics to {}'.format(self.backup_statistics_path)) + + def load_state_dict(self, resume_iteration): + self.statistics_dict = pickle.load(open(self.statistics_path, 'rb')) + + resume_statistics_dict = {'bal': [], 'test': []} + + for key in self.statistics_dict.keys(): + for statistics in self.statistics_dict[key]: + if statistics['iteration'] <= resume_iteration: + resume_statistics_dict[key].append(statistics) + + self.statistics_dict = resume_statistics_dict \ No newline at end of file diff --git a/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc b/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0982846c06b554669d8f290a24eb2fdb172893a Binary files /dev/null and b/audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc differ diff --git a/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc b/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c90bde06c05575070a8743337c5b2bc4e139be3b Binary files /dev/null and b/audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc differ diff --git a/audio_detection/target_sound_detection/src/models.py b/audio_detection/target_sound_detection/src/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3016b9274aeb86091d30d980803c7106f15ddd54 --- /dev/null +++ b/audio_detection/target_sound_detection/src/models.py @@ -0,0 +1,1288 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2021/3/9 16:33 +# @Author : dongchao yang +# @File : train.py +from itertools import zip_longest +import numpy as np +from scipy import ndimage +import torch +import torch.nn as nn +import torch.nn.functional as F +import time +from torchlibrosa.augmentation import SpecAugmentation +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +import math +from sklearn.cluster import KMeans +import os +import time +from functools import partial +# import timm +# from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import warnings +from functools import partial +# from timm.models.registry import register_model +# from timm.models.vision_transformer import _cfg +# from mmdet.utils import get_root_logger +# from mmcv.runner import load_checkpoint +# from mmcv.runner import _load_checkpoint, load_state_dict +# import mmcv.runner +import copy +from collections import OrderedDict +import io +import re +DEBUG=0 +event_labels = ['Alarm', 'Alarm_clock', 'Animal', 'Applause', 'Arrow', 'Artillery_fire', + 'Babbling', 'Baby_laughter', 'Bark', 'Basketball_bounce', 'Battle_cry', + 'Bell', 'Bird', 'Bleat', 'Bouncing', 'Breathing', 'Buzz', 'Camera', + 'Cap_gun', 'Car', 'Car_alarm', 'Cat', 'Caw', 'Cheering', 
'Child_singing', + 'Choir', 'Chop', 'Chopping_(food)', 'Clapping', 'Clickety-clack', 'Clicking', + 'Clip-clop', 'Cluck', 'Coin_(dropping)', 'Computer_keyboard', 'Conversation', + 'Coo', 'Cough', 'Cowbell', 'Creak', 'Cricket', 'Croak', 'Crow', 'Crowd', 'DTMF', + 'Dog', 'Door', 'Drill', 'Drip', 'Engine', 'Engine_starting', 'Explosion', 'Fart', + 'Female_singing', 'Filing_(rasp)', 'Finger_snapping', 'Fire', 'Fire_alarm', 'Firecracker', + 'Fireworks', 'Frog', 'Gasp', 'Gears', 'Giggle', 'Glass', 'Glass_shatter', 'Gobble', 'Groan', + 'Growling', 'Hammer', 'Hands', 'Hiccup', 'Honk', 'Hoot', 'Howl', 'Human_sounds', 'Human_voice', + 'Insect', 'Laughter', 'Liquid', 'Machine_gun', 'Male_singing', 'Mechanisms', 'Meow', 'Moo', + 'Motorcycle', 'Mouse', 'Music', 'Oink', 'Owl', 'Pant', 'Pant_(dog)', 'Patter', 'Pig', 'Plop', + 'Pour', 'Power_tool', 'Purr', 'Quack', 'Radio', 'Rain_on_surface', 'Rapping', 'Rattle', + 'Reversing_beeps', 'Ringtone', 'Roar', 'Run', 'Rustle', 'Scissors', 'Scrape', 'Scratch', + 'Screaming', 'Sewing_machine', 'Shout', 'Shuffle', 'Shuffling_cards', 'Singing', + 'Single-lens_reflex_camera', 'Siren', 'Skateboard', 'Sniff', 'Snoring', 'Speech', + 'Speech_synthesizer', 'Spray', 'Squeak', 'Squeal', 'Steam', 'Stir', 'Surface_contact', + 'Tap', 'Tap_dance', 'Telephone_bell_ringing', 'Television', 'Tick', 'Tick-tock', 'Tools', + 'Train', 'Train_horn', 'Train_wheels_squealing', 'Truck', 'Turkey', 'Typewriter', 'Typing', + 'Vehicle', 'Video_game_sound', 'Water', 'Whimper_(dog)', 'Whip', 'Whispering', 'Whistle', + 'Whistling', 'Whoop', 'Wind', 'Writing', 'Yip', 'and_pans', 'bird_song', 'bleep', 'clink', + 'cock-a-doodle-doo', 'crinkling', 'dove', 'dribble', 'eructation', 'faucet', 'flapping_wings', + 'footsteps', 'gunfire', 'heartbeat', 'infant_cry', 'kid_speaking', 'man_speaking', 'mastication', + 'mice', 'river', 'rooster', 'silverware', 'skidding', 'smack', 'sobbing', 'speedboat', 'splatter', + 'surf', 'thud', 'thwack', 'toot', 'truck_horn', 'tweet', 'vroom', 'waterfowl', 'woman_speaking'] +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Default: strip + the prefix 'module.' by [(r'^module\\.', '')]. + Returns: + dict or OrderedDict: The loaded checkpoint. 
+ """ + + checkpoint = _load_checkpoint(filename, map_location, logger) + ''' + new_proj = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=2).unsqueeze(2).repeat(1,1,3,1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=3).unsqueeze(3).repeat(1,1,1,3)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + ''' + new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + +def init_weights(m): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) +def init_layer(layer): + """Initialize a Linear or Convolutional layer. """ + nn.init.xavier_uniform_(layer.weight) + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) + + +def init_bn(bn): + """Initialize a Batchnorm layer. """ + bn.bias.data.fill_(0.) + bn.weight.data.fill_(1.) + +class MaxPool(nn.Module): + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, decision): + return torch.max(decision, dim=self.pooldim)[0] + + +class LinearSoftPool(nn.Module): + """LinearSoftPool + Linear softmax, takes logits and returns a probability, near to the actual maximum value. 
+ Taken from the paper: + A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling + https://arxiv.org/abs/1810.09050 + """ + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, time_decision): + return (time_decision**2).sum(self.pooldim) / (time_decision.sum( + self.pooldim)+1e-7) + +class ConvBlock(nn.Module): + def __init__(self, in_channels, out_channels): + + super(ConvBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), stride=(1, 1), + padding=(1, 1), bias=False) + + self.conv2 = nn.Conv2d(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), stride=(1, 1), + padding=(1, 1), bias=False) + + self.bn1 = nn.BatchNorm2d(out_channels) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.init_weight() + + def init_weight(self): + init_layer(self.conv1) + init_layer(self.conv2) + init_bn(self.bn1) + init_bn(self.bn2) + + + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + + x = input + x = F.relu_(self.bn1(self.conv1(x))) + x = F.relu_(self.bn2(self.conv2(x))) + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x1 = F.avg_pool2d(x, kernel_size=pool_size) + x2 = F.max_pool2d(x, kernel_size=pool_size) + x = x1 + x2 + else: + raise Exception('Incorrect argument!') + + return x + +class ConvBlock_GLU(nn.Module): + def __init__(self, in_channels, out_channels,kernel_size=(3,3)): + super(ConvBlock_GLU, self).__init__() + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, stride=(1, 1), + padding=(1, 1), bias=False) + self.bn1 = nn.BatchNorm2d(out_channels) + self.sigmoid = nn.Sigmoid() + self.init_weight() + + def init_weight(self): + init_layer(self.conv1) + init_bn(self.bn1) + + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + x = input + x = self.bn1(self.conv1(x)) + cnn1 = self.sigmoid(x[:, :x.shape[1]//2, :, :]) + cnn2 = x[:,x.shape[1]//2:,:,:] + x = cnn1*cnn2 + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x1 = F.avg_pool2d(x, kernel_size=pool_size) + x2 = F.max_pool2d(x, kernel_size=pool_size) + x = x1 + x2 + elif pool_type == 'None': + pass + elif pool_type == 'LP': + pass + #nn.LPPool2d(4, pool_size) + else: + raise Exception('Incorrect argument!') + return x + +class Mul_scale_GLU(nn.Module): + def __init__(self): + super(Mul_scale_GLU,self).__init__() + self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) # 1*1 + self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) # 3*3 + self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) # 5*5 + self.conv_block2 = ConvBlock_GLU(in_channels=96, out_channels=128*2) + # self.conv_block3 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock_GLU(in_channels=128, out_channels=128*2) + self.conv_block4 = ConvBlock_GLU(in_channels=128, out_channels=256*2) + self.conv_block5 = ConvBlock_GLU(in_channels=256, out_channels=256*2) + self.conv_block6 = ConvBlock_GLU(in_channels=256, out_channels=512*2) + self.conv_block7 = ConvBlock_GLU(in_channels=512, out_channels=512*2) + self.padding = nn.ReplicationPad2d((0,1,0,1)) 
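+    # Note on the GLU front-end above (description of the existing code, for clarity):
+    # each ConvBlock_GLU halves its nominal out_channels via a gated linear unit,
+    # i.e. the first half of the feature maps is passed through a sigmoid and used
+    # to gate the second half (sigmoid(x[:, :C//2]) * x[:, C//2:]). The three parallel
+    # branches (1x1, 3x3 and 5x5 kernels, out_channels=64 each) therefore emit
+    # 32 channels apiece, which are concatenated into the 96 channels expected by
+    # conv_block2.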
+ + def forward(self, input, fi=None): + """ + Input: (batch_size, data_length)""" + x1 = self.conv_block1_1(input, pool_size=(2, 2), pool_type='avg') + x1 = x1[:,:,:500,:32] + #print('x1 ',x1.shape) + x2 = self.conv_block1_2(input,pool_size=(2,2),pool_type='avg') + #print('x2 ',x2.shape) + x3 = self.conv_block1_3(input,pool_size=(2,2),pool_type='avg') + x3 = self.padding(x3) + #print('x3 ',x3.shape) + # assert 1==2 + x = torch.cat([x1,x2],dim=1) + x = torch.cat([x,x3],dim=1) + #print('x ',x.shape) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='None') + x = self.conv_block3(x,pool_size=(2,2),pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) # + #print('x2,3 ',x.shape) + x = self.conv_block4(x, pool_size=(2, 4), pool_type='None') + x = self.conv_block5(x,pool_size=(2,4),pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + #print('x4,5 ',x.shape) + + x = self.conv_block6(x, pool_size=(1, 4), pool_type='None') + x = self.conv_block7(x, pool_size=(1, 4), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + # print('x6,7 ',x.shape) + # assert 1==2 + return x + +class Cnn14(nn.Module): + def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, mel_bins=64, fmin=50, + fmax=14000, classes_num=527): + + super(Cnn14, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) + self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) + + self.fc1 = nn.Linear(2048, 128, bias=True) + self.fc_audioset = nn.Linear(128, classes_num, bias=True) + + self.init_weight() + + def init_weight(self): + init_layer(self.fc1) + init_layer(self.fc_audioset) + + def forward(self, input_, mixup_lambda=None): + """ + Input: (batch_size, data_length)""" + input_ = input_.unsqueeze(1) + x = self.conv_block1(input_, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block5(x, pool_size=(1, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block6(x, pool_size=(1, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + # print(x.shape) + # x = torch.mean(x, dim=3) + x = x.transpose(1, 2).contiguous().flatten(-2) + x = self.fc1(x) + # print(x.shape) 
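+        # At this point x is a frame-level embedding of shape (batch, reduced_time, 128):
+        # with the default 64 mel bins the conv stack collapses the frequency axis to a
+        # single bin (and reduces time by a factor of 8), the flatten yields 2048 features
+        # per frame, and fc1 projects them to 128. In this variant the spectrogram/logmel
+        # extractors, spec_augmenter, bn0 and fc_audioset defined in __init__ are not used
+        # by forward(); the model acts purely as a frame-level embedding extractor.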
+ # assert 1==2 + # (x1,_) = torch.max(x, dim=2) + # x2 = torch.mean(x, dim=2) + # x = x1 + x2 + # x = F.dropout(x, p=0.5, training=self.training) + # x = F.relu_(self.fc1(x)) + # embedding = F.dropout(x, p=0.5, training=self.training) + return x + +class Cnn10_fi(nn.Module): + def __init__(self): + super(Cnn10_fi, self).__init__() + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + + # self.fc1 = nn.Linear(512, 512, bias=True) + # self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + # self.init_weight() + + def forward(self, input, fi=None): + """ + Input: (batch_size, data_length)""" + + x = self.conv_block1(input, pool_size=(2, 2), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=(2, 4), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=(1, 4), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + return x + +class Cnn10_mul_scale(nn.Module): + def __init__(self,scale=8): + super(Cnn10_mul_scale, self).__init__() + self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) + self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) + self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) + self.conv_block2 = ConvBlock(in_channels=96, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.scale = scale + self.padding = nn.ReplicationPad2d((0,1,0,1)) + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + """ + Input: (batch_size, data_length)""" + if self.scale == 8: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (2,4) + pool_size4 = (1,4) + elif self.scale == 4: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + elif self.scale == 2: + pool_size1 = (2,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + else: + pool_size1 = (1,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + # print('input ',input.shape) + x1 = self.conv_block1_1(input, pool_size=pool_size1, pool_type='avg') + x1 = x1[:,:,:500,:32] + #print('x1 ',x1.shape) + x2 = self.conv_block1_2(input, pool_size=pool_size1, pool_type='avg') + #print('x2 ',x2.shape) + x3 = self.conv_block1_3(input, pool_size=pool_size1, pool_type='avg') + x3 = self.padding(x3) + #print('x3 ',x3.shape) + # assert 1==2 + m_i = 
min(x3.shape[2],min(x1.shape[2],x2.shape[2])) + #print('m_i ', m_i) + x = torch.cat([x1[:,:,:m_i,:],x2[:,:, :m_i,:],x3[:,:, :m_i,:]],dim=1) + # x = torch.cat([x,x3],dim=1) + + # x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + return x + + +class Cnn10(nn.Module): + def __init__(self,scale=8): + super(Cnn10, self).__init__() + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.scale = scale + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + """ + Input: (batch_size, data_length)""" + if self.scale == 8: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (2,4) + pool_size4 = (1,4) + elif self.scale == 4: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + elif self.scale == 2: + pool_size1 = (2,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + else: + pool_size1 = (1,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + return x + +class MeanPool(nn.Module): + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, decision): + return torch.mean(decision, dim=self.pooldim) + +class ResPool(nn.Module): + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + self.linPool = LinearSoftPool(pooldim=1) + +class AutoExpPool(nn.Module): + def __init__(self, outputdim=10, pooldim=1): + super().__init__() + self.outputdim = outputdim + self.alpha = nn.Parameter(torch.full((outputdim, ), 1)) + self.pooldim = pooldim + + def forward(self, logits, decision): + scaled = self.alpha * decision # \alpha * P(Y|x) in the paper + return (logits * torch.exp(scaled)).sum( + self.pooldim) / torch.exp(scaled).sum(self.pooldim) + + +class SoftPool(nn.Module): + def __init__(self, T=1, pooldim=1): + super().__init__() + self.pooldim = pooldim + self.T = T + + def forward(self, logits, decision): + w = torch.softmax(decision / self.T, dim=self.pooldim) + return torch.sum(decision * w, dim=self.pooldim) + + +class AutoPool(nn.Module): + """docstring for AutoPool""" + def __init__(self, outputdim=10, pooldim=1): + super().__init__() + self.outputdim = outputdim + self.alpha = nn.Parameter(torch.ones(outputdim)) + self.dim = pooldim + + def forward(self, logits, decision): + scaled = self.alpha * decision # \alpha * P(Y|x) in the paper + weight = torch.softmax(scaled, dim=self.dim) + return torch.sum(decision * weight, dim=self.dim) # B x C + + +class ExtAttentionPool(nn.Module): + def 
__init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): + super().__init__() + self.inputdim = inputdim + self.outputdim = outputdim + self.pooldim = pooldim + self.attention = nn.Linear(inputdim, outputdim) + nn.init.zeros_(self.attention.weight) + nn.init.zeros_(self.attention.bias) + self.activ = nn.Softmax(dim=self.pooldim) + + def forward(self, logits, decision): + # Logits of shape (B, T, D), decision of shape (B, T, C) + w_x = self.activ(self.attention(logits) / self.outputdim) + h = (logits.permute(0, 2, 1).contiguous().unsqueeze(-2) * + w_x.unsqueeze(-1)).flatten(-2).contiguous() + return torch.sum(h, self.pooldim) + + +class AttentionPool(nn.Module): + """docstring for AttentionPool""" + def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): + super().__init__() + self.inputdim = inputdim + self.outputdim = outputdim + self.pooldim = pooldim + self.transform = nn.Linear(inputdim, outputdim) + self.activ = nn.Softmax(dim=self.pooldim) + self.eps = 1e-7 + + def forward(self, logits, decision): + # Input is (B, T, D) + # B, T , D + w = self.activ(torch.clamp(self.transform(logits), -15, 15)) + detect = (decision * w).sum( + self.pooldim) / (w.sum(self.pooldim) + self.eps) + # B, T, D + return detect + +class Block2D(nn.Module): + def __init__(self, cin, cout, kernel_size=3, padding=1): + super().__init__() + self.block = nn.Sequential( + nn.BatchNorm2d(cin), + nn.Conv2d(cin, + cout, + kernel_size=kernel_size, + padding=padding, + bias=False), + nn.LeakyReLU(inplace=True, negative_slope=0.1)) + + def forward(self, x): + return self.block(x) + +class AudioCNN(nn.Module): + def __init__(self, classes_num): + super(AudioCNN, self).__init__() + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.fc1 = nn.Linear(512,128,bias=True) + self.fc = nn.Linear(128, classes_num, bias=True) + self.init_weights() + + def init_weights(self): + init_layer(self.fc) + + def forward(self, input): + ''' + Input: (batch_size, times_steps, freq_bins)''' + # [128, 801, 168] --> [128,1,801,168] + x = input[:, None, :, :] + '''(batch_size, 1, times_steps, freq_bins)''' + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') # 128,64,400,84 + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') # 128,128,200,42 + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') # 128,256,100,21 + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') # 128,512,50,10 + '''(batch_size, feature_maps, time_steps, freq_bins)''' + x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) # 128,512,50 + (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 128,512 + x = self.fc1(x) # 128,128 + output = self.fc(x) # 128,10 + return x,output + + def extract(self,input): + '''Input: (batch_size, times_steps, freq_bins)''' + x = input[:, None, :, :] + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + '''(batch_size, feature_maps, time_steps, freq_bins)''' + x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) + (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) + x = self.fc1(x) # 128,128 + return x + +def parse_poolingfunction(poolingfunction_name='mean', **kwargs): + 
"""parse_poolingfunction + A heler function to parse any temporal pooling + Pooling is done on dimension 1 + :param poolingfunction_name: + :param **kwargs: + """ + poolingfunction_name = poolingfunction_name.lower() + if poolingfunction_name == 'mean': + return MeanPool(pooldim=1) + elif poolingfunction_name == 'max': + return MaxPool(pooldim=1) + elif poolingfunction_name == 'linear': + return LinearSoftPool(pooldim=1) + elif poolingfunction_name == 'expalpha': + return AutoExpPool(outputdim=kwargs['outputdim'], pooldim=1) + + elif poolingfunction_name == 'soft': + return SoftPool(pooldim=1) + elif poolingfunction_name == 'auto': + return AutoPool(outputdim=kwargs['outputdim']) + elif poolingfunction_name == 'attention': + return AttentionPool(inputdim=kwargs['inputdim'], + outputdim=kwargs['outputdim']) +class conv1d(nn.Module): + def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID', dilation=1): + super(conv1d, self).__init__() + if padding == 'VALID': + dconv_pad = 0 + elif padding == 'SAME': + dconv_pad = dilation * ((kernel_size - 1) // 2) + else: + raise ValueError("Padding Mode Error!") + self.conv = nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride, padding=dconv_pad) + self.act = nn.ReLU() + self.init_layer(self.conv) + + def init_layer(self, layer, nonlinearity='relu'): + """Initialize a Linear or Convolutional layer. """ + nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity) + nn.init.constant_(layer.bias, 0.1) + + def forward(self, x): + out = self.act(self.conv(x)) + return out + +class Atten_1(nn.Module): + def __init__(self, input_dim, context=2, dropout_rate=0.2): + super(Atten_1, self).__init__() + self._matrix_k = nn.Linear(input_dim, input_dim // 4) + self._matrix_q = nn.Linear(input_dim, input_dim // 4) + self.relu = nn.ReLU() + self.context = context + self._dropout_layer = nn.Dropout(dropout_rate) + self.init_layer(self._matrix_k) + self.init_layer(self._matrix_q) + + def init_layer(self, layer, nonlinearity='leaky_relu'): + """Initialize a Linear or Convolutional layer. """ + nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity) + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) 
+    def forward(self, input_x):
+        k_x = input_x
+        k_x = self.relu(self._matrix_k(k_x))
+        k_x = self._dropout_layer(k_x)
+        q_x = input_x[:, self.context, :]
+        q_x = q_x[:, None, :]
+        q_x = self.relu(self._matrix_q(q_x))
+        q_x = self._dropout_layer(q_x)
+        x_ = torch.matmul(k_x, q_x.transpose(-2, -1) / math.sqrt(k_x.size(-1)))
+        x_ = x_.squeeze(2)
+        alpha = F.softmax(x_, dim=-1)
+        att_ = alpha
+        alpha = alpha.unsqueeze(2).repeat(1, 1, input_x.shape[2])
+        # alpha = alpha.view(alpha.size(0), alpha.size(1), alpha.size(2), 1)
+        out = alpha * input_x
+        # out = out.mean(2)
+        out = out.mean(1)
+        # y = alpha * input_x
+        # return y, att_
+        out = input_x[:, self.context, :] + out
+        return out
+
+class Fusion(nn.Module):
+    def __init__(self, inputdim, inputdim2, n_fac):
+        super().__init__()
+        self.fuse_layer1 = conv1d(inputdim, inputdim2*n_fac, 1)
+        self.fuse_layer2 = conv1d(inputdim2, inputdim2*n_fac, 1)
+        self.avg_pool = nn.AvgPool1d(n_fac, stride=n_fac)  # pool along the last (feature) dimension
+
+    def forward(self, embedding, mix_embed):
+        embedding = embedding.permute(0, 2, 1)
+        fuse1_out = self.fuse_layer1(embedding)  # e.g. [2, 501, 2560] (512*5); 1-D conv fusion that expands the reference/speaker embedding
+        fuse1_out = fuse1_out.permute(0, 2, 1)
+
+        mix_embed = mix_embed.permute(0, 2, 1)
+        fuse2_out = self.fuse_layer2(mix_embed)  # e.g. [2, 501, 2560] (512*5); 1-D conv fusion that expands the mixture embedding
+        fuse2_out = fuse2_out.permute(0, 2, 1)
+        as_embs = torch.mul(fuse1_out, fuse2_out)  # element-wise product, e.g. [2, 501, 2560]
+        # (10, 501, 512)
+        as_embs = self.avg_pool(as_embs)  # e.g. [2, 501, 512], i.e. 2560 // n_fac
+        return as_embs
+
+class CDur_fusion(nn.Module):
+    def __init__(self, inputdim, outputdim, **kwargs):
+        super().__init__()
+        self.features = nn.Sequential(
+            Block2D(1, 32),
+            nn.LPPool2d(4, (2, 4)),
+            Block2D(32, 128),
+            Block2D(128, 128),
+            nn.LPPool2d(4, (2, 4)),
+            Block2D(128, 128),
+            Block2D(128, 128),
+            nn.LPPool2d(4, (1, 4)),
+            nn.Dropout(0.3),
+        )
+        with torch.no_grad():
+            rnn_input_dim = self.features(torch.randn(1, 1, 500, inputdim)).shape
+            rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]
+
+        self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True)
+        self.fusion = Fusion(128, 128, 2)  # (embedding dim, feature dim, n_fac); second argument inferred from the GRU input size of 128
+        self.fc = nn.Linear(256, 256)
+        self.outputlayer = nn.Linear(256, outputdim)
+        self.features.apply(init_weights)
+        self.outputlayer.apply(init_weights)
+
+    def forward(self, x, embedding):
+        batch, time, dim = x.shape
+        x = x.unsqueeze(1)  # (b,1,t,d)
+        x = self.features(x)
+        x = x.transpose(1, 2).contiguous().flatten(-2)  # flatten the channel and frequency dims -> (b,125,128)
+        embedding = embedding.unsqueeze(1)
+        embedding = embedding.repeat(1, x.shape[1], 1)
+        x = self.fusion(embedding, x)
+        # x = torch.cat((x, embedding), dim=2)  # [B, T, 128 + emb_dim]
+        if not hasattr(self, '_flattened'):
+            self.gru.flatten_parameters()
+        x, _ = self.gru(x)  # x torch.Size([16, 125, 256])
+        x = self.fc(x)
+        decision_time = torch.softmax(self.outputlayer(x), dim=2)  # x torch.Size([16, 125, 2])
+        decision_up = torch.nn.functional.interpolate(
+            decision_time.transpose(1, 2),  # [16, 2, 125]
+            time,  # 501
+            mode='linear',
+            align_corners=False).transpose(1, 2)  # interpolate from 125 frames back to 501 -> (16,501,2)
+        return decision_time[:, :, 0], decision_up
+
+class CDur(nn.Module):
+    def __init__(self, inputdim, outputdim, time_resolution, **kwargs):
+        super().__init__()
+        self.features = nn.Sequential(
Block2D(1, 32), + nn.LPPool2d(4, (2, 4)), + Block2D(32, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 4)), + Block2D(128, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 4)), + nn.Dropout(0.3), + ) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + + self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True) + self.fc = nn.Linear(512,256) + self.outputlayer = nn.Linear(256, outputdim) + self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_big(nn.Module): + def __init__(self, inputdim, outputdim, **kwargs): + super().__init__() + self.features = nn.Sequential( + Block2D(1, 64), + Block2D(64, 64), + nn.LPPool2d(4, (2, 2)), + Block2D(64, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 2)), + Block2D(128, 256), + Block2D(256, 256), + nn.LPPool2d(4, (2, 4)), + Block2D(256, 512), + Block2D(512, 512), + nn.LPPool2d(4, (1, 4)), + nn.Dropout(0.3),) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_GLU(nn.Module): + def __init__(self, inputdim, outputdim, **kwargs): + super().__init__() + self.features = Mul_scale_GLU() + # with torch.no_grad(): + # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + self.gru = nn.GRU(640, 512,1, bidirectional=True, batch_first=True) # previous is 640 + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + # 
self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_CNN14(nn.Module): + def __init__(self, inputdim, outputdim,time_resolution,**kwargs): + super().__init__() + if time_resolution==125: + self.features = Cnn10(8) + elif time_resolution == 250: + #print('time_resolution ',time_resolution) + self.features = Cnn10(4) + elif time_resolution == 500: + self.features = Cnn10(2) + else: + self.features = Cnn10(0) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + # self.features = Cnn10() + self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + # self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_CNN_mul_scale(nn.Module): + def __init__(self, inputdim, outputdim,time_resolution,**kwargs): + super().__init__() + if time_resolution==125: + self.features = Cnn10_mul_scale(8) + elif time_resolution == 250: + #print('time_resolution ',time_resolution) + self.features = Cnn10_mul_scale(4) + elif time_resolution == 500: + self.features = Cnn10_mul_scale(2) + else: + self.features = Cnn10_mul_scale(0) + # with torch.no_grad(): + # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + # self.features = Cnn10() + self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + # 
self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): + # print('x ',x.shape) + # assert 1==2 + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_CNN_mul_scale_fusion(nn.Module): + def __init__(self, inputdim, outputdim, time_resolution,**kwargs): + super().__init__() + if time_resolution==125: + self.features = Cnn10_mul_scale(8) + elif time_resolution == 250: + #print('time_resolution ',time_resolution) + self.features = Cnn10_mul_scale(4) + elif time_resolution == 500: + self.features = Cnn10_mul_scale(2) + else: + self.features = Cnn10_mul_scale(0) + # with torch.no_grad(): + # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + # self.features = Cnn10() + self.gru = nn.GRU(512, 512, bidirectional=True, batch_first=True) + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.fusion = Fusion(128,512,2) + self.outputlayer = nn.Linear(256, outputdim) + # self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): + # print('x ',x.shape) + # assert 1==2 + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = self.fusion(embedding, x) + #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + + +class RaDur_fusion(nn.Module): + def __init__(self, model_config, inputdim, outputdim, time_resolution, **kwargs): + super().__init__() + self.encoder = Cnn14() + self.detection = CDur_CNN_mul_scale_fusion(inputdim, outputdim, time_resolution) + self.softmax = nn.Softmax(dim=2) + #self.temperature = 5 + # if model_config['pre_train']: + # self.encoder.load_state_dict(torch.load(model_config['encoder_path'])['model']) + # self.detection.load_state_dict(torch.load(model_config['CDur_path'])) + + self.q = nn.Linear(128,128) + self.k = nn.Linear(128,128) + self.q_ee = nn.Linear(128, 128) + 
self.k_ee = nn.Linear(128, 128) + self.temperature = 11.3 # sqrt(128) + self.att_pool = model_config['att_pool'] + self.enhancement = model_config['enhancement'] + self.tao = model_config['tao'] + self.top = model_config['top'] + self.bn = nn.BatchNorm1d(128) + self.EE_fusion = Fusion(128, 128, 4) + + def get_w(self,q,k): + q = self.q(q) + k = self.k(k) + q = q.unsqueeze(1) + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn/self.temperature + attn = self.softmax(attn) + return attn + + def get_w_ee(self,q,k): + q = self.q_ee(q) + k = self.k_ee(k) + q = q.unsqueeze(1) + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn/self.temperature + attn = self.softmax(attn) + return attn + + def attention_pooling(self, embeddings, mean_embedding): + att_pool_w = self.get_w(mean_embedding,embeddings) + embedding = torch.bmm(att_pool_w, embeddings).squeeze(1) + # print(embedding.shape) + # print(att_pool_w.shape) + # print(att_pool_w[0]) + # assert 1==2 + return embedding + + def select_topk_embeddings(self, scores, embeddings, k): + _, idx_DESC = scores.sort(descending=True, dim=1) # 根据分数进行排序 + top_k = _[:,:k] + # print('top_k ', top_k) + # top_k = top_k.mean(1) + idx_topk = idx_DESC[:, :k] # 取top_k个 + # print('index ', idx_topk) + idx_topk = idx_topk.unsqueeze(2).expand([-1, -1, embeddings.shape[2]]) + selected_embeddings = torch.gather(embeddings, 1, idx_topk) + return selected_embeddings,top_k + + def sum_with_attention(self, embedding, top_k, selected_embeddings): + # print('embedding ',embedding) + # print('selected_embeddings ',selected_embeddings.shape) + att_1 = self.get_w_ee(embedding, selected_embeddings) + att_1 = att_1.squeeze(1) + #print('att_1 ',att_1.shape) + larger = top_k > self.tao + # print('larger ',larger) + top_k = top_k*larger + # print('top_k ',top_k.shape) + # print('top_k ',top_k) + att_1 = att_1*top_k + #print('att_1 ',att_1.shape) + # assert 1==2 + att_2 = att_1.unsqueeze(2).repeat(1,1,128) + Es = selected_embeddings*att_2 + return Es + + def orcal_EE(self, x, embedding, label): + batch, time, dim = x.shape + + mixture_embedding = self.encoder(x) # 8, 125, 128 + mixture_embedding = mixture_embedding.transpose(1,2) + mixture_embedding = self.bn(mixture_embedding) + mixture_embedding = mixture_embedding.transpose(1,2) + + x = x.unsqueeze(1) # (b,1,t,d) + x = self.detection.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding_pre = embedding.unsqueeze(1) + embedding_pre = embedding_pre.repeat(1, x.shape[1], 1) + f = self.detection.fusion(embedding_pre, x) # the first stage results + #f = torch.cat((x, embedding_pre), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.detection.gru.flatten_parameters() + f, _ = self.detection.gru(f) # x torch.Size([16, 125, 256]) + f = self.detection.fc(f) + decision_time = torch.softmax(self.detection.outputlayer(f),dim=2) # x torch.Size([16, 125, 2]) + + selected_embeddings, top_k = self.select_topk_embeddings(decision_time[:,:,0], mixture_embedding, self.top) + + selected_embeddings = self.sum_with_attention(embedding, top_k, selected_embeddings) # add the weight + + mix_embedding = selected_embeddings.mean(1).unsqueeze(1) # + mix_embedding = mix_embedding.repeat(1, x.shape[1], 1) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + mix_embedding = self.EE_fusion(mix_embedding, embedding) # 使用神经网络进行融合 + # mix_embedding2 = selected_embeddings2.mean(1) + #mix_embedding = embedding + mix_embedding # 直接相加 + # new 
detection results + # embedding_now = mix_embedding.unsqueeze(1) + # embedding_now = embedding_now.repeat(1, x.shape[1], 1) + f_now = self.detection.fusion(mix_embedding, x) + #f_now = torch.cat((x, embedding_now), dim=2) # + f_now, _ = self.detection.gru(f_now) # x torch.Size([16, 125, 256]) + f_now = self.detection.fc(f_now) + decision_time_now = torch.softmax(self.detection.outputlayer(f_now), dim=2) # x torch.Size([16, 125, 2]) + + top_k = top_k.mean(1) # get avg score,higher score will have more weight + larger = top_k > self.tao + top_k = top_k * larger + top_k = top_k/2.0 + # print('top_k ',top_k) + # assert 1==2 + # print('tok_k[ ',top_k.shape) + # print('decision_time ',decision_time.shape) + # print('decision_time_now ',decision_time_now.shape) + neg_w = top_k.unsqueeze(1).unsqueeze(2) + neg_w = neg_w.repeat(1, decision_time_now.shape[1], decision_time_now.shape[2]) + # print('neg_w ',neg_w.shape) + #print('neg_w ',neg_w[:,0:10,0]) + pos_w = 1-neg_w + #print('pos_w ',pos_w[:,0:10,0]) + decision_time_final = decision_time*pos_w + neg_w*decision_time_now + #print('decision_time_final ',decision_time_final[0,0:10,0]) + # print(decision_time_final[0,:,:]) + #assert 1==2 + return decision_time_final + + def forward(self, x, ref, label=None): + batch, time, dim = x.shape + logit = torch.zeros(1).cuda() + embeddings = self.encoder(ref) + mean_embedding = embeddings.mean(1) + if self.att_pool == True: + mean_embedding = self.bn(mean_embedding) + embeddings = embeddings.transpose(1,2) + embeddings = self.bn(embeddings) + embeddings = embeddings.transpose(1,2) + embedding = self.attention_pooling(embeddings, mean_embedding) + else: + embedding = mean_embedding + if self.enhancement == True: + decision_time = self.orcal_EE(x, embedding, label) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0], decision_up, logit + + x = x.unsqueeze(1) # (b,1,t,d) + x = self.detection.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + x = self.detection.fusion(embedding, x) + # embedding = embedding.unsqueeze(1) + # embedding = embedding.repeat(1, x.shape[1], 1) + # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.detection.gru.flatten_parameters() + x, _ = self.detection.gru(x) # x torch.Size([16, 125, 256]) + x = self.detection.fc(x) + decision_time = torch.softmax(self.detection.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0], decision_up, logit \ No newline at end of file diff --git a/audio_detection/target_sound_detection/src/utils.py b/audio_detection/target_sound_detection/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cf1deeaef4e51fcc7cc42f4f3e2d9a34296371f9 --- /dev/null +++ b/audio_detection/target_sound_detection/src/utils.py @@ -0,0 +1,353 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2021/3/9 16:33 +# @Author : dongchao yang +# @File : train.py + +import collections +import sys +from loguru import logger 
+from pprint import pformat
+
+import numpy as np
+import pandas as pd
+import scipy
+import six
+import sklearn.preprocessing as pre
+import torch
+import tqdm
+import yaml
+
+from scipy.interpolate import interp1d
+
+def parse_config_or_kwargs(config_file, **kwargs):
+    """parse_config_or_kwargs
+    :param config_file: Config file that has parameters, yaml format
+    :param **kwargs: Other alternative parameters or overwrites for config
+    """
+    with open(config_file) as con_read:
+        yaml_config = yaml.load(con_read, Loader=yaml.FullLoader)
+    arguments = dict(yaml_config, **kwargs)
+    return arguments
+
+
+def find_contiguous_regions(activity_array):  # the vectorised binary ops below could also be written as an O(n) loop
+    """Find contiguous regions from bool valued numpy.array.
+    Copy of https://dcase-repo.github.io/dcase_util/_modules/dcase_util/data/decisions.html#DecisionEncoder
+    Reason is:
+    1. This does not belong to a class necessarily
+    2. Importing DecisionEncoder pulls in sndfile and other imports, which causes problems on some clusters
+    """
+    change_indices = np.logical_xor(activity_array[1:], activity_array[:-1]).nonzero()[0]
+    change_indices += 1
+    if activity_array[0]:
+        # If the first element of activity_array is True, add 0 at the beginning
+        change_indices = np.r_[0, change_indices]
+
+    if activity_array[-1]:
+        # If the last element of activity_array is True, add the length of the array
+        change_indices = np.r_[change_indices, activity_array.size]
+    # Reshape the result into two columns (onset, offset)
+    return change_indices.reshape((-1, 2))
+
+
+def split_train_cv(
+        data_frame: pd.DataFrame,
+        frac: float = 0.9,
+        y=None,  # Only for stratified, computes necessary split
+        **kwargs):
+    """split_train_cv
+
+    :param data_frame:
+    :type data_frame: pd.DataFrame
+    :param frac:
+    :type frac: float
+    """
+    if kwargs.get('mode',
+                  None) == 'urbansed':  # Filenames are DATA_-1 DATA_-2 etc
+        data_frame.loc[:, 'id'] = data_frame.groupby(
+            data_frame['filename'].str.split('_').apply(
+                lambda x: '_'.join(x[:-1]))).ngroup()
+        sampler = np.random.permutation(data_frame['id'].nunique())
+        num_train = int(frac * len(sampler))
+        train_indexes = sampler[:num_train]
+        cv_indexes = sampler[num_train:]
+        train_data = data_frame[data_frame['id'].isin(train_indexes)]
+        cv_data = data_frame[data_frame['id'].isin(cv_indexes)]
+        del train_data['id']
+        del cv_data['id']
+    elif kwargs.get('mode', None) == 'stratified':
+        # Use stratified sampling
+        from skmultilearn.model_selection import iterative_train_test_split
+        index_train, _, index_cv, _ = iterative_train_test_split(
+            data_frame.index.values.reshape(-1, 1), y, test_size=1.0 - frac)
+        train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
+        cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())]  # cv --> cross validation
+    else:
+        # Simply split train/test
+        train_data = data_frame.sample(frac=frac, random_state=10)
+        cv_data = data_frame[~data_frame.index.isin(train_data.index)]
+    return train_data, cv_data
+
+
+def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'):  # pretty-print a dict, e.g. a yaml config
+    """pprint_dict
+    :param outputfun: function to use, defaults to sys.stdout
+    :param in_dict: dict to print
+    """
+    if formatter == 'yaml':
+        format_fun = yaml.dump
+    elif formatter == 'pretty':
+        format_fun = pformat
+    for line in format_fun(in_dict).split('\n'):
+        outputfun(line)
+
+
+def getfile_outlogger(outputfile):
+    log_format = "[{time:YYYY-MM-DD HH:mm:ss}] {message}"
+    logger.configure(handlers=[{"sink": sys.stderr, "format": log_format}])
+    if outputfile:
+        logger.add(outputfile, enqueue=True, format=log_format)
+    return logger
+
+# Fit a label encoder from the given labels
+def train_labelencoder(labels: pd.Series, sparse=True):
+    """train_labelencoder
+
+    Fits a MultiLabelBinarizer on the raw labels and returns the encoder.
+
+    :param labels: pd.Series representing the raw labels e.g., Speech, Water
+    :param sparse: whether the encoder should produce sparse output
+    returns the fitted encoder
+    """
+    assert isinstance(labels, pd.Series), "Labels need to be series"
+    if isinstance(labels[0], six.string_types):
+        # In case of using non processed strings, e.g., Vacuum, Speech
+        label_array = labels.str.split(',').values.tolist()  # split each label string on ','
+    elif isinstance(labels[0], np.ndarray):
+        # Encoder does not like to see numpy array
+        label_array = [lab.tolist() for lab in labels]
+    elif isinstance(labels[0], collections.Iterable):
+        label_array = labels
+    encoder = pre.MultiLabelBinarizer(sparse_output=sparse)
+    encoder.fit(label_array)
+    return encoder
+
+
+def encode_labels(labels: pd.Series, encoder=None, sparse=True):
+    """encode_labels
+
+    Encodes labels
+
+    :param labels: pd.Series representing the raw labels e.g., Speech, Water
+    :param encoder (optional): Encoder already fitted
+    returns encoded labels (multi-hot) and the encoder
+    """
+    assert isinstance(labels, pd.Series), "Labels need to be series"
+    instance = labels.iloc[0]
+    if isinstance(instance, six.string_types):
+        # In case of using non processed strings, e.g., Vacuum, Speech
+        label_array = labels.str.split(',').values.tolist()
+    elif isinstance(instance, np.ndarray):
+        # Encoder does not like to see numpy array
+        label_array = [lab.tolist() for lab in labels]
+    elif isinstance(instance, collections.Iterable):
+        label_array = labels
+    # label_array is now a list of label lists, each label being a string
+    if not encoder:
+        encoder = pre.MultiLabelBinarizer(sparse_output=sparse)  # if no encoder was given, fit a new one first
+        encoder.fit(label_array)
+    labels_encoded = encoder.transform(label_array)  # transform strings to binary indicator vectors
+    return labels_encoded, encoder
+
+    # return pd.arrays.SparseArray(
+    #     [row.toarray().ravel() for row in labels_encoded]), encoder
+
+
+def decode_with_timestamps(events, labels: np.array):
+    """decode_with_timestamps
+    Decodes the predicted label array (2d) into a list of
+    [(Labelname, onset, offset), ...]
+
+    :param encoder: Encoder during training
+    :type encoder: pre.MultiLabelBinarizer
+    :param labels: n-dim array
+    :type labels: np.array
+    """
+    if labels.ndim == 2:
+        return [_decode_with_timestamps(events[i], labels[i]) for i in range(labels.shape[0])]
+    else:
+        return _decode_with_timestamps(events, labels)
+
+
+def median_filter(x, window_size, threshold=0.5):
+    """median_filter
+    :param x: input prediction array of shape (B, T, C) or (B, T).
+        Input is a sequence of probabilities 0 <= x <= 1
+    :param window_size: An integer to use
+    :param threshold: Binary thresholding threshold
+    """
+    x = binarize(x, threshold=threshold)  # binarize to 0/1
+    if x.ndim == 3:
+        size = (1, window_size, 1)
+    elif x.ndim == 2 and x.shape[0] == 1:
+        # Assume input is class-specific median filtering
+        # E.g., Batch x Time [1, 501]
+        size = (1, window_size)
+    elif x.ndim == 2 and x.shape[0] > 1:
+        # Assume input is standard median pooling, class-independent
+        # E.g., Time x Class [501, 10]
+        size = (window_size, 1)
+    return scipy.ndimage.median_filter(x, size=size)
+
+
+def _decode_with_timestamps(events, labels):
+    result_labels = []
+    change_indices = find_contiguous_regions(labels)
+    for row in change_indices:
+        result_labels.append((events, row[0], row[1]))
+    return result_labels
+
+def inverse_transform_labels(encoder, pred):
+    if pred.ndim == 3:
+        return [encoder.inverse_transform(x) for x in pred]
+    else:
+        return encoder.inverse_transform(pred)
+
+
+def binarize(pred, threshold=0.5):
+    # Batch_wise
+    if pred.ndim == 3:
+        return np.array(
+            [pre.binarize(sub, threshold=threshold) for sub in pred])
+    else:
+        return pre.binarize(pred, threshold=threshold)
+
+
+def double_threshold(x, high_thres, low_thres, n_connect=1):
+    """double_threshold
+    Helper function to calculate double threshold for n-dim arrays
+
+    :param x: input array
+    :param high_thres: high threshold value
+    :param low_thres: Low threshold value
+    :param n_connect: clusters that are <= n apart will be merged
+    """
+    assert x.ndim <= 3, "Whoops something went wrong with the input ({}), check if it's <= 3 dims".format(
+        x.shape)
+    if x.ndim == 3:
+        apply_dim = 1
+    elif x.ndim < 3:
+        apply_dim = 0
+    # x is assumed to be 3d: (batch, time, dim)
+    # Assumed to be 2d : (time, dim)
+    # Assumed to be 1d : (time)
+    # time axis is therefore at 1 for 3d and 0 for 2d
+    return np.apply_along_axis(lambda x: _double_threshold(
+        x, high_thres, low_thres, n_connect=n_connect),
+                               axis=apply_dim,
+                               arr=x)
+
+
+def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True):  # double thresholding essentially handles the boundary decision
+    """_double_threshold
+    Computes a double threshold over the input array
+
+    :param x: input array, needs to be 1d
+    :param high_thres: High threshold over the array
+    :param low_thres: Low threshold over the array
+    :param n_connect: Postprocessing, maximal distance between clusters to connect
+    :param return_arr: If True (default), return a 0/1 array of the same size as x; if False, return the filtered (start, end) index pairs.
+ """ + assert x.ndim == 1, "Input needs to be 1d" + high_locations = np.where(x > high_thres)[0] # return the index, where value is greater than high_thres + locations = x > low_thres # return true of false + encoded_pairs = find_contiguous_regions(locations) + # print('encoded_pairs ',encoded_pairs) + filtered_list = list( + filter( + lambda pair: + ((pair[0] <= high_locations) & (high_locations <= pair[1])).any(), + encoded_pairs)) # find encoded_pair where inclide a high_lacations + #print('filtered_list ',filtered_list) + filtered_list = connect_(filtered_list, n_connect) # if the distance of two pair is less than n_connect, we can merge them + if return_arr: + zero_one_arr = np.zeros_like(x, dtype=int) + for sl in filtered_list: + zero_one_arr[sl[0]:sl[1]] = 1 + return zero_one_arr + return filtered_list + + +def connect_clusters(x, n=1): + if x.ndim == 1: + return connect_clusters_(x, n) + if x.ndim >= 2: + return np.apply_along_axis(lambda a: connect_clusters_(a, n=n), -2, x) + + +def connect_clusters_(x, n=1): + """connect_clusters_ + Connects clustered predictions (0,1) in x with range n + + :param x: Input array. zero-one format + :param n: Number of frames to skip until connection can be made + """ + assert x.ndim == 1, "input needs to be 1d" + reg = find_contiguous_regions(x) + start_end = connect_(reg, n=n) + zero_one_arr = np.zeros_like(x, dtype=int) + for sl in start_end: + zero_one_arr[sl[0]:sl[1]] = 1 + return zero_one_arr + + +def connect_(pairs, n=1): + """connect_ + Connects two adjacent clusters if their distance is <= n + + :param pairs: Clusters of iterateables e.g., [(1,5),(7,10)] + :param n: distance between two clusters + """ + if len(pairs) == 0: + return [] + start_, end_ = pairs[0] + new_pairs = [] + for i, (next_item, cur_item) in enumerate(zip(pairs[1:], pairs[0:])): + end_ = next_item[1] + if next_item[0] - cur_item[1] <= n: + pass + else: + new_pairs.append((start_, cur_item[1])) + start_ = next_item[0] + new_pairs.append((start_, end_)) + return new_pairs + + +def predictions_to_time(df, ratio): + df.onset = df.onset * ratio + df.offset = df.offset * ratio + return df + +def upgrade_resolution(arr, scale): + print('arr ',arr.shape) + x = np.arange(0, arr.shape[0]) + f = interp1d(x, arr, kind='linear', axis=0, fill_value='extrapolate') + scale_x = np.arange(0, arr.shape[0], 1 / scale) + up_scale = f(scale_x) + return up_scale +# a = [0.1,0.2,0.3,0.8,0.4,0.1,0.3,0.9,0.4] +# a = np.array(a) +# b = a>0.2 +# _double_threshold(a,0.7,0.2) \ No newline at end of file diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth new file mode 100644 index 0000000000000000000000000000000000000000..30ee4a84d0ad9ada87a5ec32dc40ec789e559e82 --- /dev/null +++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e4525ad12621117c3a0fcfe974fd55e51583cd219106bf510438f4bec4edc18 +size 140604911 diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth new file mode 100644 index 0000000000000000000000000000000000000000..23719b4c8deee6c6bcac7d7704f6ced56fa289e1 --- /dev/null +++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1331dab1e4c3ac2bc5850156f2000a95fe333bdf06d08ce9b490550726548ab0 +size 2479 diff 
--git a/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bae9021caa4dd01659303bc05d2227436e7a64d --- /dev/null +++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b44e30c4800462c177806bbd7009953d70d531c873e3791ca9aa85375d524d +size 343538489 diff --git a/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth b/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth new file mode 100644 index 0000000000000000000000000000000000000000..80e1bacdfbba7071092e562b4ddfb1d8fbee6e83 --- /dev/null +++ b/audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de482358747778181e4dc530ec61ae94f53ae0b202ac92e99491fe4ceb3cbb1c +size 255398 diff --git a/audio_to_text/__init__.py b/audio_to_text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_to_text/__pycache__/__init__.cpython-38.pyc b/audio_to_text/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd50a50ca70fb2a85f608f8dddd11a6abb7b807d Binary files /dev/null and b/audio_to_text/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc b/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbe230a79bcf0de51c959381e83483d5a9f322b8 Binary files /dev/null and b/audio_to_text/__pycache__/inference_waveform.cpython-38.pyc differ diff --git a/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcdbfafa6487b60aeb8e60f7ad80da2cd1150308 --- /dev/null +++ b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml @@ -0,0 +1,23 @@ +model: + encoder: + type: Cnn14RnnEncoder + args: + sample_rate: 32000 + pretrained: ./audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth + freeze_cnn: True + freeze_cnn_bn: True + bidirectional: True + dropout: 0.5 + hidden_size: 256 + num_layers: 3 + decoder: + type: TransformerDecoder + args: + attn_emb_dim: 512 + dropout: 0.2 + emb_dim: 256 + fc_emb_dim: 512 + nlayers: 2 + type: TransformerModel + args: {} + diff --git a/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth new file mode 100644 index 0000000000000000000000000000000000000000..916026e45ca268db286047dacb1161a6a91a9613 --- /dev/null +++ b/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d22099e1025baae0f32ce09ec02c3d5fea001e295512fbf8754b5c66db21b0ec +size 43027289 diff --git a/audio_to_text/captioning/__init__.py b/audio_to_text/captioning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..a6c2da4d396315c560620b86eb2737a07e067ee9 Binary files /dev/null and b/audio_to_text/captioning/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__init__.py b/audio_to_text/captioning/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7259d671aaa8a7278b5aaa12069dc25caaad3cd8 --- /dev/null +++ b/audio_to_text/captioning/models/__init__.py @@ -0,0 +1,3 @@ +from .base_model import * +from .transformer_model import * + diff --git a/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c08c874fac4d909a82f27f959e743a4aba5436a8 Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c38109625aa375a8953c1adb9e8493ba1c592dcb Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/attn_model.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8610e38e506ce60292444561bc0a7652bf2d718f Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/base_model.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9848a5dd6fd832d108179372880bed510ebc7da Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/decoder.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a00468fcad9293ac03d90700e32320a3fa9e474 Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/encoder.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c9b1b8f984d37e0daed0fc541737be2f24a5e94 Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/fc_model.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06caef1f8b20de29821f255f2bf3263b5aa65211 Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/rl_model.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d383220b793d5a36727995d722ff8bbb7affbab Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/style_model.cpython-38.pyc differ diff --git 
a/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..597cea7bac6d491e52c98c1f4e9f5f0ee9659e24 Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/transformer_model.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc b/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ab5c1da309ac502cbce9dffb00956d5c668b63b Binary files /dev/null and b/audio_to_text/captioning/models/__pycache__/utils.cpython-38.pyc differ diff --git a/audio_to_text/captioning/models/base_model.py b/audio_to_text/captioning/models/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cd014e9b9e68fc80f44179ccbbe066791ecdd7c0 --- /dev/null +++ b/audio_to_text/captioning/models/base_model.py @@ -0,0 +1,500 @@ +# -*- coding: utf-8 -*- + +from typing import Dict + +import torch +import torch.nn as nn + +from .utils import mean_with_lens, repeat_tensor + + +class CaptionModel(nn.Module): + """ + Encoder-decoder captioning model. + """ + + pad_idx = 0 + start_idx = 1 + end_idx = 2 + max_length = 20 + + def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs): + super().__init__() + self.encoder = encoder + self.decoder = decoder + self.vocab_size = decoder.vocab_size + self.train_forward_keys = ["cap", "cap_len", "ss_ratio"] + self.inference_forward_keys = ["sample_method", "max_length", "temp"] + freeze_encoder = kwargs.get("freeze_encoder", False) + if freeze_encoder: + for param in self.encoder.parameters(): + param.requires_grad = False + self.check_decoder_compatibility() + + def check_decoder_compatibility(self): + compatible_decoders = [x.__class__.__name__ for x in self.compatible_decoders] + assert isinstance(self.decoder, self.compatible_decoders), \ + f"{self.decoder.__class__.__name__} is incompatible with " \ + f"{self.__class__.__name__}, please use decoder in {compatible_decoders} " + + @classmethod + def set_index(cls, start_idx, end_idx): + cls.start_idx = start_idx + cls.end_idx = end_idx + + def forward(self, input_dict: Dict): + """ + input_dict: { + (required) + mode: train/inference, + spec, + spec_len, + fc, + attn, + attn_len, + [sample_method: greedy], + [temp: 1.0] (in case of no teacher forcing) + + (optional, mode=train) + cap, + cap_len, + ss_ratio, + + (optional, mode=inference) + sample_method: greedy/beam, + max_length, + temp, + beam_size (optional, sample_method=beam), + n_best (optional, sample_method=beam), + } + """ + # encoder_input_keys = ["spec", "spec_len", "fc", "attn", "attn_len"] + # encoder_input = { key: input_dict[key] for key in encoder_input_keys } + encoder_output_dict = self.encoder(input_dict) + if input_dict["mode"] == "train": + forward_dict = { + "mode": "train", "sample_method": "greedy", "temp": 1.0 + } + for key in self.train_forward_keys: + forward_dict[key] = input_dict[key] + forward_dict.update(encoder_output_dict) + output = self.train_forward(forward_dict) + elif input_dict["mode"] == "inference": + forward_dict = {"mode": "inference"} + default_args = { "sample_method": "greedy", "max_length": self.max_length, "temp": 1.0 } + for key in self.inference_forward_keys: + if key in input_dict: + forward_dict[key] = input_dict[key] + else: + forward_dict[key] = default_args[key] + + if 
forward_dict["sample_method"] == "beam": + forward_dict["beam_size"] = input_dict.get("beam_size", 3) + forward_dict["n_best"] = input_dict.get("n_best", False) + forward_dict["n_best_size"] = input_dict.get("n_best_size", forward_dict["beam_size"]) + elif forward_dict["sample_method"] == "dbs": + forward_dict["beam_size"] = input_dict.get("beam_size", 6) + forward_dict["group_size"] = input_dict.get("group_size", 3) + forward_dict["diversity_lambda"] = input_dict.get("diversity_lambda", 0.5) + forward_dict["group_nbest"] = input_dict.get("group_nbest", True) + + forward_dict.update(encoder_output_dict) + output = self.inference_forward(forward_dict) + else: + raise Exception("mode should be either 'train' or 'inference'") + + return output + + def prepare_output(self, input_dict): + output = {} + batch_size = input_dict["fc_emb"].size(0) + if input_dict["mode"] == "train": + max_length = input_dict["cap"].size(1) - 1 + elif input_dict["mode"] == "inference": + max_length = input_dict["max_length"] + else: + raise Exception("mode should be either 'train' or 'inference'") + device = input_dict["fc_emb"].device + output["seq"] = torch.full((batch_size, max_length), self.end_idx, + dtype=torch.long) + output["logit"] = torch.empty(batch_size, max_length, + self.vocab_size).to(device) + output["sampled_logprob"] = torch.zeros(batch_size, max_length) + output["embed"] = torch.empty(batch_size, max_length, + self.decoder.d_model).to(device) + return output + + def train_forward(self, input_dict): + if input_dict["ss_ratio"] != 1: # scheduled sampling training + input_dict["mode"] = "train" + return self.stepwise_forward(input_dict) + output = self.seq_forward(input_dict) + self.train_process(output, input_dict) + return output + + def seq_forward(self, input_dict): + raise NotImplementedError + + def train_process(self, output, input_dict): + pass + + def inference_forward(self, input_dict): + if input_dict["sample_method"] == "beam": + return self.beam_search(input_dict) + elif input_dict["sample_method"] == "dbs": + return self.diverse_beam_search(input_dict) + return self.stepwise_forward(input_dict) + + def stepwise_forward(self, input_dict): + """Step-by-step decoding""" + output = self.prepare_output(input_dict) + max_length = output["seq"].size(1) + # start sampling + for t in range(max_length): + input_dict["t"] = t + self.decode_step(input_dict, output) + if input_dict["mode"] == "inference": # decide whether to stop when sampling + unfinished_t = output["seq"][:, t] != self.end_idx + if t == 0: + unfinished = unfinished_t + else: + unfinished *= unfinished_t + output["seq"][:, t][~unfinished] = self.end_idx + if unfinished.sum() == 0: + break + self.stepwise_process(output) + return output + + def decode_step(self, input_dict, output): + """Decoding operation of timestep t""" + decoder_input = self.prepare_decoder_input(input_dict, output) + # feed to the decoder to get logit + output_t = self.decoder(decoder_input) + logit_t = output_t["logit"] + # assert logit_t.ndim == 3 + if logit_t.size(1) == 1: + logit_t = logit_t.squeeze(1) + embed_t = output_t["embed"].squeeze(1) + elif logit_t.size(1) > 1: + logit_t = logit_t[:, -1, :] + embed_t = output_t["embed"][:, -1, :] + else: + raise Exception("no logit output") + # sample the next input word and get the corresponding logit + sampled = self.sample_next_word(logit_t, + method=input_dict["sample_method"], + temp=input_dict["temp"]) + + output_t.update(sampled) + output_t["t"] = input_dict["t"] + output_t["logit"] = logit_t + 
output_t["embed"] = embed_t + self.stepwise_process_step(output, output_t) + + def prepare_decoder_input(self, input_dict, output): + """Prepare the inp ut dict for the decoder""" + raise NotImplementedError + + def stepwise_process_step(self, output, output_t): + """Postprocessing (save output values) after each timestep t""" + t = output_t["t"] + output["logit"][:, t, :] = output_t["logit"] + output["seq"][:, t] = output_t["word"] + output["sampled_logprob"][:, t] = output_t["probs"] + output["embed"][:, t, :] = output_t["embed"] + + def stepwise_process(self, output): + """Postprocessing after the whole step-by-step autoregressive decoding""" + pass + + def sample_next_word(self, logit, method, temp): + """Sample the next word, given probs output by the decoder""" + logprob = torch.log_softmax(logit, dim=1) + if method == "greedy": + sampled_logprob, word = torch.max(logprob.detach(), 1) + elif method == "gumbel": + def sample_gumbel(shape, eps=1e-20): + U = torch.rand(shape).to(logprob.device) + return -torch.log(-torch.log(U + eps) + eps) + def gumbel_softmax_sample(logit, temperature): + y = logit + sample_gumbel(logit.size()) + return torch.log_softmax(y / temperature, dim=-1) + _logprob = gumbel_softmax_sample(logprob, temp) + _, word = torch.max(_logprob.data, 1) + sampled_logprob = logprob.gather(1, word.unsqueeze(-1)) + else: + logprob = logprob / temp + if method.startswith("top"): + top_num = float(method[3:]) + if 0 < top_num < 1: # top-p sampling + probs = torch.softmax(logit, dim=1) + sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=1) + _cumsum = sorted_probs.cumsum(1) + mask = _cumsum < top_num + mask = torch.cat([torch.ones_like(mask[:,:1]), mask[:,:-1]], 1) + sorted_probs = sorted_probs * mask.to(sorted_probs) + sorted_probs = sorted_probs / sorted_probs.sum(1, keepdim=True) + logprob.scatter_(1, sorted_indices, sorted_probs.log()) + else: # top-k sampling + k = int(top_num) + tmp = torch.empty_like(logprob).fill_(float('-inf')) + topk, indices = torch.topk(logprob, k, dim=1) + tmp = tmp.scatter(1, indices, topk) + logprob = tmp + word = torch.distributions.Categorical(logits=logprob.detach()).sample() + sampled_logprob = logprob.gather(1, word.unsqueeze(-1)).squeeze(1) + word = word.detach().long() + # sampled_logprob: [N,], word: [N,] + return {"word": word, "probs": sampled_logprob} + + def beam_search(self, input_dict): + output = self.prepare_output(input_dict) + max_length = input_dict["max_length"] + beam_size = input_dict["beam_size"] + if input_dict["n_best"]: + n_best_size = input_dict["n_best_size"] + batch_size, max_length = output["seq"].size() + output["seq"] = torch.full((batch_size, n_best_size, max_length), + self.end_idx, dtype=torch.long) + + temp = input_dict["temp"] + # instance by instance beam seach + for i in range(output["seq"].size(0)): + output_i = self.prepare_beamsearch_output(input_dict) + input_dict["sample_idx"] = i + for t in range(max_length): + input_dict["t"] = t + output_t = self.beamsearch_step(input_dict, output_i) + ####################################### + # merge with previous beam and select the current max prob beam + ####################################### + logit_t = output_t["logit"] + if logit_t.size(1) == 1: + logit_t = logit_t.squeeze(1) + elif logit_t.size(1) > 1: + logit_t = logit_t[:, -1, :] + else: + raise Exception("no logit output") + logprob_t = torch.log_softmax(logit_t, dim=1) + logprob_t = torch.log_softmax(logprob_t / temp, dim=1) + logprob_t = output_i["topk_logprob"].unsqueeze(1) + 
logprob_t + if t == 0: # for the first step, all k seq will have the same probs + topk_logprob, topk_words = logprob_t[0].topk( + beam_size, 0, True, True) + else: # unroll and find top logprob, and their unrolled indices + topk_logprob, topk_words = logprob_t.view(-1).topk( + beam_size, 0, True, True) + topk_words = topk_words.cpu() + output_i["topk_logprob"] = topk_logprob + # output_i["prev_words_beam"] = topk_words // self.vocab_size # [beam_size,] + output_i["prev_words_beam"] = torch.div(topk_words, self.vocab_size, + rounding_mode='trunc') + output_i["next_word"] = topk_words % self.vocab_size # [beam_size,] + if t == 0: + output_i["seq"] = output_i["next_word"].unsqueeze(1) + else: + output_i["seq"] = torch.cat([ + output_i["seq"][output_i["prev_words_beam"]], + output_i["next_word"].unsqueeze(1)], dim=1) + + # add finished beams to results + is_end = output_i["next_word"] == self.end_idx + if t == max_length - 1: + is_end.fill_(1) + + for beam_idx in range(beam_size): + if is_end[beam_idx]: + final_beam = { + "seq": output_i["seq"][beam_idx].clone(), + "score": output_i["topk_logprob"][beam_idx].item() + } + final_beam["score"] = final_beam["score"] / (t + 1) + output_i["done_beams"].append(final_beam) + output_i["topk_logprob"][is_end] -= 1000 + + self.beamsearch_process_step(output_i, output_t) + + self.beamsearch_process(output, output_i, input_dict) + return output + + def prepare_beamsearch_output(self, input_dict): + beam_size = input_dict["beam_size"] + device = input_dict["fc_emb"].device + output = { + "topk_logprob": torch.zeros(beam_size).to(device), + "seq": None, + "prev_words_beam": None, + "next_word": None, + "done_beams": [], + } + return output + + def beamsearch_step(self, input_dict, output_i): + decoder_input = self.prepare_beamsearch_decoder_input(input_dict, output_i) + output_t = self.decoder(decoder_input) + output_t["t"] = input_dict["t"] + return output_t + + def prepare_beamsearch_decoder_input(self, input_dict, output_i): + raise NotImplementedError + + def beamsearch_process_step(self, output_i, output_t): + pass + + def beamsearch_process(self, output, output_i, input_dict): + i = input_dict["sample_idx"] + done_beams = sorted(output_i["done_beams"], key=lambda x: -x["score"]) + if input_dict["n_best"]: + done_beams = done_beams[:input_dict["n_best_size"]] + for out_idx, done_beam in enumerate(done_beams): + seq = done_beam["seq"] + output["seq"][i][out_idx, :len(seq)] = seq + else: + seq = done_beams[0]["seq"] + output["seq"][i][:len(seq)] = seq + + def diverse_beam_search(self, input_dict): + + def add_diversity(seq_table, logprob, t, divm, diversity_lambda, bdash): + local_time = t - divm + unaug_logprob = logprob.clone() + + if divm > 0: + change = torch.zeros(logprob.size(-1)) + for prev_choice in range(divm): + prev_decisions = seq_table[prev_choice][..., local_time] + for prev_labels in range(bdash): + change.scatter_add_(0, prev_decisions[prev_labels], change.new_ones(1)) + + change = change.to(logprob.device) + logprob = logprob - repeat_tensor(change, bdash) * diversity_lambda + + return logprob, unaug_logprob + + output = self.prepare_output(input_dict) + group_size = input_dict["group_size"] + batch_size = output["seq"].size(0) + beam_size = input_dict["beam_size"] + bdash = beam_size // group_size + input_dict["bdash"] = bdash + diversity_lambda = input_dict["diversity_lambda"] + device = input_dict["fc_emb"].device + max_length = input_dict["max_length"] + temp = input_dict["temp"] + group_nbest = input_dict["group_nbest"] + batch_size, 
max_length = output["seq"].size() + if group_nbest: + output["seq"] = torch.full((batch_size, beam_size, max_length), + self.end_idx, dtype=torch.long) + else: + output["seq"] = torch.full((batch_size, group_size, max_length), + self.end_idx, dtype=torch.long) + + + for i in range(batch_size): + input_dict["sample_idx"] = i + seq_table = [torch.LongTensor(bdash, 0) for _ in range(group_size)] # group_size x [bdash, 0] + logprob_table = [torch.zeros(bdash).to(device) for _ in range(group_size)] + done_beams_table = [[] for _ in range(group_size)] + + output_i = { + "prev_words_beam": [None for _ in range(group_size)], + "next_word": [None for _ in range(group_size)], + "state": [None for _ in range(group_size)] + } + + for t in range(max_length + group_size - 1): + input_dict["t"] = t + for divm in range(group_size): + input_dict["divm"] = divm + if t >= divm and t <= max_length + divm - 1: + local_time = t - divm + decoder_input = self.prepare_dbs_decoder_input(input_dict, output_i) + output_t = self.decoder(decoder_input) + output_t["divm"] = divm + logit_t = output_t["logit"] + if logit_t.size(1) == 1: + logit_t = logit_t.squeeze(1) + elif logit_t.size(1) > 1: + logit_t = logit_t[:, -1, :] + else: + raise Exception("no logit output") + logprob_t = torch.log_softmax(logit_t, dim=1) + logprob_t = torch.log_softmax(logprob_t / temp, dim=1) + logprob_t, unaug_logprob_t = add_diversity(seq_table, logprob_t, t, divm, diversity_lambda, bdash) + logprob_t = logprob_table[divm].unsqueeze(-1) + logprob_t + if local_time == 0: # for the first step, all k seq will have the same probs + topk_logprob, topk_words = logprob_t[0].topk( + bdash, 0, True, True) + else: # unroll and find top logprob, and their unrolled indices + topk_logprob, topk_words = logprob_t.view(-1).topk( + bdash, 0, True, True) + topk_words = topk_words.cpu() + logprob_table[divm] = topk_logprob + output_i["prev_words_beam"][divm] = topk_words // self.vocab_size # [bdash,] + output_i["next_word"][divm] = topk_words % self.vocab_size # [bdash,] + if local_time > 0: + seq_table[divm] = seq_table[divm][output_i["prev_words_beam"][divm]] + seq_table[divm] = torch.cat([ + seq_table[divm], + output_i["next_word"][divm].unsqueeze(-1)], -1) + + is_end = seq_table[divm][:, t-divm] == self.end_idx + assert seq_table[divm].shape[-1] == t - divm + 1 + if t == max_length + divm - 1: + is_end.fill_(1) + for beam_idx in range(bdash): + if is_end[beam_idx]: + final_beam = { + "seq": seq_table[divm][beam_idx].clone(), + "score": logprob_table[divm][beam_idx].item() + } + final_beam["score"] = final_beam["score"] / (t - divm + 1) + done_beams_table[divm].append(final_beam) + logprob_table[divm][is_end] -= 1000 + self.dbs_process_step(output_i, output_t) + done_beams_table = [sorted(done_beams_table[divm], key=lambda x: -x["score"])[:bdash] for divm in range(group_size)] + if group_nbest: + done_beams = sum(done_beams_table, []) + else: + done_beams = [group_beam[0] for group_beam in done_beams_table] + for _, done_beam in enumerate(done_beams): + output["seq"][i, _, :len(done_beam["seq"])] = done_beam["seq"] + + return output + + def prepare_dbs_decoder_input(self, input_dict, output_i): + raise NotImplementedError + + def dbs_process_step(self, output_i, output_t): + pass + + +class CaptionSequenceModel(nn.Module): + + def __init__(self, model, seq_output_size): + super().__init__() + self.model = model + if model.decoder.d_model != seq_output_size: + self.output_transform = nn.Linear(model.decoder.d_model, seq_output_size) + else: + 
self.output_transform = lambda x: x + + def forward(self, input_dict): + output = self.model(input_dict) + + if input_dict["mode"] == "train": + lens = input_dict["cap_len"] - 1 + # seq_outputs: [N, d_model] + elif input_dict["mode"] == "inference": + if "sample_method" in input_dict and input_dict["sample_method"] == "beam": + return output + seq = output["seq"] + lens = torch.where(seq == self.model.end_idx, torch.zeros_like(seq), torch.ones_like(seq)).sum(dim=1) + else: + raise Exception("mode should be either 'train' or 'inference'") + seq_output = mean_with_lens(output["embed"], lens) + seq_output = self.output_transform(seq_output) + output["seq_output"] = seq_output + return output + diff --git a/audio_to_text/captioning/models/decoder.py b/audio_to_text/captioning/models/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..869eac11349f2321993e84be148aaa651892607f --- /dev/null +++ b/audio_to_text/captioning/models/decoder.py @@ -0,0 +1,746 @@ +# -*- coding: utf-8 -*- + +import math +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +from .utils import generate_length_mask, init, PositionalEncoding + + +class BaseDecoder(nn.Module): + """ + Take word/audio embeddings and output the next word probs + Base decoder, cannot be called directly + All decoders should inherit from this class + """ + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, + attn_emb_dim, dropout=0.2): + super().__init__() + self.emb_dim = emb_dim + self.vocab_size = vocab_size + self.fc_emb_dim = fc_emb_dim + self.attn_emb_dim = attn_emb_dim + self.word_embedding = nn.Embedding(vocab_size, emb_dim) + self.in_dropout = nn.Dropout(dropout) + + def forward(self, x): + raise NotImplementedError + + def load_word_embedding(self, weight, freeze=True): + embedding = np.load(weight) + assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch" + assert embedding.shape[1] == self.emb_dim, "embed size mismatch" + + # embeddings = torch.as_tensor(embeddings).float() + # self.word_embeddings.weight = nn.Parameter(embeddings) + # for para in self.word_embeddings.parameters(): + # para.requires_grad = tune + self.word_embedding = nn.Embedding.from_pretrained(embedding, + freeze=freeze) + + +class RnnDecoder(BaseDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout,) + self.d_model = d_model + self.num_layers = kwargs.get('num_layers', 1) + self.bidirectional = kwargs.get('bidirectional', False) + self.rnn_type = kwargs.get('rnn_type', "GRU") + self.classifier = nn.Linear( + self.d_model * (self.bidirectional + 1), vocab_size) + + def forward(self, x): + raise NotImplementedError + + def init_hidden(self, bs, device): + num_dire = self.bidirectional + 1 + n_layer = self.num_layers + hid_dim = self.d_model + if self.rnn_type == "LSTM": + return (torch.zeros(num_dire * n_layer, bs, hid_dim).to(device), + torch.zeros(num_dire * n_layer, bs, hid_dim).to(device)) + else: + return torch.zeros(num_dire * n_layer, bs, hid_dim).to(device) + + +class RnnFcDecoder(RnnDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, d_model, **kwargs): + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, d_model, **kwargs) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim * 2, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + 
bidirectional=self.bidirectional) + self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim) + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) + fc_emb = input_dict["fc_emb"] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + + p_fc_emb = self.fc_proj(fc_emb) + # embed: [N, T, embed_size] + embed = torch.cat((embed, p_fc_emb), dim=-1) + + out, state = self.model(embed, state) + # out: [N, T, hs], states: [num_layers * num_dire, N, hs] + logits = self.classifier(out) + output = { + "state": state, + "embeds": out, + "logits": logits + } + + return output + + +class Seq2SeqAttention(nn.Module): + + def __init__(self, hs_enc, hs_dec, attn_size): + """ + Args: + hs_enc: encoder hidden size + hs_dec: decoder hidden size + attn_size: attention vector size + """ + super(Seq2SeqAttention, self).__init__() + self.h2attn = nn.Linear(hs_enc + hs_dec, attn_size) + self.v = nn.Parameter(torch.randn(attn_size)) + self.apply(init) + + def forward(self, h_dec, h_enc, src_lens): + """ + Args: + h_dec: decoder hidden (query), [N, hs_dec] + h_enc: encoder memory (key/value), [N, src_max_len, hs_enc] + src_lens: source (encoder memory) lengths, [N, ] + """ + N = h_enc.size(0) + src_max_len = h_enc.size(1) + h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec] + + attn_input = torch.cat((h_dec, h_enc), dim=-1) + attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size] + + v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size] + score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len] + + idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len) + mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device) + + score = score.masked_fill(mask == 0, -1e10) + weights = torch.softmax(score, dim=-1) # [N, src_max_len] + ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc] + + return ctx, weights + + +class AttentionProj(nn.Module): + + def __init__(self, hs_enc, hs_dec, embed_dim, attn_size): + self.q_proj = nn.Linear(hs_dec, embed_dim) + self.kv_proj = nn.Linear(hs_enc, embed_dim) + self.h2attn = nn.Linear(embed_dim * 2, attn_size) + self.v = nn.Parameter(torch.randn(attn_size)) + self.apply(init) + + def init(self, m): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, h_dec, h_enc, src_lens): + """ + Args: + h_dec: decoder hidden (query), [N, hs_dec] + h_enc: encoder memory (key/value), [N, src_max_len, hs_enc] + src_lens: source (encoder memory) lengths, [N, ] + """ + h_enc = self.kv_proj(h_enc) # [N, src_max_len, embed_dim] + h_dec = self.q_proj(h_dec) # [N, embed_dim] + N = h_enc.size(0) + src_max_len = h_enc.size(1) + h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec] + + attn_input = torch.cat((h_dec, h_enc), dim=-1) + attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size] + + v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size] + score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len] + + idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len) + mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device) + + score = score.masked_fill(mask == 0, -1e10) + weights = torch.softmax(score, dim=-1) # [N, src_max_len] + ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc] + + return ctx, weights + + +class BahAttnDecoder(RnnDecoder): + + 
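+    # Editor's note: RNN decoder with Bahdanau-style (additive) attention; per the docstring below,
+    # the word embedding, attended context and projected fc embedding are concatenated as the RNN input.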
def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + """ + concatenate fc, attn, word to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim * 3, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.attn_emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim) + self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim) + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, attn_emb, attn_emb_len) + + p_fc_emb = self.fc_proj(fc_emb) + p_ctx = self.ctx_proj(c) + rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), p_fc_emb.unsqueeze(1)), + dim=-1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class BahAttnDecoder2(RnnDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + """ + add fc, attn, word together to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim) + self.attn_proj = nn.Linear(self.attn_emb_dim, self.emb_dim) + self.apply(partial(init, method="xavier")) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + p_attn_emb = self.attn_proj(attn_emb) + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, p_attn_emb, attn_emb_len) + + p_fc_emb = self.fc_proj(fc_emb) + rnn_input = embed + c.unsqueeze(1) + p_fc_emb.unsqueeze(1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class ConditionalBahAttnDecoder(RnnDecoder): + + def 
__init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + """ + concatenate fc, attn, word to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim * 3, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.attn_emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim) + self.condition_embedding = nn.Embedding(2, emb_dim) + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + condition = input_dict["condition"] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + + condition = torch.as_tensor([[1 - c, c] for c in condition]).to(fc_emb.device) + condition_emb = torch.matmul(condition, self.condition_embedding.weight) + # condition_embs: [N, emb_dim] + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, attn_emb, attn_emb_len) + + p_ctx = self.ctx_proj(c) + rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), condition_emb.unsqueeze(1)), + dim=-1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class StructBahAttnDecoder(RnnDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, struct_vocab_size, + attn_emb_dim, dropout, d_model, **kwargs): + """ + concatenate fc, attn, word to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim * 3, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.attn_emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim) + self.struct_embedding = nn.Embedding(struct_vocab_size, emb_dim) + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + structure = input_dict["structure"] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + + struct_emb = self.struct_embedding(structure) + # struct_embs: [N, emb_dim] + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, attn_emb, attn_emb_len) + + p_ctx = self.ctx_proj(c) + rnn_input = torch.cat((embed, 
p_ctx.unsqueeze(1), struct_emb.unsqueeze(1)), dim=-1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class StyleBahAttnDecoder(RnnDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + """ + concatenate fc, attn, word to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim * 3, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.attn_emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim) + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + style = input_dict["style"] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, attn_emb, attn_emb_len) + + p_ctx = self.ctx_proj(c) + rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), style.unsqueeze(1)), + dim=-1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class BahAttnDecoder3(RnnDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + """ + concatenate fc, attn, word to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim + attn_emb_dim, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.attn_emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.ctx_proj = lambda x: x + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + + if word.size(-1) == self.fc_emb_dim: # fc_emb + embed = word.unsqueeze(1) + elif word.size(-1) == 1: # word + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + else: + raise Exception(f"problem with word input size {word.size()}") + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, attn_emb, attn_emb_len) + + p_ctx = self.ctx_proj(c) + rnn_input = torch.cat((embed, p_ctx.unsqueeze(1)), 
dim=-1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class SpecificityBahAttnDecoder(RnnDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs): + """ + concatenate fc, attn, word to feed to the rnn + """ + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, d_model, **kwargs) + attn_size = kwargs.get("attn_size", self.d_model) + self.model = getattr(nn, self.rnn_type)( + input_size=self.emb_dim + attn_emb_dim + 1, + hidden_size=self.d_model, + batch_first=True, + num_layers=self.num_layers, + bidirectional=self.bidirectional) + self.attn = Seq2SeqAttention(self.attn_emb_dim, + self.d_model * (self.bidirectional + 1) * \ + self.num_layers, + attn_size) + self.ctx_proj = lambda x: x + self.apply(init) + + def forward(self, input_dict): + word = input_dict["word"] + state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model] + fc_emb = input_dict["fc_emb"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + condition = input_dict["condition"] # [N,] + + word = word.to(fc_emb.device) + embed = self.in_dropout(self.word_embedding(word)) + + # embed: [N, 1, embed_size] + if state is None: + state = self.init_hidden(word.size(0), fc_emb.device) + if self.rnn_type == "LSTM": + query = state[0].transpose(0, 1).flatten(1) + else: + query = state.transpose(0, 1).flatten(1) + c, attn_weight = self.attn(query, attn_emb, attn_emb_len) + + p_ctx = self.ctx_proj(c) + rnn_input = torch.cat( + (embed, p_ctx.unsqueeze(1), condition.reshape(-1, 1, 1)), + dim=-1) + + out, state = self.model(rnn_input, state) + + output = { + "state": state, + "embed": out, + "logit": self.classifier(out), + "attn_weight": attn_weight + } + return output + + +class TransformerDecoder(BaseDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, dropout, **kwargs): + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout=dropout,) + self.d_model = emb_dim + self.nhead = kwargs.get("nhead", self.d_model // 64) + self.nlayers = kwargs.get("nlayers", 2) + self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4) + + self.pos_encoder = PositionalEncoding(self.d_model, dropout) + layer = nn.TransformerDecoderLayer(d_model=self.d_model, + nhead=self.nhead, + dim_feedforward=self.dim_feedforward, + dropout=dropout) + self.model = nn.TransformerDecoder(layer, self.nlayers) + self.classifier = nn.Linear(self.d_model, vocab_size) + self.attn_proj = nn.Sequential( + nn.Linear(self.attn_emb_dim, self.d_model), + nn.ReLU(), + nn.Dropout(dropout), + nn.LayerNorm(self.d_model) + ) + # self.attn_proj = lambda x: x + self.init_params() + + def init_params(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def generate_square_subsequent_mask(self, max_length): + mask = (torch.triu(torch.ones(max_length, max_length)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def forward(self, input_dict): + word = input_dict["word"] + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + cap_padding_mask = input_dict["cap_padding_mask"] + + p_attn_emb = self.attn_proj(attn_emb) + p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim] + word = word.to(attn_emb.device) + embed = 
self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim] + embed = embed.transpose(0, 1) # [T, N, emb_dim] + embed = self.pos_encoder(embed) + + tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device) + memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device) + output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask, + tgt_key_padding_mask=cap_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + output = output.transpose(0, 1) + output = { + "embed": output, + "logit": self.classifier(output), + } + return output + + + + +class EventTransformerDecoder(TransformerDecoder): + + def forward(self, input_dict): + word = input_dict["word"] # index of word embeddings + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + cap_padding_mask = input_dict["cap_padding_mask"] + event_emb = input_dict["event"] # [N, emb_dim] + + p_attn_emb = self.attn_proj(attn_emb) + p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim] + word = word.to(attn_emb.device) + embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim] + + embed = embed.transpose(0, 1) # [T, N, emb_dim] + embed += event_emb + embed = self.pos_encoder(embed) + + tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device) + memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device) + output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask, + tgt_key_padding_mask=cap_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + output = output.transpose(0, 1) + output = { + "embed": output, + "logit": self.classifier(output), + } + return output + + +class KeywordProbTransformerDecoder(TransformerDecoder): + + def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, keyword_classes_num, **kwargs): + super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, + dropout, **kwargs) + self.keyword_proj = nn.Linear(keyword_classes_num, self.d_model) + self.word_keyword_norm = nn.LayerNorm(self.d_model) + + def forward(self, input_dict): + word = input_dict["word"] # index of word embeddings + attn_emb = input_dict["attn_emb"] + attn_emb_len = input_dict["attn_emb_len"] + cap_padding_mask = input_dict["cap_padding_mask"] + keyword = input_dict["keyword"] # [N, keyword_classes_num] + + p_attn_emb = self.attn_proj(attn_emb) + p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim] + word = word.to(attn_emb.device) + embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim] + + embed = embed.transpose(0, 1) # [T, N, emb_dim] + embed += self.keyword_proj(keyword) + embed = self.word_keyword_norm(embed) + + embed = self.pos_encoder(embed) + + tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device) + memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device) + output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask, + tgt_key_padding_mask=cap_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + output = output.transpose(0, 1) + output = { + "embed": output, + "logit": self.classifier(output), + } + return output diff --git a/audio_to_text/captioning/models/encoder.py b/audio_to_text/captioning/models/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..0d6d8e87e0ed07abc04f6e79b0fa08cd102398a0 --- /dev/null +++ 
b/audio_to_text/captioning/models/encoder.py @@ -0,0 +1,686 @@ +# -*- coding: utf-8 -*- + +import math +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchaudio import transforms +from torchlibrosa.augmentation import SpecAugmentation + +from .utils import mean_with_lens, max_with_lens, \ + init, pack_wrapper, generate_length_mask, PositionalEncoding + + +def init_layer(layer): + """Initialize a Linear or Convolutional layer. """ + nn.init.xavier_uniform_(layer.weight) + + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) + + +def init_bn(bn): + """Initialize a Batchnorm layer. """ + bn.bias.data.fill_(0.) + bn.weight.data.fill_(1.) + + +class BaseEncoder(nn.Module): + + """ + Encode the given audio into embedding + Base encoder class, cannot be called directly + All encoders should inherit from this class + """ + + def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim): + super(BaseEncoder, self).__init__() + self.spec_dim = spec_dim + self.fc_feat_dim = fc_feat_dim + self.attn_feat_dim = attn_feat_dim + + + def forward(self, x): + ######################### + # an encoder first encodes audio feature into embedding, obtaining + # `encoded`: { + # fc_embs: [N, fc_emb_dim], + # attn_embs: [N, attn_max_len, attn_emb_dim], + # attn_emb_lens: [N,] + # } + ######################### + raise NotImplementedError + + +class Block2D(nn.Module): + + def __init__(self, cin, cout, kernel_size=3, padding=1): + super().__init__() + self.block = nn.Sequential( + nn.BatchNorm2d(cin), + nn.Conv2d(cin, + cout, + kernel_size=kernel_size, + padding=padding, + bias=False), + nn.LeakyReLU(inplace=True, negative_slope=0.1)) + + def forward(self, x): + return self.block(x) + + +class LinearSoftPool(nn.Module): + """LinearSoftPool + Linear softmax, takes logits and returns a probability, near to the actual maximum value. 
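+    Concretely, for frame-level probabilities y_t the pooled clip-level score is
+    sum_t(y_t^2) / sum_t(y_t), computed along the pooling (time) dimension.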
+ Taken from the paper: + A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling + https://arxiv.org/abs/1810.09050 + """ + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, time_decision): + return (time_decision**2).sum(self.pooldim) / time_decision.sum( + self.pooldim) + + +class MeanPool(nn.Module): + + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, decision): + return torch.mean(decision, dim=self.pooldim) + + +class AttentionPool(nn.Module): + """docstring for AttentionPool""" + def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): + super().__init__() + self.inputdim = inputdim + self.outputdim = outputdim + self.pooldim = pooldim + self.transform = nn.Linear(inputdim, outputdim) + self.activ = nn.Softmax(dim=self.pooldim) + self.eps = 1e-7 + + def forward(self, logits, decision): + # Input is (B, T, D) + # B, T, D + w = self.activ(torch.clamp(self.transform(logits), -15, 15)) + detect = (decision * w).sum( + self.pooldim) / (w.sum(self.pooldim) + self.eps) + # B, T, D + return detect + + +class MMPool(nn.Module): + + def __init__(self, dims): + super().__init__() + self.avgpool = nn.AvgPool2d(dims) + self.maxpool = nn.MaxPool2d(dims) + + def forward(self, x): + return self.avgpool(x) + self.maxpool(x) + + +def parse_poolingfunction(poolingfunction_name='mean', **kwargs): + """parse_poolingfunction + A heler function to parse any temporal pooling + Pooling is done on dimension 1 + :param poolingfunction_name: + :param **kwargs: + """ + poolingfunction_name = poolingfunction_name.lower() + if poolingfunction_name == 'mean': + return MeanPool(pooldim=1) + elif poolingfunction_name == 'linear': + return LinearSoftPool(pooldim=1) + elif poolingfunction_name == 'attention': + return AttentionPool(inputdim=kwargs['inputdim'], + outputdim=kwargs['outputdim']) + + +def embedding_pooling(x, lens, pooling="mean"): + if pooling == "max": + fc_embs = max_with_lens(x, lens) + elif pooling == "mean": + fc_embs = mean_with_lens(x, lens) + elif pooling == "mean+max": + x_mean = mean_with_lens(x, lens) + x_max = max_with_lens(x, lens) + fc_embs = x_mean + x_max + elif pooling == "last": + indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1)) + # indices: [N, 1, hidden] + fc_embs = torch.gather(x, 1, indices).squeeze(1) + else: + raise Exception(f"pooling method {pooling} not support") + return fc_embs + + +class Cdur5Encoder(BaseEncoder): + + def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"): + super().__init__(spec_dim, fc_feat_dim, attn_feat_dim) + self.pooling = pooling + self.features = nn.Sequential( + Block2D(1, 32), + nn.LPPool2d(4, (2, 4)), + Block2D(32, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 4)), + Block2D(128, 128), + Block2D(128, 128), + nn.LPPool2d(4, (1, 4)), + nn.Dropout(0.3), + ) + with torch.no_grad(): + rnn_input_dim = self.features( + torch.randn(1, 1, 500, spec_dim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + + self.gru = nn.GRU(rnn_input_dim, + 128, + bidirectional=True, + batch_first=True) + self.apply(init) + + def forward(self, input_dict): + x = input_dict["spec"] + lens = input_dict["spec_len"] + if "upsample" not in input_dict: + input_dict["upsample"] = False + lens = torch.as_tensor(copy.deepcopy(lens)) + N, T, _ = x.shape + x = x.unsqueeze(1) + x = self.features(x) + x = x.transpose(1, 2).contiguous().flatten(-2) + x, _ = 
self.gru(x) + if input_dict["upsample"]: + x = nn.functional.interpolate( + x.transpose(1, 2), + T, + mode='linear', + align_corners=False).transpose(1, 2) + else: + lens //= 4 + attn_emb = x + fc_emb = embedding_pooling(x, lens, self.pooling) + return { + "attn_emb": attn_emb, + "fc_emb": fc_emb, + "attn_emb_len": lens + } + + +def conv_conv_block(in_channel, out_channel): + return nn.Sequential( + nn.Conv2d(in_channel, + out_channel, + kernel_size=3, + bias=False, + padding=1), + nn.BatchNorm2d(out_channel), + nn.ReLU(True), + nn.Conv2d(out_channel, + out_channel, + kernel_size=3, + bias=False, + padding=1), + nn.BatchNorm2d(out_channel), + nn.ReLU(True) + ) + + +class Cdur8Encoder(BaseEncoder): + + def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"): + super().__init__(spec_dim, fc_feat_dim, attn_feat_dim) + self.pooling = pooling + self.features = nn.Sequential( + conv_conv_block(1, 64), + MMPool((2, 2)), + nn.Dropout(0.2, True), + conv_conv_block(64, 128), + MMPool((2, 2)), + nn.Dropout(0.2, True), + conv_conv_block(128, 256), + MMPool((1, 2)), + nn.Dropout(0.2, True), + conv_conv_block(256, 512), + MMPool((1, 2)), + nn.Dropout(0.2, True), + nn.AdaptiveAvgPool2d((None, 1)), + ) + self.init_bn = nn.BatchNorm2d(spec_dim) + self.embedding = nn.Linear(512, 512) + self.gru = nn.GRU(512, 256, bidirectional=True, batch_first=True) + self.apply(init) + + def forward(self, input_dict): + x = input_dict["spec"] + lens = input_dict["spec_len"] + lens = torch.as_tensor(copy.deepcopy(lens)) + x = x.unsqueeze(1) # B x 1 x T x D + x = x.transpose(1, 3) + x = self.init_bn(x) + x = x.transpose(1, 3) + x = self.features(x) + x = x.transpose(1, 2).contiguous().flatten(-2) + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu_(self.embedding(x)) + x, _ = self.gru(x) + attn_emb = x + lens //= 4 + fc_emb = embedding_pooling(x, lens, self.pooling) + return { + "attn_emb": attn_emb, + "fc_emb": fc_emb, + "attn_emb_len": lens + } + + +class Cnn10Encoder(BaseEncoder): + + def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim): + super().__init__(spec_dim, fc_feat_dim, attn_feat_dim) + self.features = nn.Sequential( + conv_conv_block(1, 64), + nn.AvgPool2d((2, 2)), + nn.Dropout(0.2, True), + conv_conv_block(64, 128), + nn.AvgPool2d((2, 2)), + nn.Dropout(0.2, True), + conv_conv_block(128, 256), + nn.AvgPool2d((2, 2)), + nn.Dropout(0.2, True), + conv_conv_block(256, 512), + nn.AvgPool2d((2, 2)), + nn.Dropout(0.2, True), + nn.AdaptiveAvgPool2d((None, 1)), + ) + self.init_bn = nn.BatchNorm2d(spec_dim) + self.embedding = nn.Linear(512, 512) + self.apply(init) + + def forward(self, input_dict): + x = input_dict["spec"] + lens = input_dict["spec_len"] + lens = torch.as_tensor(copy.deepcopy(lens)) + x = x.unsqueeze(1) # [N, 1, T, D] + x = x.transpose(1, 3) + x = self.init_bn(x) + x = x.transpose(1, 3) + x = self.features(x) # [N, 512, T/16, 1] + x = x.transpose(1, 2).contiguous().flatten(-2) # [N, T/16, 512] + attn_emb = x + lens //= 16 + fc_emb = embedding_pooling(x, lens, "mean+max") + fc_emb = F.dropout(fc_emb, p=0.5, training=self.training) + fc_emb = self.embedding(fc_emb) + fc_emb = F.relu_(fc_emb) + return { + "attn_emb": attn_emb, + "fc_emb": fc_emb, + "attn_emb_len": lens + } + + +class ConvBlock(nn.Module): + def __init__(self, in_channels, out_channels): + + super(ConvBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), stride=(1, 1), + padding=(1, 1), bias=False) + + self.conv2 = 
nn.Conv2d(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), stride=(1, 1), + padding=(1, 1), bias=False) + + self.bn1 = nn.BatchNorm2d(out_channels) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.init_weight() + + def init_weight(self): + init_layer(self.conv1) + init_layer(self.conv2) + init_bn(self.bn1) + init_bn(self.bn2) + + + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + + x = input + x = F.relu_(self.bn1(self.conv1(x))) + x = F.relu_(self.bn2(self.conv2(x))) + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x1 = F.avg_pool2d(x, kernel_size=pool_size) + x2 = F.max_pool2d(x, kernel_size=pool_size) + x = x1 + x2 + else: + raise Exception('Incorrect argument!') + + return x + + +class Cnn14Encoder(nn.Module): + def __init__(self, sample_rate=32000): + super().__init__() + sr_to_fmax = { + 32000: 14000, + 16000: 8000 + } + # Logmel spectrogram extractor + self.melspec_extractor = transforms.MelSpectrogram( + sample_rate=sample_rate, + n_fft=32 * sample_rate // 1000, + win_length=32 * sample_rate // 1000, + hop_length=10 * sample_rate // 1000, + f_min=50, + f_max=sr_to_fmax[sample_rate], + n_mels=64, + norm="slaney", + mel_scale="slaney" + ) + self.hop_length = 10 * sample_rate // 1000 + self.db_transform = transforms.AmplitudeToDB() + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, + time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) + self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) + + self.downsample_ratio = 32 + + self.fc1 = nn.Linear(2048, 2048, bias=True) + + self.init_weight() + + def init_weight(self): + init_bn(self.bn0) + init_layer(self.fc1) + + def load_pretrained(self, pretrained): + checkpoint = torch.load(pretrained, map_location="cpu") + + if "model" in checkpoint: + state_keys = checkpoint["model"].keys() + backbone = False + for key in state_keys: + if key.startswith("backbone."): + backbone = True + break + + if backbone: # COLA + state_dict = {} + for key, value in checkpoint["model"].items(): + if key.startswith("backbone."): + model_key = key.replace("backbone.", "") + state_dict[model_key] = value + else: # PANNs + state_dict = checkpoint["model"] + elif "state_dict" in checkpoint: # CLAP + state_dict = checkpoint["state_dict"] + state_dict_keys = list(filter( + lambda x: "audio_encoder" in x, state_dict.keys())) + state_dict = { + key.replace('audio_encoder.', ''): state_dict[key] + for key in state_dict_keys + } + else: + raise Exception("Unkown checkpoint format") + + model_dict = self.state_dict() + pretrained_dict = { + k: v for k, v in state_dict.items() if (k in model_dict) and ( + model_dict[k].shape == v.shape) + } + model_dict.update(pretrained_dict) + self.load_state_dict(model_dict, strict=True) + + def forward(self, input_dict): + """ + Input: (batch_size, n_samples)""" + waveform = input_dict["wav"] + wave_length = input_dict["wav_len"] + specaug = input_dict["specaug"] + x = self.melspec_extractor(waveform) + x = self.db_transform(x) # (batch_size, mel_bins, time_steps) + 
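+ # layout note: the dB-scaled mel spectrogram is rearranged to
+ # (batch_size, 1, time_steps, mel_bins) below, and bn0 (a BatchNorm2d over the
+ # 64 mel bins) normalizes each frequency bin before the conv blocks.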
x = x.transpose(1, 2) + x = x.unsqueeze(1) # (batch_size, 1, time_steps, mel_bins) + + # SpecAugment + if self.training and specaug: + x = self.spec_augmenter(x) + + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = torch.mean(x, dim=3) + attn_emb = x.transpose(1, 2) + + wave_length = torch.as_tensor(wave_length) + feat_length = torch.div(wave_length, self.hop_length, + rounding_mode="floor") + 1 + feat_length = torch.div(feat_length, self.downsample_ratio, + rounding_mode="floor") + x_max = max_with_lens(attn_emb, feat_length) + x_mean = mean_with_lens(attn_emb, feat_length) + x = x_max + x_mean + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu_(self.fc1(x)) + fc_emb = F.dropout(x, p=0.5, training=self.training) + + output_dict = { + 'fc_emb': fc_emb, + 'attn_emb': attn_emb, + 'attn_emb_len': feat_length + } + + return output_dict + + +class RnnEncoder(BaseEncoder): + + def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, + pooling="mean", **kwargs): + super().__init__(spec_dim, fc_feat_dim, attn_feat_dim) + self.pooling = pooling + self.hidden_size = kwargs.get('hidden_size', 512) + self.bidirectional = kwargs.get('bidirectional', False) + self.num_layers = kwargs.get('num_layers', 1) + self.dropout = kwargs.get('dropout', 0.2) + self.rnn_type = kwargs.get('rnn_type', "GRU") + self.in_bn = kwargs.get('in_bn', False) + self.embed_dim = self.hidden_size * (self.bidirectional + 1) + self.network = getattr(nn, self.rnn_type)( + attn_feat_dim, + self.hidden_size, + num_layers=self.num_layers, + bidirectional=self.bidirectional, + dropout=self.dropout, + batch_first=True) + if self.in_bn: + self.bn = nn.BatchNorm1d(self.embed_dim) + self.apply(init) + + def forward(self, input_dict): + x = input_dict["attn"] + lens = input_dict["attn_len"] + lens = torch.as_tensor(lens) + # x: [N, T, E] + if self.in_bn: + x = pack_wrapper(self.bn, x, lens) + out = pack_wrapper(self.network, x, lens) + # out: [N, T, hidden] + attn_emb = out + fc_emb = embedding_pooling(out, lens, self.pooling) + return { + "attn_emb": attn_emb, + "fc_emb": fc_emb, + "attn_emb_len": lens + } + + +class Cnn14RnnEncoder(nn.Module): + def __init__(self, sample_rate=32000, pretrained=None, + freeze_cnn=False, freeze_cnn_bn=False, + pooling="mean", **kwargs): + super().__init__() + self.cnn = Cnn14Encoder(sample_rate) + self.rnn = RnnEncoder(64, 2048, 2048, pooling, **kwargs) + if pretrained is not None: + self.cnn.load_pretrained(pretrained) + if freeze_cnn: + assert pretrained is not None, "cnn is not pretrained but frozen" + for param in self.cnn.parameters(): + param.requires_grad = False + self.freeze_cnn_bn = freeze_cnn_bn + + def train(self, mode): + super().train(mode=mode) + if self.freeze_cnn_bn: + def bn_eval(module): + class_name = module.__class__.__name__ + if class_name.find("BatchNorm") != -1: + module.eval() + 
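+ # with freeze_cnn_bn enabled, the pretrained CNN's BatchNorm layers stay in
+ # eval mode during training so their running statistics are not updated.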
self.cnn.apply(bn_eval) + return self + + def forward(self, input_dict): + output_dict = self.cnn(input_dict) + output_dict["attn"] = output_dict["attn_emb"] + output_dict["attn_len"] = output_dict["attn_emb_len"] + del output_dict["attn_emb"], output_dict["attn_emb_len"] + output_dict = self.rnn(output_dict) + return output_dict + + +class TransformerEncoder(BaseEncoder): + + def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, d_model, **kwargs): + super().__init__(spec_dim, fc_feat_dim, attn_feat_dim) + self.d_model = d_model + dropout = kwargs.get("dropout", 0.2) + self.nhead = kwargs.get("nhead", self.d_model // 64) + self.nlayers = kwargs.get("nlayers", 2) + self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4) + + self.attn_proj = nn.Sequential( + nn.Linear(attn_feat_dim, self.d_model), + nn.ReLU(), + nn.Dropout(dropout), + nn.LayerNorm(self.d_model) + ) + layer = nn.TransformerEncoderLayer(d_model=self.d_model, + nhead=self.nhead, + dim_feedforward=self.dim_feedforward, + dropout=dropout) + self.model = nn.TransformerEncoder(layer, self.nlayers) + self.cls_token = nn.Parameter(torch.zeros(d_model)) + self.init_params() + + def init_params(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, input_dict): + attn_feat = input_dict["attn"] + attn_feat_len = input_dict["attn_len"] + attn_feat_len = torch.as_tensor(attn_feat_len) + + attn_feat = self.attn_proj(attn_feat) # [bs, T, d_model] + + cls_emb = self.cls_token.reshape(1, 1, self.d_model).repeat( + attn_feat.size(0), 1, 1) + attn_feat = torch.cat((cls_emb, attn_feat), dim=1) + attn_feat = attn_feat.transpose(0, 1) + + attn_feat_len += 1 + src_key_padding_mask = ~generate_length_mask( + attn_feat_len, attn_feat.size(0)).to(attn_feat.device) + output = self.model(attn_feat, src_key_padding_mask=src_key_padding_mask) + + attn_emb = output.transpose(0, 1) + fc_emb = attn_emb[:, 0] + return { + "attn_emb": attn_emb, + "fc_emb": fc_emb, + "attn_emb_len": attn_feat_len + } + + +class Cnn14TransformerEncoder(nn.Module): + def __init__(self, sample_rate=32000, pretrained=None, + freeze_cnn=False, freeze_cnn_bn=False, + d_model="mean", **kwargs): + super().__init__() + self.cnn = Cnn14Encoder(sample_rate) + self.trm = TransformerEncoder(64, 2048, 2048, d_model, **kwargs) + if pretrained is not None: + self.cnn.load_pretrained(pretrained) + if freeze_cnn: + assert pretrained is not None, "cnn is not pretrained but frozen" + for param in self.cnn.parameters(): + param.requires_grad = False + self.freeze_cnn_bn = freeze_cnn_bn + + def train(self, mode): + super().train(mode=mode) + if self.freeze_cnn_bn: + def bn_eval(module): + class_name = module.__class__.__name__ + if class_name.find("BatchNorm") != -1: + module.eval() + self.cnn.apply(bn_eval) + return self + + def forward(self, input_dict): + output_dict = self.cnn(input_dict) + output_dict["attn"] = output_dict["attn_emb"] + output_dict["attn_len"] = output_dict["attn_emb_len"] + del output_dict["attn_emb"], output_dict["attn_emb_len"] + output_dict = self.trm(output_dict) + return output_dict + + + + + diff --git a/audio_to_text/captioning/models/transformer_model.py b/audio_to_text/captioning/models/transformer_model.py new file mode 100644 index 0000000000000000000000000000000000000000..76c97f171955f04b10c16fd1f1a205ce7343a0ac --- /dev/null +++ b/audio_to_text/captioning/models/transformer_model.py @@ -0,0 +1,265 @@ +# -*- coding: utf-8 -*- +import random +import torch +import torch.nn as nn + +from .base_model 
import CaptionModel +from .utils import repeat_tensor +import audio_to_text.captioning.models.decoder + + +class TransformerModel(CaptionModel): + + def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs): + if not hasattr(self, "compatible_decoders"): + self.compatible_decoders = ( + audio_to_text.captioning.models.decoder.TransformerDecoder, + ) + super().__init__(encoder, decoder, **kwargs) + + def seq_forward(self, input_dict): + cap = input_dict["cap"] + cap_padding_mask = (cap == self.pad_idx).to(cap.device) + cap_padding_mask = cap_padding_mask[:, :-1] + output = self.decoder( + { + "word": cap[:, :-1], + "attn_emb": input_dict["attn_emb"], + "attn_emb_len": input_dict["attn_emb_len"], + "cap_padding_mask": cap_padding_mask + } + ) + return output + + def prepare_decoder_input(self, input_dict, output): + decoder_input = { + "attn_emb": input_dict["attn_emb"], + "attn_emb_len": input_dict["attn_emb_len"] + } + t = input_dict["t"] + + ############### + # determine input word + ################ + if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling + word = input_dict["cap"][:, :t+1] + else: + start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long() + if t == 0: + word = start_word + else: + word = torch.cat((start_word, output["seq"][:, :t]), dim=-1) + # word: [N, T] + decoder_input["word"] = word + + cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device) + decoder_input["cap_padding_mask"] = cap_padding_mask + return decoder_input + + def prepare_beamsearch_decoder_input(self, input_dict, output_i): + decoder_input = {} + t = input_dict["t"] + i = input_dict["sample_idx"] + beam_size = input_dict["beam_size"] + ############### + # prepare attn embeds + ################ + if t == 0: + attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size) + attn_emb_len = repeat_tensor(input_dict["attn_emb_len"][i], beam_size) + output_i["attn_emb"] = attn_emb + output_i["attn_emb_len"] = attn_emb_len + decoder_input["attn_emb"] = output_i["attn_emb"] + decoder_input["attn_emb_len"] = output_i["attn_emb_len"] + ############### + # determine input word + ################ + start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long() + if t == 0: + word = start_word + else: + word = torch.cat((start_word, output_i["seq"]), dim=-1) + decoder_input["word"] = word + cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device) + decoder_input["cap_padding_mask"] = cap_padding_mask + + return decoder_input + + +class M2TransformerModel(CaptionModel): + + def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs): + if not hasattr(self, "compatible_decoders"): + self.compatible_decoders = ( + captioning.models.decoder.M2TransformerDecoder, + ) + super().__init__(encoder, decoder, **kwargs) + self.check_encoder_compatibility() + + def check_encoder_compatibility(self): + assert isinstance(self.encoder, captioning.models.encoder.M2TransformerEncoder), \ + f"only M2TransformerModel is compatible with {self.__class__.__name__}" + + + def seq_forward(self, input_dict): + cap = input_dict["cap"] + output = self.decoder( + { + "word": cap[:, :-1], + "attn_emb": input_dict["attn_emb"], + "attn_emb_mask": input_dict["attn_emb_mask"], + } + ) + return output + + def prepare_decoder_input(self, input_dict, output): + decoder_input = { + "attn_emb": input_dict["attn_emb"], + "attn_emb_mask": input_dict["attn_emb_mask"] + } + t = 
input_dict["t"] + + ############### + # determine input word + ################ + if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling + word = input_dict["cap"][:, :t+1] + else: + start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long() + if t == 0: + word = start_word + else: + word = torch.cat((start_word, output["seq"][:, :t]), dim=-1) + # word: [N, T] + decoder_input["word"] = word + + return decoder_input + + def prepare_beamsearch_decoder_input(self, input_dict, output_i): + decoder_input = {} + t = input_dict["t"] + i = input_dict["sample_idx"] + beam_size = input_dict["beam_size"] + ############### + # prepare attn embeds + ################ + if t == 0: + attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size) + attn_emb_mask = repeat_tensor(input_dict["attn_emb_mask"][i], beam_size) + output_i["attn_emb"] = attn_emb + output_i["attn_emb_mask"] = attn_emb_mask + decoder_input["attn_emb"] = output_i["attn_emb"] + decoder_input["attn_emb_mask"] = output_i["attn_emb_mask"] + ############### + # determine input word + ################ + start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long() + if t == 0: + word = start_word + else: + word = torch.cat((start_word, output_i["seq"]), dim=-1) + decoder_input["word"] = word + + return decoder_input + + +class EventEncoder(nn.Module): + """ + Encode the Label information in AudioCaps and AudioSet + """ + def __init__(self, emb_dim, vocab_size=527): + super(EventEncoder, self).__init__() + self.label_embedding = nn.Parameter( + torch.randn((vocab_size, emb_dim)), requires_grad=True) + + def forward(self, word_idxs): + indices = word_idxs / word_idxs.sum(dim=1, keepdim=True) + embeddings = indices @ self.label_embedding + return embeddings + + +class EventCondTransformerModel(TransformerModel): + + def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs): + if not hasattr(self, "compatible_decoders"): + self.compatible_decoders = ( + captioning.models.decoder.EventTransformerDecoder, + ) + super().__init__(encoder, decoder, **kwargs) + self.label_encoder = EventEncoder(decoder.emb_dim, 527) + self.train_forward_keys += ["events"] + self.inference_forward_keys += ["events"] + + # def seq_forward(self, input_dict): + # cap = input_dict["cap"] + # cap_padding_mask = (cap == self.pad_idx).to(cap.device) + # cap_padding_mask = cap_padding_mask[:, :-1] + # output = self.decoder( + # { + # "word": cap[:, :-1], + # "attn_emb": input_dict["attn_emb"], + # "attn_emb_len": input_dict["attn_emb_len"], + # "cap_padding_mask": cap_padding_mask + # } + # ) + # return output + + def prepare_decoder_input(self, input_dict, output): + decoder_input = super().prepare_decoder_input(input_dict, output) + decoder_input["events"] = self.label_encoder(input_dict["events"]) + return decoder_input + + def prepare_beamsearch_decoder_input(self, input_dict, output_i): + decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i) + t = input_dict["t"] + i = input_dict["sample_idx"] + beam_size = input_dict["beam_size"] + if t == 0: + output_i["events"] = repeat_tensor(self.label_encoder(input_dict["events"])[i], beam_size) + decoder_input["events"] = output_i["events"] + return decoder_input + + +class KeywordCondTransformerModel(TransformerModel): + + def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs): + if not hasattr(self, "compatible_decoders"): + self.compatible_decoders = ( + 
captioning.models.decoder.KeywordProbTransformerDecoder, + ) + super().__init__(encoder, decoder, **kwargs) + self.train_forward_keys += ["keyword"] + self.inference_forward_keys += ["keyword"] + + def seq_forward(self, input_dict): + cap = input_dict["cap"] + cap_padding_mask = (cap == self.pad_idx).to(cap.device) + cap_padding_mask = cap_padding_mask[:, :-1] + keyword = input_dict["keyword"] + output = self.decoder( + { + "word": cap[:, :-1], + "attn_emb": input_dict["attn_emb"], + "attn_emb_len": input_dict["attn_emb_len"], + "keyword": keyword, + "cap_padding_mask": cap_padding_mask + } + ) + return output + + def prepare_decoder_input(self, input_dict, output): + decoder_input = super().prepare_decoder_input(input_dict, output) + decoder_input["keyword"] = input_dict["keyword"] + return decoder_input + + def prepare_beamsearch_decoder_input(self, input_dict, output_i): + decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i) + t = input_dict["t"] + i = input_dict["sample_idx"] + beam_size = input_dict["beam_size"] + if t == 0: + output_i["keyword"] = repeat_tensor(input_dict["keyword"][i], + beam_size) + decoder_input["keyword"] = output_i["keyword"] + return decoder_input + diff --git a/audio_to_text/captioning/models/utils.py b/audio_to_text/captioning/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3623cf43619a7a4ff5fa31f2b056378697b04d61 --- /dev/null +++ b/audio_to_text/captioning/models/utils.py @@ -0,0 +1,132 @@ +import math + +import numpy as np +import torch +import torch.nn as nn + +from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence + + +def sort_pack_padded_sequence(input, lengths): + sorted_lengths, indices = torch.sort(lengths, descending=True) + tmp = pack_padded_sequence(input[indices], sorted_lengths.cpu(), batch_first=True) + inv_ix = indices.clone() + inv_ix[indices] = torch.arange(0,len(indices)).type_as(inv_ix) + return tmp, inv_ix + +def pad_unsort_packed_sequence(input, inv_ix): + tmp, _ = pad_packed_sequence(input, batch_first=True) + tmp = tmp[inv_ix] + return tmp + +def pack_wrapper(module, attn_feats, attn_feat_lens): + packed, inv_ix = sort_pack_padded_sequence(attn_feats, attn_feat_lens) + if isinstance(module, torch.nn.RNNBase): + return pad_unsort_packed_sequence(module(packed)[0], inv_ix) + else: + return pad_unsort_packed_sequence(PackedSequence(module(packed[0]), packed[1]), inv_ix) + +def generate_length_mask(lens, max_length=None): + lens = torch.as_tensor(lens) + N = lens.size(0) + if max_length is None: + max_length = max(lens) + idxs = torch.arange(max_length).repeat(N).view(N, max_length) + idxs = idxs.to(lens.device) + mask = (idxs < lens.view(-1, 1)) + return mask + +def mean_with_lens(features, lens): + """ + features: [N, T, ...] 
(assume the second dimension represents length) + lens: [N,] + """ + lens = torch.as_tensor(lens) + if max(lens) != features.size(1): + max_length = features.size(1) + mask = generate_length_mask(lens, max_length) + else: + mask = generate_length_mask(lens) + mask = mask.to(features.device) # [N, T] + + while mask.ndim < features.ndim: + mask = mask.unsqueeze(-1) + feature_mean = features * mask + feature_mean = feature_mean.sum(1) + while lens.ndim < feature_mean.ndim: + lens = lens.unsqueeze(1) + feature_mean = feature_mean / lens.to(features.device) + # feature_mean = features * mask.unsqueeze(-1) + # feature_mean = feature_mean.sum(1) / lens.unsqueeze(1).to(features.device) + return feature_mean + +def max_with_lens(features, lens): + """ + features: [N, T, ...] (assume the second dimension represents length) + lens: [N,] + """ + lens = torch.as_tensor(lens) + mask = generate_length_mask(lens).to(features.device) # [N, T] + + feature_max = features.clone() + feature_max[~mask] = float("-inf") + feature_max, _ = feature_max.max(1) + return feature_max + +def repeat_tensor(x, n): + return x.unsqueeze(0).repeat(n, *([1] * len(x.shape))) + +def init(m, method="kaiming"): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + if method == "kaiming": + nn.init.kaiming_uniform_(m.weight) + elif method == "xavier": + nn.init.xavier_uniform_(m.weight) + else: + raise Exception(f"initialization method {method} not supported") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + if method == "kaiming": + nn.init.kaiming_uniform_(m.weight) + elif method == "xavier": + nn.init.xavier_uniform_(m.weight) + else: + raise Exception(f"initialization method {method} not supported") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Embedding): + if method == "kaiming": + nn.init.kaiming_uniform_(m.weight) + elif method == "xavier": + nn.init.xavier_uniform_(m.weight) + else: + raise Exception(f"initialization method {method} not supported") + + + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=100): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * \ + (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + # self.register_buffer("pe", pe) + self.register_parameter("pe", nn.Parameter(pe, requires_grad=False)) + + def forward(self, x): + # x: [T, N, E] + x = x + self.pe[:x.size(0), :] + return self.dropout(x) diff --git a/audio_to_text/captioning/utils/README.md b/audio_to_text/captioning/utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c6fd17d778a9f9dbe7bf632c92e40e36e67b91d2 --- /dev/null +++ b/audio_to_text/captioning/utils/README.md @@ -0,0 +1,19 @@ +# Utils + +Scripts in this directory are used as utility functions. + +## BERT Pretrained Embeddings + +You can load pretrained word embeddings in Google [BERT](https://github.com/google-research/bert#pre-trained-models) instead of training word embeddings from scratch. The scripts in `utils/bert` need a BERT server in the background. 
We use BERT server from [bert-as-service](https://github.com/hanxiao/bert-as-service). + +To use bert-as-service, you need to first install the repository. It is recommended that you create a new environment with Tensorflow 1.3 to run BERT server since it is incompatible with Tensorflow 2.x. + +After successful installation of [bert-as-service](https://github.com/hanxiao/bert-as-service), downloading and running the BERT server needs to execute: + +```bash +bash scripts/prepare_bert_server.sh zh +``` + +By default, server based on BERT base Chinese model is running in the background. You can change to other models by changing corresponding model name and path in `scripts/prepare_bert_server.sh`. + +To extract BERT word embeddings, you need to execute `utils/bert/create_word_embedding.py`. diff --git a/audio_to_text/captioning/utils/__init__.py b/audio_to_text/captioning/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc b/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be638756ffa795f33059276b99c2f8c05661cbdf Binary files /dev/null and b/audio_to_text/captioning/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc b/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4270c25cf751d703e233146358c7345c39e55ceb Binary files /dev/null and b/audio_to_text/captioning/utils/__pycache__/train_util.cpython-38.pyc differ diff --git a/audio_to_text/captioning/utils/bert/create_sent_embedding.py b/audio_to_text/captioning/utils/bert/create_sent_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..b517a32429ca74bae668291dcb03d34296027440 --- /dev/null +++ b/audio_to_text/captioning/utils/bert/create_sent_embedding.py @@ -0,0 +1,89 @@ +import pickle +import fire +import numpy as np +import pandas as pd +from tqdm import tqdm + + +class EmbeddingExtractor(object): + + def extract_sentbert(self, caption_file: str, output: str, dev: bool=True, zh: bool=False): + from sentence_transformers import SentenceTransformer + lang2model = { + "zh": "distiluse-base-multilingual-cased", + "en": "bert-base-nli-mean-tokens" + } + lang = "zh" if zh else "en" + model = SentenceTransformer(lang2model[lang]) + + self.extract(caption_file, model, output, dev) + + def extract_originbert(self, caption_file: str, output: str, dev: bool=True, ip="localhost"): + from bert_serving.client import BertClient + client = BertClient(ip) + + self.extract(caption_file, client, output, dev) + + def extract(self, caption_file: str, model, output, dev: bool): + caption_df = pd.read_json(caption_file, dtype={"key": str}) + embeddings = {} + + if dev: + with tqdm(total=caption_df.shape[0], ascii=True) as pbar: + for idx, row in caption_df.iterrows(): + caption = row["caption"] + key = row["key"] + cap_idx = row["caption_index"] + embedding = model.encode([caption]) + embedding = np.array(embedding).reshape(-1) + embeddings[f"{key}_{cap_idx}"] = embedding + pbar.update() + + else: + dump = {} + + with tqdm(total=caption_df.shape[0], ascii=True) as pbar: + for idx, row in caption_df.iterrows(): + key = row["key"] + caption = row["caption"] + value = np.array(model.encode([caption])).reshape(-1) + + if key not in 
embeddings.keys(): + embeddings[key] = [value] + else: + embeddings[key].append(value) + + pbar.update() + + for key in embeddings: + dump[key] = np.stack(embeddings[key]) + + embeddings = dump + + with open(output, "wb") as f: + pickle.dump(embeddings, f) + + def extract_sbert(self, + input_json: str, + output: str): + from sentence_transformers import SentenceTransformer + import json + import torch + from h5py import File + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = SentenceTransformer("paraphrase-MiniLM-L6-v2") + model = model.to(device) + model.eval() + + data = json.load(open(input_json))["audios"] + with torch.no_grad(), tqdm(total=len(data), ascii=True) as pbar, File(output, "w") as store: + for sample in data: + audio_id = sample["audio_id"] + for cap in sample["captions"]: + cap_id = cap["cap_id"] + store[f"{audio_id}_{cap_id}"] = model.encode(cap["caption"]) + pbar.update() + + +if __name__ == "__main__": + fire.Fire(EmbeddingExtractor) diff --git a/audio_to_text/captioning/utils/bert/create_word_embedding.py b/audio_to_text/captioning/utils/bert/create_word_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..43c980e69057dc251ddbb7ae6a19684807cc6699 --- /dev/null +++ b/audio_to_text/captioning/utils/bert/create_word_embedding.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- + +import sys +import os + +from bert_serving.client import BertClient +import numpy as np +from tqdm import tqdm +import fire +import torch + +sys.path.append(os.getcwd()) +from utils.build_vocab import Vocabulary + +def main(vocab_file: str, output: str, server_hostname: str): + client = BertClient(ip=server_hostname) + vocabulary = torch.load(vocab_file) + vocab_size = len(vocabulary) + + fake_embedding = client.encode(["test"]).reshape(-1) + embed_size = fake_embedding.shape[0] + + print("Encoding words into embeddings with size: ", embed_size) + + embeddings = np.empty((vocab_size, embed_size)) + for i in tqdm(range(len(embeddings)), ascii=True): + embeddings[i] = client.encode([vocabulary.idx2word[i]]) + np.save(output, embeddings) + + +if __name__ == '__main__': + fire.Fire(main) + + diff --git a/audio_to_text/captioning/utils/build_vocab.py b/audio_to_text/captioning/utils/build_vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..e9fab23bc2c48203e541d356dc172e1fdee8f113 --- /dev/null +++ b/audio_to_text/captioning/utils/build_vocab.py @@ -0,0 +1,153 @@ +import json +from tqdm import tqdm +import logging +import pickle +from collections import Counter +import re +import fire + + +class Vocabulary(object): + """Simple vocabulary wrapper.""" + def __init__(self): + self.word2idx = {} + self.idx2word = {} + self.idx = 0 + + def add_word(self, word): + if not word in self.word2idx: + self.word2idx[word] = self.idx + self.idx2word[self.idx] = word + self.idx += 1 + + def __call__(self, word): + if not word in self.word2idx: + return self.word2idx[""] + return self.word2idx[word] + + def __getitem__(self, word_id): + return self.idx2word[word_id] + + def __len__(self): + return len(self.word2idx) + + +def build_vocab(input_json: str, + threshold: int, + keep_punctuation: bool, + host_address: str, + character_level: bool = False, + zh: bool = True ): + """Build vocabulary from csv file with a given threshold to drop all counts < threshold + + Args: + input_json(string): Preprossessed json file. 
Structure like this: + { + 'audios': [ + { + 'audio_id': 'xxx', + 'captions': [ + { + 'caption': 'xxx', + 'cap_id': 'xxx' + } + ] + }, + ... + ] + } + threshold (int): Threshold to drop all words with counts < threshold + keep_punctuation (bool): Includes or excludes punctuation. + + Returns: + vocab (Vocab): Object with the processed vocabulary +""" + data = json.load(open(input_json, "r"))["audios"] + counter = Counter() + pretokenized = "tokens" in data[0]["captions"][0] + + if zh: + from nltk.parse.corenlp import CoreNLPParser + from zhon.hanzi import punctuation + if not pretokenized: + parser = CoreNLPParser(host_address) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + if pretokenized: + tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split() + else: + caption = data[audio_idx]["captions"][cap_idx]["caption"] + # Remove all punctuations + if not keep_punctuation: + caption = re.sub("[{}]".format(punctuation), "", caption) + if character_level: + tokens = list(caption) + else: + tokens = list(parser.tokenize(caption)) + data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens) + counter.update(tokens) + else: + if pretokenized: + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split() + counter.update(tokens) + else: + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + captions = {} + for audio_idx in range(len(data)): + audio_id = data[audio_idx]["audio_id"] + captions[audio_id] = [] + for cap_idx in range(len(data[audio_idx]["captions"])): + caption = data[audio_idx]["captions"][cap_idx]["caption"] + captions[audio_id].append({ + "audio_id": audio_id, + "id": cap_idx, + "caption": caption + }) + tokenizer = PTBTokenizer() + captions = tokenizer.tokenize(captions) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + audio_id = data[audio_idx]["audio_id"] + for cap_idx in range(len(data[audio_idx]["captions"])): + tokens = captions[audio_id][cap_idx] + data[audio_idx]["captions"][cap_idx]["tokens"] = tokens + counter.update(tokens.split(" ")) + + if not pretokenized: + json.dump({ "audios": data }, open(input_json, "w"), indent=4, ensure_ascii=not zh) + words = [word for word, cnt in counter.items() if cnt >= threshold] + + # Create a vocab wrapper and add some special tokens. + vocab = Vocabulary() + vocab.add_word("") + vocab.add_word("") + vocab.add_word("") + vocab.add_word("") + + # Add the words to the vocabulary. 
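+ # only corpus words whose count reaches the threshold are added, after the
+ # special tokens registered above.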
+ for word in words: + vocab.add_word(word) + return vocab + + +def process(input_json: str, + output_file: str, + threshold: int = 1, + keep_punctuation: bool = False, + character_level: bool = False, + host_address: str = "http://localhost:9000", + zh: bool = False): + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + logging.basicConfig(level=logging.INFO, format=logfmt) + logging.info("Build Vocab") + vocabulary = build_vocab( + input_json=input_json, threshold=threshold, keep_punctuation=keep_punctuation, + host_address=host_address, character_level=character_level, zh=zh) + pickle.dump(vocabulary, open(output_file, "wb")) + logging.info("Total vocabulary size: {}".format(len(vocabulary))) + logging.info("Saved vocab to '{}'".format(output_file)) + + +if __name__ == '__main__': + fire.Fire(process) diff --git a/audio_to_text/captioning/utils/build_vocab_ltp.py b/audio_to_text/captioning/utils/build_vocab_ltp.py new file mode 100644 index 0000000000000000000000000000000000000000..aae0c718ae546882dcb573be42ace3408394468f --- /dev/null +++ b/audio_to_text/captioning/utils/build_vocab_ltp.py @@ -0,0 +1,150 @@ +import json +from tqdm import tqdm +import logging +import pickle +from collections import Counter +import re +import fire + +class Vocabulary(object): + """Simple vocabulary wrapper.""" + def __init__(self): + self.word2idx = {} + self.idx2word = {} + self.idx = 0 + + def add_word(self, word): + if not word in self.word2idx: + self.word2idx[word] = self.idx + self.idx2word[self.idx] = word + self.idx += 1 + + def __call__(self, word): + if not word in self.word2idx: + return self.word2idx[""] + return self.word2idx[word] + + def __len__(self): + return len(self.word2idx) + +def build_vocab(input_json: str, + output_json: str, + threshold: int, + keep_punctuation: bool, + character_level: bool = False, + zh: bool = True ): + """Build vocabulary from csv file with a given threshold to drop all counts < threshold + + Args: + input_json(string): Preprossessed json file. Structure like this: + { + 'audios': [ + { + 'audio_id': 'xxx', + 'captions': [ + { + 'caption': 'xxx', + 'cap_id': 'xxx' + } + ] + }, + ... + ] + } + threshold (int): Threshold to drop all words with counts < threshold + keep_punctuation (bool): Includes or excludes punctuation. 
+ + Returns: + vocab (Vocab): Object with the processed vocabulary +""" + data = json.load(open(input_json, "r"))["audios"] + counter = Counter() + pretokenized = "tokens" in data[0]["captions"][0] + + if zh: + from ltp import LTP + from zhon.hanzi import punctuation + if not pretokenized: + parser = LTP("base") + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + if pretokenized: + tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split() + else: + caption = data[audio_idx]["captions"][cap_idx]["caption"] + if character_level: + tokens = list(caption) + else: + tokens, _ = parser.seg([caption]) + tokens = tokens[0] + # Remove all punctuations + if not keep_punctuation: + tokens = [token for token in tokens if token not in punctuation] + data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens) + counter.update(tokens) + else: + if pretokenized: + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split() + counter.update(tokens) + else: + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + captions = {} + for audio_idx in range(len(data)): + audio_id = data[audio_idx]["audio_id"] + captions[audio_id] = [] + for cap_idx in range(len(data[audio_idx]["captions"])): + caption = data[audio_idx]["captions"][cap_idx]["caption"] + captions[audio_id].append({ + "audio_id": audio_id, + "id": cap_idx, + "caption": caption + }) + tokenizer = PTBTokenizer() + captions = tokenizer.tokenize(captions) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + audio_id = data[audio_idx]["audio_id"] + for cap_idx in range(len(data[audio_idx]["captions"])): + tokens = captions[audio_id][cap_idx] + data[audio_idx]["captions"][cap_idx]["tokens"] = tokens + counter.update(tokens.split(" ")) + + if not pretokenized: + if output_json is None: + output_json = input_json + json.dump({ "audios": data }, open(output_json, "w"), indent=4, ensure_ascii=not zh) + words = [word for word, cnt in counter.items() if cnt >= threshold] + + # Create a vocab wrapper and add some special tokens. + vocab = Vocabulary() + vocab.add_word("") + vocab.add_word("") + vocab.add_word("") + vocab.add_word("") + + # Add the words to the vocabulary. 
+ for word in words: + vocab.add_word(word) + return vocab + +def process(input_json: str, + output_file: str, + output_json: str = None, + threshold: int = 1, + keep_punctuation: bool = False, + character_level: bool = False, + zh: bool = True): + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + logging.basicConfig(level=logging.INFO, format=logfmt) + logging.info("Build Vocab") + vocabulary = build_vocab( + input_json=input_json, output_json=output_json, threshold=threshold, + keep_punctuation=keep_punctuation, character_level=character_level, zh=zh) + pickle.dump(vocabulary, open(output_file, "wb")) + logging.info("Total vocabulary size: {}".format(len(vocabulary))) + logging.info("Saved vocab to '{}'".format(output_file)) + + +if __name__ == '__main__': + fire.Fire(process) diff --git a/audio_to_text/captioning/utils/build_vocab_spacy.py b/audio_to_text/captioning/utils/build_vocab_spacy.py new file mode 100644 index 0000000000000000000000000000000000000000..84da679f79d9f36b288d7312fb4ad9dc04723b0d --- /dev/null +++ b/audio_to_text/captioning/utils/build_vocab_spacy.py @@ -0,0 +1,152 @@ +import json +from tqdm import tqdm +import logging +import pickle +from collections import Counter +import re +import fire + +class Vocabulary(object): + """Simple vocabulary wrapper.""" + def __init__(self): + self.word2idx = {} + self.idx2word = {} + self.idx = 0 + + def add_word(self, word): + if not word in self.word2idx: + self.word2idx[word] = self.idx + self.idx2word[self.idx] = word + self.idx += 1 + + def __call__(self, word): + if not word in self.word2idx: + return self.word2idx[""] + return self.word2idx[word] + + def __len__(self): + return len(self.word2idx) + + +def build_vocab(input_json: str, + output_json: str, + threshold: int, + keep_punctuation: bool, + host_address: str, + character_level: bool = False, + retokenize: bool = True, + zh: bool = True ): + """Build vocabulary from csv file with a given threshold to drop all counts < threshold + + Args: + input_json(string): Preprossessed json file. Structure like this: + { + 'audios': [ + { + 'audio_id': 'xxx', + 'captions': [ + { + 'caption': 'xxx', + 'cap_id': 'xxx' + } + ] + }, + ... + ] + } + threshold (int): Threshold to drop all words with counts < threshold + keep_punctuation (bool): Includes or excludes punctuation. 
+ + Returns: + vocab (Vocab): Object with the processed vocabulary +""" + data = json.load(open(input_json, "r"))["audios"] + counter = Counter() + if retokenize: + pretokenized = False + else: + pretokenized = "tokens" in data[0]["captions"][0] + + if zh: + from nltk.parse.corenlp import CoreNLPParser + from zhon.hanzi import punctuation + if not pretokenized: + parser = CoreNLPParser(host_address) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + if pretokenized: + tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split() + else: + caption = data[audio_idx]["captions"][cap_idx]["caption"] + # Remove all punctuations + if not keep_punctuation: + caption = re.sub("[{}]".format(punctuation), "", caption) + if character_level: + tokens = list(caption) + else: + tokens = list(parser.tokenize(caption)) + data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens) + counter.update(tokens) + else: + if pretokenized: + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + tokens = data[audio_idx]["captions"][cap_idx]["tokens"].split() + counter.update(tokens) + else: + import spacy + tokenizer = spacy.load("en_core_web_sm", disable=["parser", "ner"]) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + captions = data[audio_idx]["captions"] + for cap_idx in range(len(captions)): + caption = captions[cap_idx]["caption"] + doc = tokenizer(caption) + tokens = " ".join([str(token).lower() for token in doc]) + data[audio_idx]["captions"][cap_idx]["tokens"] = tokens + counter.update(tokens.split(" ")) + + if not pretokenized: + if output_json is None: + json.dump({ "audios": data }, open(input_json, "w"), + indent=4, ensure_ascii=not zh) + else: + json.dump({ "audios": data }, open(output_json, "w"), + indent=4, ensure_ascii=not zh) + + words = [word for word, cnt in counter.items() if cnt >= threshold] + + # Create a vocab wrapper and add some special tokens. + vocab = Vocabulary() + vocab.add_word("") + vocab.add_word("") + vocab.add_word("") + vocab.add_word("") + + # Add the words to the vocabulary. 
+ for word in words: + vocab.add_word(word) + return vocab + +def process(input_json: str, + output_file: str, + output_json: str = None, + threshold: int = 1, + keep_punctuation: bool = False, + character_level: bool = False, + retokenize: bool = False, + host_address: str = "http://localhost:9000", + zh: bool = True): + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + logging.basicConfig(level=logging.INFO, format=logfmt) + logging.info("Build Vocab") + vocabulary = build_vocab( + input_json=input_json, output_json=output_json, threshold=threshold, + keep_punctuation=keep_punctuation, host_address=host_address, + character_level=character_level, retokenize=retokenize, zh=zh) + pickle.dump(vocabulary, open(output_file, "wb")) + logging.info("Total vocabulary size: {}".format(len(vocabulary))) + logging.info("Saved vocab to '{}'".format(output_file)) + + +if __name__ == '__main__': + fire.Fire(process) diff --git a/audio_to_text/captioning/utils/eval_round_robin.py b/audio_to_text/captioning/utils/eval_round_robin.py new file mode 100644 index 0000000000000000000000000000000000000000..28603a56fe3e6603cca7da5d70c0f71b1421c7c5 --- /dev/null +++ b/audio_to_text/captioning/utils/eval_round_robin.py @@ -0,0 +1,182 @@ +import copy +import json + +import numpy as np +import fire + + +def evaluate_annotation(key2refs, scorer): + if scorer.method() == "Bleu": + scores = np.array([ 0.0 for n in range(4) ]) + else: + scores = 0 + num_cap_per_audio = len(next(iter(key2refs.values()))) + + for i in range(num_cap_per_audio): + if i > 0: + for key in key2refs: + key2refs[key].insert(0, res[key][0]) + res = { key: [refs.pop(),] for key, refs in key2refs.items() } + score, _ = scorer.compute_score(key2refs, res) + + if scorer.method() == "Bleu": + scores += np.array(score) + else: + scores += score + + score = scores / num_cap_per_audio + return score + +def evaluate_prediction(key2pred, key2refs, scorer): + if scorer.method() == "Bleu": + scores = np.array([ 0.0 for n in range(4) ]) + else: + scores = 0 + num_cap_per_audio = len(next(iter(key2refs.values()))) + + for i in range(num_cap_per_audio): + key2refs_i = {} + for key, refs in key2refs.items(): + key2refs_i[key] = refs[:i] + refs[i+1:] + score, _ = scorer.compute_score(key2refs_i, key2pred) + + if scorer.method() == "Bleu": + scores += np.array(score) + else: + scores += score + + score = scores / num_cap_per_audio + return score + + +class Evaluator(object): + + def eval_annotation(self, annotation, output): + captions = json.load(open(annotation, "r"))["audios"] + + key2refs = {} + for audio_idx in range(len(captions)): + audio_id = captions[audio_idx]["audio_id"] + key2refs[audio_id] = [] + for caption in captions[audio_idx]["captions"]: + key2refs[audio_id].append(caption["caption"]) + + from fense.fense import Fense + scores = {} + scorer = Fense() + scores[scorer.method()] = evaluate_annotation(copy.deepcopy(key2refs), scorer) + + refs4eval = {} + for key, refs in key2refs.items(): + refs4eval[key] = [] + for idx, ref in enumerate(refs): + refs4eval[key].append({ + "audio_id": key, + "id": idx, + "caption": ref + }) + + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + + tokenizer = PTBTokenizer() + key2refs = tokenizer.tokenize(refs4eval) + + + from pycocoevalcap.bleu.bleu import Bleu + from pycocoevalcap.cider.cider import Cider + from pycocoevalcap.rouge.rouge import Rouge + from pycocoevalcap.meteor.meteor import Meteor + from pycocoevalcap.spice.spice import Spice + + + scorers = [Bleu(), Rouge(), 
Cider(), Meteor(), Spice()] + for scorer in scorers: + scores[scorer.method()] = evaluate_annotation(copy.deepcopy(key2refs), scorer) + + spider = 0 + with open(output, "w") as f: + for name, score in scores.items(): + if name == "Bleu": + for n in range(4): + f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n])) + else: + f.write("{}: {:6.3f}\n".format(name, score)) + if name in ["CIDEr", "SPICE"]: + spider += score + f.write("SPIDEr: {:6.3f}\n".format(spider / 2)) + + def eval_prediction(self, prediction, annotation, output): + ref_captions = json.load(open(annotation, "r"))["audios"] + + key2refs = {} + for audio_idx in range(len(ref_captions)): + audio_id = ref_captions[audio_idx]["audio_id"] + key2refs[audio_id] = [] + for caption in ref_captions[audio_idx]["captions"]: + key2refs[audio_id].append(caption["caption"]) + + pred_captions = json.load(open(prediction, "r"))["predictions"] + + key2pred = {} + for audio_idx in range(len(pred_captions)): + item = pred_captions[audio_idx] + audio_id = item["filename"] + key2pred[audio_id] = [item["tokens"]] + + from fense.fense import Fense + scores = {} + scorer = Fense() + scores[scorer.method()] = evaluate_prediction(key2pred, key2refs, scorer) + + refs4eval = {} + for key, refs in key2refs.items(): + refs4eval[key] = [] + for idx, ref in enumerate(refs): + refs4eval[key].append({ + "audio_id": key, + "id": idx, + "caption": ref + }) + + preds4eval = {} + for key, preds in key2pred.items(): + preds4eval[key] = [] + for idx, pred in enumerate(preds): + preds4eval[key].append({ + "audio_id": key, + "id": idx, + "caption": pred + }) + + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + + tokenizer = PTBTokenizer() + key2refs = tokenizer.tokenize(refs4eval) + key2pred = tokenizer.tokenize(preds4eval) + + + from pycocoevalcap.bleu.bleu import Bleu + from pycocoevalcap.cider.cider import Cider + from pycocoevalcap.rouge.rouge import Rouge + from pycocoevalcap.meteor.meteor import Meteor + from pycocoevalcap.spice.spice import Spice + + scorers = [Bleu(), Rouge(), Cider(), Meteor(), Spice()] + for scorer in scorers: + scores[scorer.method()] = evaluate_prediction(key2pred, key2refs, scorer) + + spider = 0 + with open(output, "w") as f: + for name, score in scores.items(): + if name == "Bleu": + for n in range(4): + f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n])) + else: + f.write("{}: {:6.3f}\n".format(name, score)) + if name in ["CIDEr", "SPICE"]: + spider += score + f.write("SPIDEr: {:6.3f}\n".format(spider / 2)) + + +if __name__ == "__main__": + fire.Fire(Evaluator) diff --git a/audio_to_text/captioning/utils/fasttext/create_word_embedding.py b/audio_to_text/captioning/utils/fasttext/create_word_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..09da13a62a3462e730c8275320a6391536ff42c4 --- /dev/null +++ b/audio_to_text/captioning/utils/fasttext/create_word_embedding.py @@ -0,0 +1,50 @@ +# coding=utf-8 +#!/usr/bin/env python3 + +import numpy as np +import pandas as pd +import torch +from gensim.models import FastText +from tqdm import tqdm +import fire + +import sys +import os +sys.path.append(os.getcwd()) +from utils.build_vocab import Vocabulary + +def create_embedding(caption_file: str, + vocab_file: str, + embed_size: int, + output: str, + **fasttext_kwargs): + caption_df = pd.read_json(caption_file) + caption_df["tokens"] = caption_df["tokens"].apply(lambda x: [""] + [token for token in x] + [""]) + + sentences = list(caption_df["tokens"].values) + vocabulary = torch.load(vocab_file, 
map_location="cpu") + + epochs = fasttext_kwargs.get("epochs", 10) + model = FastText(size=embed_size, min_count=1, **fasttext_kwargs) + model.build_vocab(sentences=sentences) + model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs) + + word_embeddings = np.zeros((len(vocabulary), embed_size)) + + with tqdm(total=len(vocabulary), ascii=True) as pbar: + for word, idx in vocabulary.word2idx.items(): + if word == "" or word == "": + continue + word_embeddings[idx] = model.wv[word] + pbar.update() + + np.save(output, word_embeddings) + + print("Finish writing fasttext embeddings to " + output) + + +if __name__ == "__main__": + fire.Fire(create_embedding) + + + diff --git a/audio_to_text/captioning/utils/lr_scheduler.py b/audio_to_text/captioning/utils/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..b46e3f0397634bcf48a6a61ab041a7ea07577eb3 --- /dev/null +++ b/audio_to_text/captioning/utils/lr_scheduler.py @@ -0,0 +1,128 @@ +import math +import torch + + +class ExponentialDecayScheduler(torch.optim.lr_scheduler._LRScheduler): + + def __init__(self, optimizer, total_iters, final_lrs, + warmup_iters=3000, last_epoch=-1, verbose=False): + self.total_iters = total_iters + self.final_lrs = final_lrs + if not isinstance(self.final_lrs, list) and not isinstance( + self.final_lrs, tuple): + self.final_lrs = [self.final_lrs] * len(optimizer.param_groups) + self.warmup_iters = warmup_iters + self.bases = [0.0,] * len(optimizer.param_groups) + super().__init__(optimizer, last_epoch, verbose) + for i, (base_lr, final_lr) in enumerate(zip(self.base_lrs, self.final_lrs)): + base = (final_lr / base_lr) ** (1 / ( + self.total_iters - self.warmup_iters)) + self.bases[i] = base + + def _get_closed_form_lr(self): + warmup_coeff = 1.0 + current_iter = self._step_count + if current_iter < self.warmup_iters: + warmup_coeff = current_iter / self.warmup_iters + current_lrs = [] + # if not self.linear_warmup: + # for base_lr, final_lr, base in zip(self.base_lrs, self.final_lrs, self.bases): + # # current_lr = warmup_coeff * base_lr * math.exp(((current_iter - self.warmup_iters) / self.total_iters) * math.log(final_lr / base_lr)) + # current_lr = warmup_coeff * base_lr * (base ** (current_iter - self.warmup_iters)) + # current_lrs.append(current_lr) + # else: + for base_lr, final_lr, base in zip(self.base_lrs, self.final_lrs, + self.bases): + if current_iter <= self.warmup_iters: + current_lr = warmup_coeff * base_lr + else: + # current_lr = warmup_coeff * base_lr * math.exp(((current_iter - self.warmup_iters) / self.total_iters) * math.log(final_lr / base_lr)) + current_lr = base_lr * (base ** (current_iter - self.warmup_iters)) + current_lrs.append(current_lr) + return current_lrs + + def get_lr(self): + return self._get_closed_form_lr() + + +class NoamScheduler(torch.optim.lr_scheduler._LRScheduler): + + def __init__(self, optimizer, model_size=512, factor=1, warmup_iters=3000, + last_epoch=-1, verbose=False): + self.model_size = model_size + self.warmup_iters = warmup_iters + # self.factors = [group["lr"] / (self.model_size ** (-0.5) * self.warmup_iters ** (-0.5)) for group in optimizer.param_groups] + self.factor = factor + super().__init__(optimizer, last_epoch, verbose) + + def _get_closed_form_lr(self): + current_iter = self._step_count + current_lrs = [] + for _ in self.base_lrs: + current_lr = self.factor * \ + (self.model_size ** (-0.5) * min(current_iter ** (-0.5), + current_iter * self.warmup_iters ** (-1.5))) + current_lrs.append(current_lr) + 
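+ # Noam schedule from "Attention Is All You Need":
+ # lr = factor * d_model^(-0.5) * min(step^(-0.5), step * warmup_iters^(-1.5)),
+ # i.e. linear warmup followed by inverse square-root decay.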
return current_lrs + + def get_lr(self): + return self._get_closed_form_lr() + + +class CosineWithWarmup(torch.optim.lr_scheduler._LRScheduler): + + def __init__(self, optimizer, total_iters, warmup_iters, + num_cycles=0.5, last_epoch=-1, verbose=False): + self.total_iters = total_iters + self.warmup_iters = warmup_iters + self.num_cycles = num_cycles + super().__init__(optimizer, last_epoch, verbose) + + def lr_lambda(self, iteration): + if iteration < self.warmup_iters: + return float(iteration) / float(max(1, self.warmup_iters)) + progress = float(iteration - self.warmup_iters) / float(max(1, + self.total_iters - self.warmup_iters)) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float( + self.num_cycles) * 2.0 * progress))) + + def _get_closed_form_lr(self): + current_iter = self._step_count + current_lrs = [] + for base_lr in self.base_lrs: + current_lr = base_lr * self.lr_lambda(current_iter) + current_lrs.append(current_lr) + return current_lrs + + def get_lr(self): + return self._get_closed_form_lr() + + +if __name__ == "__main__": + model = torch.nn.Linear(10, 5) + optimizer = torch.optim.Adam(model.parameters(), 5e-4) + epochs = 25 + iters = 600 + scheduler = CosineWithWarmup(optimizer, 600 * 25, 600 * 5,) + # scheduler = ExponentialDecayScheduler(optimizer, 600 * 25, 5e-7, 600 * 5) + criterion = torch.nn.MSELoss() + lrs = [] + for epoch in range(1, epochs + 1): + for iteration in range(1, iters + 1): + optimizer.zero_grad() + x = torch.randn(4, 10) + y = torch.randn(4, 5) + loss = criterion(model(x), y) + loss.backward() + optimizer.step() + scheduler.step() + # print(f"lr: {scheduler.get_last_lr()}") + # lrs.append(scheduler.get_last_lr()) + lrs.append(optimizer.param_groups[0]["lr"]) + import matplotlib.pyplot as plt + plt.plot(list(range(1, len(lrs) + 1)), lrs, '-o', markersize=1) + # plt.legend(loc="best") + plt.xlabel("Iteration") + plt.ylabel("LR") + + plt.savefig("lr_curve.png", dpi=100) diff --git a/audio_to_text/captioning/utils/model_eval_diff.py b/audio_to_text/captioning/utils/model_eval_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..2c29ef8fde2451d3f84e842d0d6a72754f0d4603 --- /dev/null +++ b/audio_to_text/captioning/utils/model_eval_diff.py @@ -0,0 +1,110 @@ +import os +import sys +import copy +import pickle + +import numpy as np +import pandas as pd +import fire + +sys.path.append(os.getcwd()) + + +def coco_score(refs, pred, scorer): + if scorer.method() == "Bleu": + scores = np.array([ 0.0 for n in range(4) ]) + else: + scores = 0 + num_cap_per_audio = len(refs[list(refs.keys())[0]]) + + for i in range(num_cap_per_audio): + if i > 0: + for key in refs: + refs[key].insert(0, res[key][0]) + res = {key: [refs[key].pop(),] for key in refs} + score, _ = scorer.compute_score(refs, pred) + + if scorer.method() == "Bleu": + scores += np.array(score) + else: + scores += score + + score = scores / num_cap_per_audio + + for key in refs: + refs[key].insert(0, res[key][0]) + score_allref, _ = scorer.compute_score(refs, pred) + diff = score_allref - score + return diff + +def embedding_score(refs, pred, scorer): + + num_cap_per_audio = len(refs[list(refs.keys())[0]]) + scores = 0 + + for i in range(num_cap_per_audio): + res = {key: [refs[key][i],] for key in refs.keys() if len(refs[key]) == num_cap_per_audio} + refs_i = {key: np.concatenate([refs[key][:i], refs[key][i+1:]]) for key in refs.keys() if len(refs[key]) == num_cap_per_audio} + score, _ = scorer.compute_score(refs_i, pred) + + scores += score + + score = scores / num_cap_per_audio + + 
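+ # compare the score computed with all references against the average
+ # leave-one-out score; the returned diff estimates how much the extra
+ # reference improves the embedding-based metric.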
score_allref, _ = scorer.compute_score(refs, pred) + diff = score_allref - score + return diff + +def main(output_file, eval_caption_file, eval_embedding_file, output, zh=False): + output_df = pd.read_json(output_file) + output_df["key"] = output_df["filename"].apply(lambda x: os.path.splitext(os.path.basename(x))[0]) + pred = output_df.groupby("key")["tokens"].apply(list).to_dict() + + label_df = pd.read_json(eval_caption_file) + if zh: + refs = label_df.groupby("key")["tokens"].apply(list).to_dict() + else: + refs = label_df.groupby("key")["caption"].apply(list).to_dict() + + from pycocoevalcap.bleu.bleu import Bleu + from pycocoevalcap.cider.cider import Cider + from pycocoevalcap.rouge.rouge import Rouge + + scorer = Bleu(zh=zh) + bleu_scores = coco_score(copy.deepcopy(refs), pred, scorer) + scorer = Cider(zh=zh) + cider_score = coco_score(copy.deepcopy(refs), pred, scorer) + scorer = Rouge(zh=zh) + rouge_score = coco_score(copy.deepcopy(refs), pred, scorer) + + if not zh: + from pycocoevalcap.meteor.meteor import Meteor + scorer = Meteor() + meteor_score = coco_score(copy.deepcopy(refs), pred, scorer) + + from pycocoevalcap.spice.spice import Spice + scorer = Spice() + spice_score = coco_score(copy.deepcopy(refs), pred, scorer) + + # from audiocaptioneval.sentbert.sentencebert import SentenceBert + # scorer = SentenceBert(zh=zh) + # with open(eval_embedding_file, "rb") as f: + # ref_embeddings = pickle.load(f) + + # sent_bert = embedding_score(ref_embeddings, pred, scorer) + + with open(output, "w") as f: + f.write("Diff:\n") + for n in range(4): + f.write("BLEU-{}: {:6.3f}\n".format(n+1, bleu_scores[n])) + f.write("CIDEr: {:6.3f}\n".format(cider_score)) + f.write("ROUGE: {:6.3f}\n".format(rouge_score)) + if not zh: + f.write("Meteor: {:6.3f}\n".format(meteor_score)) + f.write("SPICE: {:6.3f}\n".format(spice_score)) + # f.write("SentenceBert: {:6.3f}\n".format(sent_bert)) + + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/audio_to_text/captioning/utils/predict_nn.py b/audio_to_text/captioning/utils/predict_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..699c3dcffe7ce2c6dad33a5546c707dd76ccf82c --- /dev/null +++ b/audio_to_text/captioning/utils/predict_nn.py @@ -0,0 +1,49 @@ +import json +import random +import argparse +import numpy as np +from tqdm import tqdm +from h5py import File +import sklearn.metrics + +random.seed(1) + +parser = argparse.ArgumentParser() +parser.add_argument("train_feature", type=str) +parser.add_argument("train_corpus", type=str) +parser.add_argument("pred_feature", type=str) +parser.add_argument("output_json", type=str) + +args = parser.parse_args() +train_embs = [] +train_idx_to_audioid = [] +with File(args.train_feature, "r") as store: + for audio_id, embedding in tqdm(store.items(), ascii=True): + train_embs.append(embedding[()]) + train_idx_to_audioid.append(audio_id) + +train_annotation = json.load(open(args.train_corpus, "r"))["audios"] +train_audioid_to_tokens = {} +for item in train_annotation: + audio_id = item["audio_id"] + train_audioid_to_tokens[audio_id] = [cap_item["tokens"] for cap_item in item["captions"]] +train_embs = np.stack(train_embs) + + +pred_data = [] +pred_embs = [] +pred_idx_to_audioids = [] +with File(args.pred_feature, "r") as store: + for audio_id, embedding in tqdm(store.items(), ascii=True): + pred_embs.append(embedding[()]) + pred_idx_to_audioids.append(audio_id) +pred_embs = np.stack(pred_embs) + +similarity = sklearn.metrics.pairwise.cosine_similarity(pred_embs, train_embs) +for idx, 
audio_id in enumerate(pred_idx_to_audioids): + train_idx = similarity[idx].argmax() + pred_data.append({ + "filename": audio_id, + "tokens": random.choice(train_audioid_to_tokens[train_idx_to_audioid[train_idx]]) + }) +json.dump({"predictions": pred_data}, open(args.output_json, "w"), ensure_ascii=False, indent=4) diff --git a/audio_to_text/captioning/utils/remove_optimizer.py b/audio_to_text/captioning/utils/remove_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2b9871ee8022c0e0814abb46173fee1a6ae4ba9c --- /dev/null +++ b/audio_to_text/captioning/utils/remove_optimizer.py @@ -0,0 +1,18 @@ +import argparse +import torch + + +def main(checkpoint): + state_dict = torch.load(checkpoint, map_location="cpu") + if "optimizer" in state_dict: + del state_dict["optimizer"] + if "lr_scheduler" in state_dict: + del state_dict["lr_scheduler"] + torch.save(state_dict, checkpoint) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint", type=str) + args = parser.parse_args() + main(args.checkpoint) diff --git a/audio_to_text/captioning/utils/report_results.py b/audio_to_text/captioning/utils/report_results.py new file mode 100644 index 0000000000000000000000000000000000000000..3b9f6ec5e8d2f253706198e0d521f73981ef3efe --- /dev/null +++ b/audio_to_text/captioning/utils/report_results.py @@ -0,0 +1,37 @@ +from pathlib import Path +import argparse +import numpy as np + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="input filename", type=str, nargs="+") +parser.add_argument("--output", help="output result file", default=None) + +args = parser.parse_args() + + +scores = {} +for path in args.input: + with open(path, "r") as reader: + for line in reader.readlines(): + metric, score = line.strip().split(": ") + score = float(score) + if metric not in scores: + scores[metric] = [] + scores[metric].append(score) + +if len(scores) == 0: + print("No experiment directory found, wrong path?") + exit(1) + +with open(args.output, "w") as writer: + print("Average results: ", file=writer) + for metric, score in scores.items(): + score = np.array(score) + mean = np.mean(score) + std = np.std(score) + print(f"{metric}: {mean:.3f} (±{std:.3f})", file=writer) + print("", file=writer) + print("Best results: ", file=writer) + for metric, score in scores.items(): + score = np.max(score) + print(f"{metric}: {score:.3f}", file=writer) diff --git a/audio_to_text/captioning/utils/tokenize_caption.py b/audio_to_text/captioning/utils/tokenize_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..b340068577a1d4b02e187048e6a20cb95264561f --- /dev/null +++ b/audio_to_text/captioning/utils/tokenize_caption.py @@ -0,0 +1,86 @@ +import json +from tqdm import tqdm +import re +import fire + + +def tokenize_caption(input_json: str, + keep_punctuation: bool = False, + host_address: str = None, + character_level: bool = False, + zh: bool = True, + output_json: str = None): + """Build vocabulary from csv file with a given threshold to drop all counts < threshold + + Args: + input_json(string): Preprossessed json file. Structure like this: + { + 'audios': [ + { + 'audio_id': 'xxx', + 'captions': [ + { + 'caption': 'xxx', + 'cap_id': 'xxx' + } + ] + }, + ... + ] + } + threshold (int): Threshold to drop all words with counts < threshold + keep_punctuation (bool): Includes or excludes punctuation. 
+ + Returns: + vocab (Vocab): Object with the processed vocabulary +""" + data = json.load(open(input_json, "r"))["audios"] + + if zh: + from nltk.parse.corenlp import CoreNLPParser + from zhon.hanzi import punctuation + parser = CoreNLPParser(host_address) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + for cap_idx in range(len(data[audio_idx]["captions"])): + caption = data[audio_idx]["captions"][cap_idx]["caption"] + # Remove all punctuations + if not keep_punctuation: + caption = re.sub("[{}]".format(punctuation), "", caption) + if character_level: + tokens = list(caption) + else: + tokens = list(parser.tokenize(caption)) + data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens) + else: + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + captions = {} + for audio_idx in range(len(data)): + audio_id = data[audio_idx]["audio_id"] + captions[audio_id] = [] + for cap_idx in range(len(data[audio_idx]["captions"])): + caption = data[audio_idx]["captions"][cap_idx]["caption"] + captions[audio_id].append({ + "audio_id": audio_id, + "id": cap_idx, + "caption": caption + }) + tokenizer = PTBTokenizer() + captions = tokenizer.tokenize(captions) + for audio_idx in tqdm(range(len(data)), leave=False, ascii=True): + audio_id = data[audio_idx]["audio_id"] + for cap_idx in range(len(data[audio_idx]["captions"])): + tokens = captions[audio_id][cap_idx] + data[audio_idx]["captions"][cap_idx]["tokens"] = tokens + + if output_json: + json.dump( + { "audios": data }, open(output_json, "w"), + indent=4, ensure_ascii=not zh) + else: + json.dump( + { "audios": data }, open(input_json, "w"), + indent=4, ensure_ascii=not zh) + + +if __name__ == "__main__": + fire.Fire(tokenize_caption) diff --git a/audio_to_text/captioning/utils/train_util.py b/audio_to_text/captioning/utils/train_util.py new file mode 100644 index 0000000000000000000000000000000000000000..6cd62cc36043a2db75cc6761c51fdfdd18d11392 --- /dev/null +++ b/audio_to_text/captioning/utils/train_util.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +#!/usr/bin/env python3 +import os +import sys +import logging +from typing import Callable, Dict, Union +import yaml +import torch +from torch.optim.swa_utils import AveragedModel as torch_average_model +import numpy as np +import pandas as pd +from pprint import pformat + + +def load_dict_from_csv(csv, cols): + df = pd.read_csv(csv, sep="\t") + output = dict(zip(df[cols[0]], df[cols[1]])) + return output + + +def init_logger(filename, level="INFO"): + formatter = logging.Formatter( + "[ %(levelname)s : %(asctime)s ] - %(message)s") + logger = logging.getLogger(__name__ + "." 
+ filename) + logger.setLevel(getattr(logging, level)) + # Log results to std + # stdhandler = logging.StreamHandler(sys.stdout) + # stdhandler.setFormatter(formatter) + # Dump log to file + filehandler = logging.FileHandler(filename) + filehandler.setFormatter(formatter) + logger.addHandler(filehandler) + # logger.addHandler(stdhandler) + return logger + + +def init_obj(module, config, **kwargs):# 'captioning.models.encoder' + obj_args = config["args"].copy() + obj_args.update(kwargs) + return getattr(module, config["type"])(**obj_args) + + +def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'): + """pprint_dict + + :param outputfun: function to use, defaults to sys.stdout + :param in_dict: dict to print + """ + if formatter == 'yaml': + format_fun = yaml.dump + elif formatter == 'pretty': + format_fun = pformat + for line in format_fun(in_dict).split('\n'): + outputfun(line) + + +def merge_a_into_b(a, b): + # merge dict a into dict b. values in a will overwrite b. + for k, v in a.items(): + if isinstance(v, dict) and k in b: + assert isinstance( + b[k], dict + ), "Cannot inherit key '{}' from base!".format(k) + merge_a_into_b(v, b[k]) + else: + b[k] = v + + +def load_config(config_file): + with open(config_file, "r") as reader: + config = yaml.load(reader, Loader=yaml.FullLoader) + if "inherit_from" in config: + base_config_file = config["inherit_from"] + base_config_file = os.path.join( + os.path.dirname(config_file), base_config_file + ) + assert not os.path.samefile(config_file, base_config_file), \ + "inherit from itself" + base_config = load_config(base_config_file) + del config["inherit_from"] + merge_a_into_b(config, base_config) + return base_config + return config + + +def parse_config_or_kwargs(config_file, **kwargs): + yaml_config = load_config(config_file) + # passed kwargs will override yaml config + args = dict(yaml_config, **kwargs) + return args + + +def store_yaml(config, config_file): + with open(config_file, "w") as con_writer: + yaml.dump(config, con_writer, indent=4, default_flow_style=False) + + +class MetricImprover: + + def __init__(self, mode): + assert mode in ("min", "max") + self.mode = mode + # min: lower -> better; max: higher -> better + self.best_value = np.inf if mode == "min" else -np.inf + + def compare(self, x, best_x): + return x < best_x if self.mode == "min" else x > best_x + + def __call__(self, x): + if self.compare(x, self.best_value): + self.best_value = x + return True + return False + + def state_dict(self): + return self.__dict__ + + def load_state_dict(self, state_dict): + self.__dict__.update(state_dict) + + +def fix_batchnorm(model: torch.nn.Module): + def inner(module): + class_name = module.__class__.__name__ + if class_name.find("BatchNorm") != -1: + module.eval() + model.apply(inner) + + +def load_pretrained_model(model: torch.nn.Module, + pretrained: Union[str, Dict], + output_fn: Callable = sys.stdout.write): + if not isinstance(pretrained, dict) and not os.path.exists(pretrained): + output_fn(f"pretrained {pretrained} not exist!") + return + + if hasattr(model, "load_pretrained"): + model.load_pretrained(pretrained) + return + + if isinstance(pretrained, dict): + state_dict = pretrained + else: + state_dict = torch.load(pretrained, map_location="cpu") + + if "model" in state_dict: + state_dict = state_dict["model"] + model_dict = model.state_dict() + pretrained_dict = { + k: v for k, v in state_dict.items() if (k in model_dict) and ( + model_dict[k].shape == v.shape) + } + output_fn(f"Loading pretrained keys 
{pretrained_dict.keys()}") + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict, strict=True) + + +class AveragedModel(torch_average_model): + + def update_parameters(self, model): + for p_swa, p_model in zip(self.parameters(), model.parameters()): + device = p_swa.device + p_model_ = p_model.detach().to(device) + if self.n_averaged == 0: + p_swa.detach().copy_(p_model_) + else: + p_swa.detach().copy_(self.avg_fn(p_swa.detach(), p_model_, + self.n_averaged.to(device))) + + for b_swa, b_model in zip(list(self.buffers())[1:], model.buffers()): + device = b_swa.device + b_model_ = b_model.detach().to(device) + if self.n_averaged == 0: + b_swa.detach().copy_(b_model_) + else: + b_swa.detach().copy_(self.avg_fn(b_swa.detach(), b_model_, + self.n_averaged.to(device))) + self.n_averaged += 1 diff --git a/audio_to_text/captioning/utils/word2vec/create_word_embedding.py b/audio_to_text/captioning/utils/word2vec/create_word_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..77ebe5adc6ec14bd639e78125f00c1eaea0b4dcc --- /dev/null +++ b/audio_to_text/captioning/utils/word2vec/create_word_embedding.py @@ -0,0 +1,67 @@ +# coding=utf-8 +#!/usr/bin/env python3 + +import numpy as np +import pandas as pd +import torch +import gensim +from gensim.models import Word2Vec +from tqdm import tqdm +import fire + +import sys +import os +sys.path.append(os.getcwd()) +from utils.build_vocab import Vocabulary + +def create_embedding(vocab_file: str, + embed_size: int, + output: str, + caption_file: str = None, + pretrained_weights_path: str = None, + **word2vec_kwargs): + vocabulary = torch.load(vocab_file, map_location="cpu") + + if pretrained_weights_path: + model = gensim.models.KeyedVectors.load_word2vec_format( + fname=pretrained_weights_path, + binary=True, + ) + if model.vector_size != embed_size: + assert embed_size < model.vector_size, f"only reduce dimension, cannot add dimesion {model.vector_size} to {embed_size}" + from sklearn.decomposition import PCA + pca = PCA(n_components=embed_size) + model.vectors = pca.fit_transform(model.vectors) + else: + caption_df = pd.read_json(caption_file) + caption_df["tokens"] = caption_df["tokens"].apply(lambda x: [""] + [token for token in x] + [""]) + sentences = list(caption_df["tokens"].values) + epochs = word2vec_kwargs.get("epochs", 10) + if "epochs" in word2vec_kwargs: + del word2vec_kwargs["epochs"] + model = Word2Vec(size=embed_size, min_count=1, **word2vec_kwargs) + model.build_vocab(sentences=sentences) + model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs) + + word_embeddings = np.random.randn(len(vocabulary), embed_size) + + if isinstance(model, gensim.models.word2vec.Word2Vec): + model = model.wv + with tqdm(total=len(vocabulary), ascii=True) as pbar: + for word, idx in vocabulary.word2idx.items(): + try: + word_embeddings[idx] = model.get_vector(word) + except KeyError: + print(f"word {word} not found in word2vec model, it is random initialized!") + pbar.update() + + np.save(output, word_embeddings) + + print("Finish writing word2vec embeddings to " + output) + + +if __name__ == "__main__": + fire.Fire(create_embedding) + + + diff --git a/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..034751fcff1e1d3b686ae0ad1cd6346f92dacc13 --- /dev/null +++ b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml @@ -0,0 +1,22 @@ +model: + encoder: + type: 
Cnn14RnnEncoder + args: + sample_rate: 32000 + pretrained: ./audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth + freeze_cnn: True + freeze_cnn_bn: True + bidirectional: True + dropout: 0.5 + hidden_size: 256 + num_layers: 3 + decoder: + type: TransformerDecoder + args: + attn_emb_dim: 512 + dropout: 0.2 + emb_dim: 256 + fc_emb_dim: 512 + nlayers: 2 + type: TransformerModel + args: {} diff --git a/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f9a16de2efa334d403326acec7de5de4c3393d6 --- /dev/null +++ b/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d341dccafcdcfb7009c402afb07f314ab1d613a5f5c42d32407d6c2a821abf +size 41755865 diff --git a/audio_to_text/inference_waveform.py b/audio_to_text/inference_waveform.py new file mode 100644 index 0000000000000000000000000000000000000000..aba39614c8104f62cdb8a3c7e0e3cf5dced0d95a --- /dev/null +++ b/audio_to_text/inference_waveform.py @@ -0,0 +1,102 @@ +import sys +import os +import librosa +import numpy as np +import torch +import audio_to_text.captioning.models +import audio_to_text.captioning.models.encoder +import audio_to_text.captioning.models.decoder +import audio_to_text.captioning.utils.train_util as train_util + + +def load_model(config, checkpoint): + ckpt = torch.load(checkpoint, "cpu") + encoder_cfg = config["model"]["encoder"] + encoder = train_util.init_obj( + audio_to_text.captioning.models.encoder, + encoder_cfg + ) + if "pretrained" in encoder_cfg: + pretrained = encoder_cfg["pretrained"] + train_util.load_pretrained_model(encoder, + pretrained, + sys.stdout.write) + decoder_cfg = config["model"]["decoder"] + if "vocab_size" not in decoder_cfg["args"]: + decoder_cfg["args"]["vocab_size"] = len(ckpt["vocabulary"]) + decoder = train_util.init_obj( + audio_to_text.captioning.models.decoder, + decoder_cfg + ) + if "word_embedding" in decoder_cfg: + decoder.load_word_embedding(**decoder_cfg["word_embedding"]) + if "pretrained" in decoder_cfg: + pretrained = decoder_cfg["pretrained"] + train_util.load_pretrained_model(decoder, + pretrained, + sys.stdout.write) + model = train_util.init_obj(audio_to_text.captioning.models, config["model"], + encoder=encoder, decoder=decoder) + train_util.load_pretrained_model(model, ckpt) + model.eval() + return { + "model": model, + "vocabulary": ckpt["vocabulary"] + } + + +def decode_caption(word_ids, vocabulary): + candidate = [] + for word_id in word_ids: + word = vocabulary[word_id] + if word == "": + break + elif word == "": + continue + candidate.append(word) + candidate = " ".join(candidate) + return candidate + + +class AudioCapModel(object): + def __init__(self,weight_dir,device='cuda'): + config = os.path.join(weight_dir,'config.yaml') + self.config = train_util.parse_config_or_kwargs(config) + checkpoint = os.path.join(weight_dir,'swa.pth') + resumed = load_model(self.config, checkpoint) + model = resumed["model"] + self.vocabulary = resumed["vocabulary"] + self.model = model.to(device) + self.device = device + + def caption(self,audio_list): + if isinstance(audio_list,np.ndarray): + audio_list = [audio_list] + elif isinstance(audio_list,str): + audio_list = [librosa.load(audio_list,sr=32000)[0]] + + captions = [] + for wav in audio_list: + inputwav = torch.as_tensor(wav).float().unsqueeze(0).to(self.device) + wav_len = torch.LongTensor([len(wav)]) + input_dict = { + 
"mode": "inference", + "wav": inputwav, + "wav_len": wav_len, + "specaug": False, + "sample_method": "beam", + } + print(input_dict) + out_dict = self.model(input_dict) + caption_batch = [decode_caption(seq, self.vocabulary) for seq in \ + out_dict["seq"].cpu().numpy()] + captions.extend(caption_batch) + return captions + + + + def __call__(self, audio_list): + return self.caption(audio_list) + + + diff --git a/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth b/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f570dc2d96679fbdecaba7d8f266368fc7fb0c9 --- /dev/null +++ b/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c4faa86f30e77df235b5dc1fb6578a18ff2b8a1b0043f47e30acb9ccb53a336 +size 494977221 diff --git a/checkpoints/0102_xiaoma_pe/config.yaml b/checkpoints/0102_xiaoma_pe/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69a88444205377d48573d53bb4fb500860976588 --- /dev/null +++ b/checkpoints/0102_xiaoma_pe/config.yaml @@ -0,0 +1,172 @@ +accumulate_grad_batches: 1 +audio_num_mel_bins: 80 +audio_sample_rate: 24000 +base_config: +- configs/tts/lj/fs2.yaml +binarization_args: + shuffle: false + with_align: true + with_f0: true + with_f0cwt: true + with_spk_embed: true + with_txt: true + with_wav: false +binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer +binary_data_dir: data/binary/xiaoma1022_24k_128hop +check_val_every_n_epoch: 10 +clip_grad_norm: 1 +cwt_add_f0_loss: false +cwt_hidden_size: 128 +cwt_layers: 2 +cwt_loss: l1 +cwt_std_scale: 0.8 +debug: false +dec_ffn_kernel_size: 9 +dec_layers: 4 +decoder_type: fft +dict_dir: '' +dropout: 0.1 +ds_workers: 4 +dur_enc_hidden_stride_kernel: +- 0,2,3 +- 0,2,3 +- 0,1,3 +dur_loss: mse +dur_predictor_kernel: 3 +dur_predictor_layers: 2 +enc_ffn_kernel_size: 9 +enc_layers: 4 +encoder_K: 8 +encoder_type: fft +endless_ds: true +ffn_act: gelu +ffn_padding: SAME +fft_size: 512 +fmax: 12000 +fmin: 30 +gen_dir_name: '' +hidden_size: 256 +hop_size: 128 +infer: false +lambda_commit: 0.25 +lambda_energy: 0.1 +lambda_f0: 1.0 +lambda_ph_dur: 1.0 +lambda_sent_dur: 1.0 +lambda_uv: 1.0 +lambda_word_dur: 1.0 +load_ckpt: '' +log_interval: 100 +loud_norm: false +lr: 2.0 +max_epochs: 1000 +max_eval_sentences: 1 +max_eval_tokens: 60000 +max_frames: 5000 +max_input_tokens: 1550 +max_sentences: 100000 +max_tokens: 20000 +max_updates: 60000 +mel_loss: l1 +mel_vmax: 1.5 +mel_vmin: -6 +min_level_db: -120 +norm_type: gn +num_ckpt_keep: 3 +num_heads: 2 +num_sanity_val_steps: 5 +num_spk: 1 +num_test_samples: 20 +num_valid_plots: 10 +optimizer_adam_beta1: 0.9 +optimizer_adam_beta2: 0.98 +out_wav_norm: false +pitch_ar: false +pitch_enc_hidden_stride_kernel: +- 0,2,5 +- 0,2,5 +- 0,2,5 +pitch_extractor_conv_layers: 2 +pitch_loss: l1 +pitch_norm: log +pitch_type: frame +pre_align_args: + allow_no_txt: false + denoise: false + forced_align: mfa + txt_processor: en + use_sox: false + use_tone: true +pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign +predictor_dropout: 0.5 +predictor_grad: 0.1 +predictor_hidden: -1 +predictor_kernel: 5 +predictor_layers: 2 +prenet_dropout: 0.5 +prenet_hidden_size: 256 +pretrain_fs_ckpt: '' +processed_data_dir: data/processed/ljspeech +profile_infer: false +raw_data_dir: data/raw/LJSpeech-1.1 +ref_norm_layer: bn +reset_phone_dict: true +save_best: false +save_ckpt: true 
+save_codes: +- configs +- modules +- tasks +- utils +- usr +save_f0: false +save_gt: false +seed: 1234 +sort_by_len: true +stop_token_weight: 5.0 +task_cls: tasks.tts.pe.PitchExtractionTask +test_ids: +- 68 +- 70 +- 74 +- 87 +- 110 +- 172 +- 190 +- 215 +- 231 +- 294 +- 316 +- 324 +- 402 +- 422 +- 485 +- 500 +- 505 +- 508 +- 509 +- 519 +test_input_dir: '' +test_num: 523 +test_set_name: test +train_set_name: train +use_denoise: false +use_energy_embed: false +use_gt_dur: false +use_gt_f0: false +use_pitch_embed: true +use_pos_embed: true +use_spk_embed: false +use_spk_id: false +use_split_spk_id: false +use_uv: true +use_var_enc: false +val_check_interval: 2000 +valid_num: 348 +valid_set_name: valid +vocoder: pwg +vocoder_ckpt: '' +warmup_updates: 2000 +weight_decay: 0 +win_size: 512 +work_dir: checkpoints/0102_xiaoma_pe diff --git a/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt b/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..468cc81b1a95e2f3dd490a8770bd705e14855f77 --- /dev/null +++ b/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb +size 13047222 diff --git a/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml b/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95fc5414ba1aff1bad8284ebfba52f5636b4d76d --- /dev/null +++ b/checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml @@ -0,0 +1,241 @@ +accumulate_grad_batches: 1 +adam_b1: 0.8 +adam_b2: 0.99 +amp: false +audio_num_mel_bins: 80 +audio_sample_rate: 24000 +aux_context_window: 0 +#base_config: +#- egs/egs_bases/singing/pwg.yaml +#- egs/egs_bases/tts/vocoder/hifigan.yaml +binarization_args: + reset_phone_dict: true + reset_word_dict: true + shuffle: false + trim_eos_bos: false + trim_sil: false + with_align: false + with_f0: true + with_f0cwt: false + with_linear: false + with_spk_embed: false + with_spk_id: true + with_txt: false + with_wav: true + with_word: false +binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer +binary_data_dir: data/binary/big_popcs_24k_hop128 +check_val_every_n_epoch: 10 +clip_grad_norm: 1 +clip_grad_value: 0 +datasets: [] +debug: false +dec_ffn_kernel_size: 9 +dec_layers: 4 +dict_dir: '' +disc_start_steps: 40000 +discriminator_grad_norm: 1 +discriminator_optimizer_params: + eps: 1.0e-06 + lr: 0.0002 + weight_decay: 0.0 +discriminator_params: + bias: true + conv_channels: 64 + in_channels: 1 + kernel_size: 3 + layers: 10 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.2 + out_channels: 1 + use_weight_norm: true +discriminator_scheduler_params: + gamma: 0.999 + step_size: 600 +dropout: 0.1 +ds_workers: 1 +enc_ffn_kernel_size: 9 +enc_layers: 4 +endless_ds: true +ffn_act: gelu +ffn_padding: SAME +fft_size: 512 +fmax: 12000 +fmin: 30 +frames_multiple: 1 +gen_dir_name: '' +generator_grad_norm: 10 +generator_optimizer_params: + eps: 1.0e-06 + lr: 0.0002 + weight_decay: 0.0 +generator_params: + aux_channels: 80 + dropout: 0.0 + gate_channels: 128 + in_channels: 1 + kernel_size: 3 + layers: 30 + out_channels: 1 + residual_channels: 64 + skip_channels: 64 + stacks: 3 + upsample_net: ConvInUpsampleNetwork + upsample_params: + upsample_scales: + - 2 + - 4 + - 4 + - 4 + use_nsf: false + use_pitch_embed: true + use_weight_norm: true +generator_scheduler_params: + gamma: 0.999 + 
step_size: 600 +griffin_lim_iters: 60 +hidden_size: 256 +hop_size: 128 +infer: false +lambda_adv: 1.0 +lambda_cdisc: 4.0 +lambda_energy: 0.0 +lambda_f0: 0.0 +lambda_mel: 5.0 +lambda_mel_adv: 1.0 +lambda_ph_dur: 0.0 +lambda_sent_dur: 0.0 +lambda_uv: 0.0 +lambda_word_dur: 0.0 +load_ckpt: '' +loud_norm: false +lr: 2.0 +max_epochs: 1000 +max_frames: 2400 +max_input_tokens: 1550 +max_samples: 8192 +max_sentences: 20 +max_tokens: 24000 +max_updates: 3000000 +max_valid_sentences: 1 +max_valid_tokens: 60000 +mel_loss: ssim:0.5|l1:0.5 +mel_vmax: 1.5 +mel_vmin: -6 +min_frames: 0 +min_level_db: -120 +num_ckpt_keep: 3 +num_heads: 2 +num_mels: 80 +num_sanity_val_steps: 5 +num_spk: 100 +num_test_samples: 0 +num_valid_plots: 10 +optimizer_adam_beta1: 0.9 +optimizer_adam_beta2: 0.98 +out_wav_norm: false +pitch_extractor: parselmouth +pitch_type: frame +pre_align_args: + allow_no_txt: false + denoise: false + sox_resample: true + sox_to_wav: false + trim_sil: false + txt_processor: zh + use_tone: false +pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign +predictor_grad: 0.0 +print_nan_grads: false +processed_data_dir: '' +profile_infer: false +raw_data_dir: '' +ref_level_db: 20 +rename_tmux: true +rerun_gen: true +resblock: '1' +resblock_dilation_sizes: +- - 1 + - 3 + - 5 +- - 1 + - 3 + - 5 +- - 1 + - 3 + - 5 +resblock_kernel_sizes: +- 3 +- 7 +- 11 +resume_from_checkpoint: 0 +save_best: true +save_codes: [] +save_f0: true +save_gt: true +scheduler: rsqrt +seed: 1234 +sort_by_len: true +stft_loss_params: + fft_sizes: + - 1024 + - 2048 + - 512 + hop_sizes: + - 120 + - 240 + - 50 + win_lengths: + - 600 + - 1200 + - 240 + window: hann_window +task_cls: tasks.vocoder.hifigan.HifiGanTask +tb_log_interval: 100 +test_ids: [] +test_input_dir: '' +test_num: 50 +test_prefixes: [] +test_set_name: test +train_set_name: train +train_sets: '' +upsample_initial_channel: 512 +upsample_kernel_sizes: +- 16 +- 16 +- 4 +- 4 +upsample_rates: +- 8 +- 4 +- 2 +- 2 +use_cdisc: false +use_cond_disc: false +use_fm_loss: false +use_gt_dur: true +use_gt_f0: true +use_mel_loss: true +use_ms_stft: false +use_pitch_embed: true +use_ref_enc: true +use_spec_disc: false +use_spk_embed: false +use_spk_id: false +use_split_spk_id: false +val_check_interval: 2000 +valid_infer_interval: 10000 +valid_monitor_key: val_loss +valid_monitor_mode: min +valid_set_name: valid +vocoder: pwg +vocoder_ckpt: '' +vocoder_denoise_c: 0.0 +warmup_updates: 8000 +weight_decay: 0 +win_length: null +win_size: 512 +window: hann +word_size: 3000 +work_dir: checkpoints/0109_hifigan_bigpopcs_hop128 diff --git a/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt b/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..ed55eaa98f86e3e22f4eb4e8115f254745cea155 --- /dev/null +++ b/checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1 +size 55827436 diff --git a/checkpoints/0228_opencpop_ds100_rel/config.yaml b/checkpoints/0228_opencpop_ds100_rel/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..977627b65c12e00e5dd2cc42e423f9ee4899956a --- /dev/null +++ b/checkpoints/0228_opencpop_ds100_rel/config.yaml @@ -0,0 +1,342 @@ +K_step: 100 +accumulate_grad_batches: 1 +audio_num_mel_bins: 80 +audio_sample_rate: 24000 +base_config: +- usr/configs/popcs_ds_beta6.yaml +- 
usr/configs/midi/cascade/opencs/opencpop_statis.yaml +binarization_args: + shuffle: false + with_align: true + with_f0: true + with_f0cwt: true + with_spk_embed: false + with_txt: true + with_wav: true +binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer +binary_data_dir: data/binary/opencpop-midi-dp +check_val_every_n_epoch: 10 +clip_grad_norm: 1 +content_cond_steps: [] +cwt_add_f0_loss: false +cwt_hidden_size: 128 +cwt_layers: 2 +cwt_loss: l1 +cwt_std_scale: 0.8 +datasets: +- popcs +debug: false +dec_ffn_kernel_size: 9 +dec_layers: 4 +decay_steps: 50000 +decoder_type: fft +dict_dir: '' +diff_decoder_type: wavenet +diff_loss_type: l1 +dilation_cycle_length: 4 +dropout: 0.1 +ds_workers: 4 +dur_enc_hidden_stride_kernel: +- 0,2,3 +- 0,2,3 +- 0,1,3 +dur_loss: mse +dur_predictor_kernel: 3 +dur_predictor_layers: 5 +enc_ffn_kernel_size: 9 +enc_layers: 4 +encoder_K: 8 +encoder_type: fft +endless_ds: true +ffn_act: gelu +ffn_padding: SAME +fft_size: 512 +fmax: 12000 +fmin: 30 +fs2_ckpt: '' +gaussian_start: true +gen_dir_name: '' +gen_tgt_spk_id: -1 +hidden_size: 256 +hop_size: 128 +infer: false +keep_bins: 80 +lambda_commit: 0.25 +lambda_energy: 0.0 +lambda_f0: 0.0 +lambda_ph_dur: 1.0 +lambda_sent_dur: 1.0 +lambda_uv: 0.0 +lambda_word_dur: 1.0 +load_ckpt: '' +log_interval: 100 +loud_norm: false +lr: 0.001 +max_beta: 0.06 +max_epochs: 1000 +max_eval_sentences: 1 +max_eval_tokens: 60000 +max_frames: 8000 +max_input_tokens: 1550 +max_sentences: 48 +max_tokens: 40000 +max_updates: 160000 +mel_loss: ssim:0.5|l1:0.5 +mel_vmax: 1.5 +mel_vmin: -6.0 +min_level_db: -120 +norm_type: gn +num_ckpt_keep: 3 +num_heads: 2 +num_sanity_val_steps: 1 +num_spk: 1 +num_test_samples: 0 +num_valid_plots: 10 +optimizer_adam_beta1: 0.9 +optimizer_adam_beta2: 0.98 +out_wav_norm: false +pe_ckpt: checkpoints/0102_xiaoma_pe +pe_enable: true +pitch_ar: false +pitch_enc_hidden_stride_kernel: +- 0,2,5 +- 0,2,5 +- 0,2,5 +pitch_extractor: parselmouth +pitch_loss: l1 +pitch_norm: log +pitch_type: frame +pre_align_args: + allow_no_txt: false + denoise: false + forced_align: mfa + txt_processor: zh_g2pM + use_sox: true + use_tone: false +pre_align_cls: data_gen.singing.pre_align.SingingPreAlign +predictor_dropout: 0.5 +predictor_grad: 0.1 +predictor_hidden: -1 +predictor_kernel: 5 +predictor_layers: 5 +prenet_dropout: 0.5 +prenet_hidden_size: 256 +pretrain_fs_ckpt: '' +processed_data_dir: data/processed/popcs +profile_infer: false +raw_data_dir: data/raw/popcs +ref_norm_layer: bn +rel_pos: true +reset_phone_dict: true +residual_channels: 256 +residual_layers: 20 +save_best: false +save_ckpt: true +save_codes: +- configs +- modules +- tasks +- utils +- usr +save_f0: true +save_gt: false +schedule_type: linear +seed: 1234 +sort_by_len: true +spec_max: +- -0.79453 +- -0.81116 +- -0.61631 +- -0.30679 +- -0.13863 +- -0.050652 +- -0.11563 +- -0.10679 +- -0.091068 +- -0.062174 +- -0.075302 +- -0.072217 +- -0.063815 +- -0.073299 +- 0.007361 +- -0.072508 +- -0.050234 +- -0.16534 +- -0.26928 +- -0.20782 +- -0.20823 +- -0.11702 +- -0.070128 +- -0.065868 +- -0.012675 +- 0.0015121 +- -0.089902 +- -0.21392 +- -0.23789 +- -0.28922 +- -0.30405 +- -0.23029 +- -0.22088 +- -0.21542 +- -0.29367 +- -0.30137 +- -0.38281 +- -0.4359 +- -0.28681 +- -0.46855 +- -0.57485 +- -0.47022 +- -0.54266 +- -0.44848 +- -0.6412 +- -0.687 +- -0.6486 +- -0.76436 +- -0.49971 +- -0.71068 +- -0.69724 +- -0.61487 +- -0.55843 +- -0.69773 +- -0.57502 +- -0.70919 +- -0.82431 +- -0.84213 +- -0.90431 +- -0.8284 +- -0.77945 +- -0.82758 +- -0.87699 +- -1.0532 +- -1.0766 +- 
-1.1198 +- -1.0185 +- -0.98983 +- -1.0001 +- -1.0756 +- -1.0024 +- -1.0304 +- -1.0579 +- -1.0188 +- -1.05 +- -1.0842 +- -1.0923 +- -1.1223 +- -1.2381 +- -1.6467 +spec_min: +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +spk_cond_steps: [] +stop_token_weight: 5.0 +task_cls: usr.diffsinger_task.DiffSingerMIDITask +test_ids: [] +test_input_dir: '' +test_num: 0 +test_prefixes: +- "popcs-\u8BF4\u6563\u5C31\u6563" +- "popcs-\u9690\u5F62\u7684\u7FC5\u8180" +test_set_name: test +timesteps: 100 +train_set_name: train +use_denoise: false +use_energy_embed: false +use_gt_dur: false +use_gt_f0: false +use_midi: true +use_nsf: true +use_pitch_embed: false +use_pos_embed: true +use_spk_embed: false +use_spk_id: false +use_split_spk_id: false +use_uv: true +use_var_enc: false +val_check_interval: 2000 +valid_num: 0 +valid_set_name: valid +vocoder: vocoders.hifigan.HifiGAN +vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 +warmup_updates: 2000 +wav2spec_eps: 1e-6 +weight_decay: 0 +win_size: 512 +work_dir: checkpoints/0228_opencpop_ds100_rel diff --git a/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt b/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..07b944d43e3bd61ebd8272c09db0011425b4af08 --- /dev/null +++ b/checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c +size 170226367 diff --git a/checkpoints/0831_opencpop_ds1000/config.yaml b/checkpoints/0831_opencpop_ds1000/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc2be3b17c1cab8a96f033a6370e6dbfbca1b66d --- /dev/null +++ b/checkpoints/0831_opencpop_ds1000/config.yaml @@ -0,0 +1,346 @@ +K_step: 1000 +accumulate_grad_batches: 1 +audio_num_mel_bins: 80 +audio_sample_rate: 24000 +base_config: +- usr/configs/popcs_ds_beta6.yaml +- usr/configs/midi/cascade/opencs/opencpop_statis.yaml +binarization_args: + shuffle: false + with_align: true + with_f0: true + with_f0cwt: true + with_spk_embed: false + with_txt: true + with_wav: true +binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer +binary_data_dir: data/binary/opencpop-midi-dp +check_val_every_n_epoch: 10 +clip_grad_norm: 1 +content_cond_steps: [] +cwt_add_f0_loss: false +cwt_hidden_size: 128 +cwt_layers: 2 +cwt_loss: l1 +cwt_std_scale: 0.8 +datasets: +- opencpop +debug: false +dec_ffn_kernel_size: 9 +dec_layers: 4 +decay_steps: 50000 +decoder_type: fft +dict_dir: '' +diff_decoder_type: wavenet +diff_loss_type: l1 +dilation_cycle_length: 4 +dropout: 0.1 +ds_workers: 4 +dur_enc_hidden_stride_kernel: +- 0,2,3 +- 0,2,3 +- 0,1,3 +dur_loss: mse +dur_predictor_kernel: 3 +dur_predictor_layers: 5 +enc_ffn_kernel_size: 9 +enc_layers: 4 +encoder_K: 8 +encoder_type: fft +endless_ds: true +ffn_act: gelu +ffn_padding: SAME +fft_size: 512 +fmax: 
12000 +fmin: 30 +fs2_ckpt: '' +gaussian_start: true +gen_dir_name: '' +gen_tgt_spk_id: -1 +hidden_size: 256 +hop_size: 128 +infer: false +keep_bins: 80 +lambda_commit: 0.25 +lambda_energy: 0.0 +lambda_f0: 0.0 +lambda_ph_dur: 1.0 +lambda_sent_dur: 1.0 +lambda_uv: 0.0 +lambda_word_dur: 1.0 +load_ckpt: '' +log_interval: 100 +loud_norm: false +lr: 0.001 +max_beta: 0.02 +max_epochs: 1000 +max_eval_sentences: 1 +max_eval_tokens: 60000 +max_frames: 8000 +max_input_tokens: 1550 +max_sentences: 48 +max_tokens: 36000 +max_updates: 320000 +mel_loss: ssim:0.5|l1:0.5 +mel_vmax: 1.5 +mel_vmin: -6.0 +min_level_db: -120 +norm_type: gn +num_ckpt_keep: 3 +num_heads: 2 +num_sanity_val_steps: 1 +num_spk: 1 +num_test_samples: 0 +num_valid_plots: 10 +optimizer_adam_beta1: 0.9 +optimizer_adam_beta2: 0.98 +out_wav_norm: false +pe_ckpt: checkpoints/0102_xiaoma_pe +pe_enable: true +pitch_ar: false +pitch_enc_hidden_stride_kernel: +- 0,2,5 +- 0,2,5 +- 0,2,5 +pitch_extractor: parselmouth +pitch_loss: l1 +pitch_norm: log +pitch_type: frame +pre_align_args: + allow_no_txt: false + denoise: false + forced_align: mfa + txt_processor: zh_g2pM + use_sox: true + use_tone: false +pre_align_cls: data_gen.singing.pre_align.SingingPreAlign +predictor_dropout: 0.5 +predictor_grad: 0.1 +predictor_hidden: -1 +predictor_kernel: 5 +predictor_layers: 5 +prenet_dropout: 0.5 +prenet_hidden_size: 256 +pretrain_fs_ckpt: '' +processed_data_dir: xxx +profile_infer: false +raw_data_dir: data/raw/opencpop/segments +ref_norm_layer: bn +rel_pos: true +reset_phone_dict: true +residual_channels: 256 +residual_layers: 20 +save_best: false +save_ckpt: true +save_codes: +- configs +- modules +- tasks +- utils +- usr +save_f0: true +save_gt: false +schedule_type: linear +seed: 1234 +sort_by_len: true +spec_max: +- -0.79453 +- -0.81116 +- -0.61631 +- -0.30679 +- -0.13863 +- -0.050652 +- -0.11563 +- -0.10679 +- -0.091068 +- -0.062174 +- -0.075302 +- -0.072217 +- -0.063815 +- -0.073299 +- 0.007361 +- -0.072508 +- -0.050234 +- -0.16534 +- -0.26928 +- -0.20782 +- -0.20823 +- -0.11702 +- -0.070128 +- -0.065868 +- -0.012675 +- 0.0015121 +- -0.089902 +- -0.21392 +- -0.23789 +- -0.28922 +- -0.30405 +- -0.23029 +- -0.22088 +- -0.21542 +- -0.29367 +- -0.30137 +- -0.38281 +- -0.4359 +- -0.28681 +- -0.46855 +- -0.57485 +- -0.47022 +- -0.54266 +- -0.44848 +- -0.6412 +- -0.687 +- -0.6486 +- -0.76436 +- -0.49971 +- -0.71068 +- -0.69724 +- -0.61487 +- -0.55843 +- -0.69773 +- -0.57502 +- -0.70919 +- -0.82431 +- -0.84213 +- -0.90431 +- -0.8284 +- -0.77945 +- -0.82758 +- -0.87699 +- -1.0532 +- -1.0766 +- -1.1198 +- -1.0185 +- -0.98983 +- -1.0001 +- -1.0756 +- -1.0024 +- -1.0304 +- -1.0579 +- -1.0188 +- -1.05 +- -1.0842 +- -1.0923 +- -1.1223 +- -1.2381 +- -1.6467 +spec_min: +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +- -6.0 +spk_cond_steps: [] +stop_token_weight: 5.0 +task_cls: usr.diffsinger_task.DiffSingerMIDITask +test_ids: [] +test_input_dir: '' +test_num: 0 +test_prefixes: +- '2044' +- 
'2086' +- '2092' +- '2093' +- '2100' +test_set_name: test +timesteps: 1000 +train_set_name: train +use_denoise: false +use_energy_embed: false +use_gt_dur: false +use_gt_f0: false +use_midi: true +use_nsf: true +use_pitch_embed: false +use_pos_embed: true +use_spk_embed: false +use_spk_id: false +use_split_spk_id: false +use_uv: true +use_var_enc: false +val_check_interval: 2000 +valid_num: 0 +valid_set_name: valid +vocoder: vocoders.hifigan.HifiGAN +vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 +warmup_updates: 2000 +wav2spec_eps: 1e-6 +weight_decay: 0 +win_size: 512 +work_dir: checkpoints/0831_opencpop_ds1000 +pndm_speedup: 10 diff --git a/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt b/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f36846cd61ffca537611feea3166011f480a443a --- /dev/null +++ b/checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e +size 170269591 diff --git a/checkpoints/Emotion_encoder.pt b/checkpoints/Emotion_encoder.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac214aba4b7a248c6742782392529b8442855805 --- /dev/null +++ b/checkpoints/Emotion_encoder.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9de4930cbd8e5ba51efdef84c326e3728a5482dd7668f82960e4cb0f97cc8e5 +size 17095350 diff --git a/checkpoints/GenerSpeech/config.yaml b/checkpoints/GenerSpeech/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed493feb76965929cd333ffafcb95f2d47cfc0e6 --- /dev/null +++ b/checkpoints/GenerSpeech/config.yaml @@ -0,0 +1,249 @@ +accumulate_grad_batches: 1 +amp: false +audio_num_mel_bins: 80 +audio_sample_rate: 16000 +base_config: +- egs/egs_bases/tts/fs2_adv.yaml +- egs/datasets/audio/emotion/base_text2mel.yaml +binarization_args: + reset_phone_dict: true + reset_word_dict: true + shuffle: true + trim_eos_bos: false + trim_sil: false + with_align: true + with_f0: true + with_f0cwt: false + with_linear: false + with_spk_embed: true + with_spk_id: true + with_txt: true + with_wav: true + with_word: true +binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer +binary_data_dir: data/binary/training_set +check_val_every_n_epoch: 10 +clip_grad_norm: 1 +clip_grad_value: 0 +conv_use_pos: false +crop: false +cwt_add_f0_loss: false +cwt_hidden_size: 128 +cwt_layers: 2 +cwt_loss: l1 +cwt_std_scale: 0.8 +debug: false +dec_dilations: +- 1 +- 1 +- 1 +- 1 +dec_ffn_kernel_size: 9 +dec_inp_add_noise: false +dec_kernel_size: 5 +dec_layers: 4 +dec_num_heads: 2 +decoder_rnn_dim: 0 +decoder_type: fft +dict_dir: '' +disc_hidden_size: 128 +disc_interval: 1 +disc_lr: 0.0001 +disc_norm: in +disc_reduction: stack +disc_start_steps: 0 +disc_win_num: 3 +discriminator_grad_norm: 1 +discriminator_optimizer_params: + eps: 1.0e-06 + weight_decay: 0.0 +discriminator_scheduler_params: + gamma: 0.5 + step_size: 60000 +dropout: 0.05 +ds_workers: 2 +dur_enc_hidden_stride_kernel: +- 0,2,3 +- 0,2,3 +- 0,1,3 +dur_loss: mse +dur_predictor_kernel: 3 +dur_predictor_layers: 2 +emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path +enc_dec_norm: ln +enc_dilations: +- 1 +- 1 +- 1 +- 1 +enc_ffn_kernel_size: 9 +enc_kernel_size: 5 +enc_layers: 4 +encoder_K: 8 +encoder_type: fft +endless_ds: true +ffn_act: gelu +ffn_hidden_size: 1024 +ffn_padding: SAME +fft_size: 
1024 +fmax: 7600 +fmin: 80 +forcing: 20000 +frames_multiple: 1 +gen_dir_name: '' +generator_grad_norm: 5.0 +griffin_lim_iters: 60 +hidden_size: 256 +hop_size: 256 +infer: false +lambda_commit: 0.25 +lambda_energy: 0.1 +lambda_f0: 1.0 +lambda_mel_adv: 0.1 +lambda_ph_dur: 0.1 +lambda_sent_dur: 1.0 +lambda_uv: 1.0 +lambda_word_dur: 1.0 +layers_in_block: 2 +load_ckpt: '' +loud_norm: false +lr: 1.0 +max_epochs: 1000 +max_frames: 1548 +max_input_tokens: 1550 +max_sentences: 100000 +max_tokens: 30000 +max_updates: 300000 +max_valid_sentences: 1 +max_valid_tokens: 60000 +mel_disc_hidden_size: 128 +mel_gan: true +mel_hidden_size: 256 +mel_loss: ssim:0.5|l1:0.5 +mel_vmax: 1.5 +mel_vmin: -6 +min_frames: 128 +min_level_db: -100 +nVQ: 128 +noise_scale: 0.8 +num_ckpt_keep: 2 +num_heads: 2 +num_sanity_val_steps: -1 +num_spk: 500 +num_test_samples: 72 +num_valid_plots: 10 +optimizer_adam_beta1: 0.5 +optimizer_adam_beta2: 0.999 +out_wav_norm: false +pitch_ar: false +pitch_embed_type: 0 +pitch_enc_hidden_stride_kernel: +- 0,2,5 +- 0,2,5 +- 0,2,5 +pitch_extractor: parselmouth +pitch_loss: l1 +pitch_norm: standard +pitch_ssim_win: 11 +pitch_type: frame +post_glow_hidden: 128 +post_glow_kernel_size: 3 +post_glow_n_block_layers: 3 +post_glow_n_blocks: 8 +post_share_cond_layers: false +pre_align_args: + allow_no_txt: false + denoise: false + sox_resample: false + sox_to_wav: false + trim_sil: false + txt_processor: en + use_tone: true +pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign +predictor_dropout: 0.5 +predictor_grad: 1.0 +predictor_hidden: -1 +predictor_kernel: 5 +predictor_layers: 2 +preprocess_args: + add_eos_bos: true + mfa_group_shuffle: false + mfa_offset: 0.02 + nsample_per_mfa_group: 1000 + reset_phone_dict: true + reset_word_dict: true + save_sil_mask: true + txt_processor: en + use_mfa: true + vad_max_silence_length: 12 + wav_processors: [] + with_phsep: true +preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign +pretrain_fs_ckpt: '' +print_nan_grads: false +processed_data_dir: data/processed/emotion +profile_infer: false +raw_data_dir: data/raw/ESD +ref_audio: '' +ref_hidden_stride_kernel: +- 0,3,5 +- 0,3,5 +- 0,2,5 +- 0,2,5 +- 0,2,5 +ref_level_db: 20 +ref_norm_layer: bn +rename_tmux: true +rerun_gen: false +resume_from_checkpoint: 0 +save_best: false +save_codes: [] +save_f0: false +save_gt: true +scheduler: rsqrt +seed: 1234 +share_wn_layers: 4 +sigmoid_scale: false +sil_add_noise: false +sort_by_len: true +task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask +tb_log_interval: 100 +test_ids: [] +test_input_dir: '' +test_num: 200 +test_set_name: test +text: '' +train_set_name: train +train_sets: '' +use_cond_disc: false +use_emotion: true +use_energy_embed: false +use_gt_dur: false +use_gt_f0: false +use_latent_cond: true +use_pitch_embed: true +use_pos_embed: true +use_ref_enc: false +use_spk_embed: true +use_spk_id: false +use_split_spk_id: false +use_txt_cond: true +use_uv: true +use_var_enc: false +use_word: true +vae_dropout: 0.0 +val_check_interval: 2000 +valid_infer_interval: 10000 +valid_monitor_key: val_loss +valid_monitor_mode: min +valid_set_name: valid +var_enc_vq_codes: 64 +vocoder: hifigan +vocoder_ckpt: checkpoints/trainset_hifigan +vocoder_denoise_c: 0.0 +vq_start: 20500 +warmup_updates: 2000 +weight_decay: 0 +win_size: 1024 +word_size: 30000 +work_dir: checkpoints/GenerSpeech diff --git a/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt b/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt new file mode 100644 index 
0000000000000000000000000000000000000000..def291d926fe008dc220e775ee525cdfe501d7c8 --- /dev/null +++ b/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b872bb686013cee2a98cc610b8b66b788c46ff4c33130682b63af4ac005405ea +size 619582860 diff --git a/checkpoints/trainset_hifigan/config.yaml b/checkpoints/trainset_hifigan/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df5c3117c000dd2d20637f52dc11b87b653142e2 --- /dev/null +++ b/checkpoints/trainset_hifigan/config.yaml @@ -0,0 +1,178 @@ +accumulate_grad_batches: 1 +adam_b1: 0.8 +adam_b2: 0.99 +amp: false +audio_num_mel_bins: 80 +audio_sample_rate: 16000 +aux_context_window: 0 +base_config: +- egs/egs_bases/tts/vocoder/hifigan.yaml +- egs/datasets/audio/emotion/base_text2mel.yaml +binarization_args: + reset_phone_dict: true + reset_word_dict: true + shuffle: true + trim_eos_bos: false + trim_sil: false + with_align: false + with_f0: true + with_f0cwt: false + with_linear: false + with_spk_embed: false + with_spk_id: true + with_txt: false + with_wav: true + with_word: false +binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer +binary_data_dir: data/binary/training_set +check_val_every_n_epoch: 10 +clip_grad_norm: 1 +clip_grad_value: 0 +debug: false +dec_ffn_kernel_size: 9 +dec_layers: 4 +dict_dir: '' +disc_start_steps: 40000 +discriminator_grad_norm: 1 +discriminator_optimizer_params: + lr: 0.0002 +discriminator_scheduler_params: + gamma: 0.999 + step_size: 600 +dropout: 0.1 +ds_workers: 1 +enc_ffn_kernel_size: 9 +enc_layers: 4 +endless_ds: true +ffn_act: gelu +ffn_padding: SAME +fft_size: 1024 +fmax: 7600 +fmin: 80 +frames_multiple: 1 +gen_dir_name: '' +generator_grad_norm: 10 +generator_optimizer_params: + lr: 0.0002 +generator_scheduler_params: + gamma: 0.999 + step_size: 600 +griffin_lim_iters: 60 +hidden_size: 256 +hop_size: 256 +infer: false +lambda_adv: 1.0 +lambda_cdisc: 4.0 +lambda_mel: 5.0 +lambda_mel_adv: 1.0 +load_ckpt: '' +loud_norm: false +lr: 2.0 +max_epochs: 1000 +max_frames: 1548 +max_input_tokens: 1550 +max_samples: 8192 +max_sentences: 24 +max_tokens: 30000 +max_updates: 1000000 +max_valid_sentences: 1 +max_valid_tokens: 60000 +mel_loss: ssim:0.5|l1:0.5 +mel_vmax: 1.5 +mel_vmin: -6 +min_frames: 128 +min_level_db: -100 +num_ckpt_keep: 3 +num_heads: 2 +num_mels: 80 +num_sanity_val_steps: -1 +num_spk: 10 +num_test_samples: 30 +num_valid_plots: 10 +optimizer_adam_beta1: 0.9 +optimizer_adam_beta2: 0.98 +out_wav_norm: false +pitch_extractor: parselmouth +pitch_type: frame +pre_align_args: + allow_no_txt: false + denoise: false + sox_resample: false + sox_to_wav: false + trim_sil: false + txt_processor: en + use_tone: true +pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign +print_nan_grads: false +processed_data_dir: data/processed/emotion,data/processed/LibriTTS +profile_infer: false +raw_data_dir: data/raw/ESD +ref_level_db: 20 +rename_tmux: true +resblock: '1' +resblock_dilation_sizes: +- - 1 + - 3 + - 5 +- - 1 + - 3 + - 5 +- - 1 + - 3 + - 5 +resblock_kernel_sizes: +- 3 +- 7 +- 11 +resume_from_checkpoint: 0 +save_best: true +save_codes: [] +save_f0: false +save_gt: true +scheduler: rsqrt +seed: 1234 +sort_by_len: true +task_cls: tasks.vocoder.hifigan.HifiGanTask +tb_log_interval: 100 +test_ids: [] +test_input_dir: '' +test_num: 200 +test_set_name: test +train_set_name: train +train_sets: '' +upsample_initial_channel: 512 +upsample_kernel_sizes: +- 16 +- 16 +- 4 +- 4 +upsample_rates: +- 8 +- 8 +- 
2 +- 2 +use_cdisc: false +use_cond_disc: false +use_emotion: true +use_fm_loss: false +use_ms_stft: false +use_pitch_embed: false +use_spec_disc: false +use_spk_embed: false +use_spk_id: true +use_split_spk_id: false +val_check_interval: 2000 +valid_infer_interval: 10000 +valid_monitor_key: val_loss +valid_monitor_mode: min +valid_set_name: valid +vocoder: pwg +vocoder_ckpt: '' +vocoder_denoise_c: 0.0 +warmup_updates: 8000 +weight_decay: 0 +win_length: null +win_size: 1024 +window: hann +word_size: 30000 +work_dir: checkpoints/trainset_hifigan diff --git a/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt b/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..9c71c2b0d75bd2867111cf7401bf8c7e0b77b03c --- /dev/null +++ b/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a2577919899400a111ef42a2aba65797d282c259d083d2c276539dda9d17870 +size 1016199247
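
Note on config inheritance: several of the YAML files above name a parent via inherit_from, which is resolved by load_config / merge_a_into_b in audio_to_text/captioning/utils/train_util.py (added in this diff). The child file is merged into its base recursively and child values win. Below is a minimal sketch of that behaviour; the temporary file names and the optimizer keys are made up purely for the demonstration and assume the repository root is on the import path.

import os
import tempfile

from audio_to_text.captioning.utils.train_util import load_config

with tempfile.TemporaryDirectory() as tmpdir:
    # Hypothetical base config and a child that inherits from it.
    with open(os.path.join(tmpdir, "base.yaml"), "w") as f:
        f.write("optimizer:\n  type: Adam\n  args:\n    lr: 0.001\n")
    with open(os.path.join(tmpdir, "child.yaml"), "w") as f:
        f.write("inherit_from: base.yaml\noptimizer:\n  args:\n    lr: 0.0005\n")

    config = load_config(os.path.join(tmpdir, "child.yaml"))
    # The nested value from the child overrides the base, the rest is inherited:
    # {'optimizer': {'type': 'Adam', 'args': {'lr': 0.0005}}}
    print(config)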
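train_util.py also ships a small MetricImprover helper for tracking the best validation value. A short illustrative sketch follows, assuming mode="max" for a score such as CIDEr (the numbers are invented); mode="min" would be used for a loss.

from audio_to_text.captioning.utils.train_util import MetricImprover

improver = MetricImprover(mode="max")
for cider in [0.31, 0.35, 0.34, 0.40]:
    if improver(cider):
        # Returns True only when the value improves on the best seen so far,
        # which is where a checkpoint save would typically be triggered.
        print(f"new best CIDEr: {cider:.2f}")

# The tracker state can be stored and restored alongside the model checkpoint.
state = improver.state_dict()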
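Finally, a minimal usage sketch for the AudioCapModel wrapper defined in audio_to_text/inference_waveform.py, pointed at the checkpoint directory audio_to_text/clotho_cntrstv_cnn14rnn_trm/ (config.yaml and swa.pth) added in this diff. The wav path "example.wav" is a placeholder, and the script is assumed to run from the repository root so the relative pretrained-encoder path inside config.yaml resolves.

import librosa
from audio_to_text.inference_waveform import AudioCapModel

model = AudioCapModel("audio_to_text/clotho_cntrstv_cnn14rnn_trm", device="cuda")

# Pass a file path (decoded internally with librosa at 32 kHz) ...
print(model.caption("example.wav")[0])

# ... or pass a raw waveform array directly.
wav, _ = librosa.load("example.wav", sr=32000)
print(model(wav)[0])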