as1605 committed on
Commit
275cf7c
·
1 Parent(s): 1cadd18
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: motheecreator/Deepfake-audio-detection
4
+ tags:
5
+ - generated_from_trainer
6
+ datasets:
7
+ - audiofolder
8
+ metrics:
9
+ - accuracy
10
+ model-index:
11
+ - name: Deepfake-audio-detection-V2
12
+ results:
13
+ - task:
14
+ name: Audio Classification
15
+ type: audio-classification
16
+ dataset:
17
+ name: audiofolder
18
+ type: audiofolder
19
+ config: default
20
+ split: train
21
+ args: default
22
+ metrics:
23
+ - name: Accuracy
24
+ type: accuracy
25
+ value: 0.9972843305874898
26
+ ---
27
+
28
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
29
+ should probably proofread and complete it, then remove this comment. -->
30
+
31
+ # Deepfake-audio-detection-V2
32
+
33
+ This model is a fine-tuned version of [motheecreator/Deepfake-audio-detection](https://huggingface.co/motheecreator/Deepfake-audio-detection) on the audiofolder dataset.
34
+ It achieves the following results on the evaluation set:
35
+ - Loss: 0.0141
36
+ - Accuracy: 0.9973
37
+
38
+ ## Model description
39
+
40
+ More information needed
41
+
42
+ ## Intended uses & limitations
43
+
44
+ More information needed
45
+
46
+ ## Training and evaluation data
47
+
48
+ More information needed
49
+
50
+ ## Training procedure
51
+
52
+ ### Training hyperparameters
53
+
54
+ The following hyperparameters were used during training:
55
+ - learning_rate: 3e-05
56
+ - train_batch_size: 32
57
+ - eval_batch_size: 32
58
+ - seed: 42
59
+ - gradient_accumulation_steps: 4
60
+ - total_train_batch_size: 128
61
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
+ - lr_scheduler_type: cosine
63
+ - lr_scheduler_warmup_ratio: 0.1
64
+ - num_epochs: 5
65
+
66
+ ### Training results
67
+
68
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
69
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
70
+ | 0.0503 | 1.0 | 1381 | 0.0514 | 0.9858 |
71
+ | 0.0327 | 2.0 | 2762 | 0.0174 | 0.9956 |
72
+ | 0.0064 | 3.0 | 4143 | 0.0221 | 0.9950 |
73
+ | 0.0003 | 4.0 | 5524 | 0.0174 | 0.9965 |
74
+ | 0.0115 | 5.0 | 6905 | 0.0141 | 0.9973 |
75
+
76
+
77
+ ### Framework versions
78
+
79
+ - Transformers 4.41.2
80
+ - PyTorch 2.1.2
81
+ - Datasets 2.19.2
82
+ - Tokenizers 0.19.1
config.json ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "as1605/Deepfake-audio-detection-V2",
3
+ "base_model": "MelodyMachine/Deepfake-audio-detection-V2",
4
+ "activation_dropout": 0.0,
5
+ "adapter_attn_dim": null,
6
+ "adapter_kernel_size": 3,
7
+ "adapter_stride": 2,
8
+ "add_adapter": false,
9
+ "apply_spec_augment": true,
10
+ "architectures": [
11
+ "Wav2Vec2ForSequenceClassification"
12
+ ],
13
+ "attention_dropout": 0.1,
14
+ "bos_token_id": 1,
15
+ "classifier_proj_size": 256,
16
+ "codevector_dim": 256,
17
+ "contrastive_logits_temperature": 0.1,
18
+ "conv_bias": false,
19
+ "conv_dim": [
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512,
26
+ 512
27
+ ],
28
+ "conv_kernel": [
29
+ 10,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 3,
34
+ 2,
35
+ 2
36
+ ],
37
+ "conv_stride": [
38
+ 5,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2,
44
+ 2
45
+ ],
46
+ "ctc_loss_reduction": "sum",
47
+ "ctc_zero_infinity": false,
48
+ "diversity_loss_weight": 0.1,
49
+ "do_stable_layer_norm": false,
50
+ "eos_token_id": 2,
51
+ "feat_extract_activation": "gelu",
52
+ "feat_extract_norm": "group",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "freeze_feat_extract_train": true,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_size": 768,
60
+ "id2label": {
61
+ "0": "fake",
62
+ "1": "real"
63
+ },
64
+ "initializer_range": 0.02,
65
+ "intermediate_size": 3072,
66
+ "label2id": {
67
+ "fake": "0",
68
+ "real": "1"
69
+ },
70
+ "layer_norm_eps": 1e-05,
71
+ "layerdrop": 0.0,
72
+ "mask_channel_length": 10,
73
+ "mask_channel_min_space": 1,
74
+ "mask_channel_other": 0.0,
75
+ "mask_channel_prob": 0.0,
76
+ "mask_channel_selection": "static",
77
+ "mask_feature_length": 10,
78
+ "mask_feature_min_masks": 0,
79
+ "mask_feature_prob": 0.0,
80
+ "mask_time_length": 10,
81
+ "mask_time_min_masks": 2,
82
+ "mask_time_min_space": 1,
83
+ "mask_time_other": 0.0,
84
+ "mask_time_prob": 0.05,
85
+ "mask_time_selection": "static",
86
+ "model_type": "wav2vec2",
87
+ "no_mask_channel_overlap": false,
88
+ "no_mask_time_overlap": false,
89
+ "num_adapter_layers": 3,
90
+ "num_attention_heads": 12,
91
+ "num_codevector_groups": 2,
92
+ "num_codevectors_per_group": 320,
93
+ "num_conv_pos_embedding_groups": 16,
94
+ "num_conv_pos_embeddings": 128,
95
+ "num_feat_extract_layers": 7,
96
+ "num_hidden_layers": 12,
97
+ "num_negatives": 100,
98
+ "output_hidden_size": 768,
99
+ "pad_token_id": 0,
100
+ "proj_codevector_dim": 256,
101
+ "tdnn_dilation": [
102
+ 1,
103
+ 2,
104
+ 3,
105
+ 1,
106
+ 1
107
+ ],
108
+ "tdnn_dim": [
109
+ 512,
110
+ 512,
111
+ 512,
112
+ 512,
113
+ 1500
114
+ ],
115
+ "tdnn_kernel": [
116
+ 5,
117
+ 3,
118
+ 3,
119
+ 1,
120
+ 1
121
+ ],
122
+ "torch_dtype": "float32",
123
+ "transformers_version": "4.41.2",
124
+ "use_weighted_layer_sum": false,
125
+ "vocab_size": 32,
126
+ "xvector_output_dim": 512
127
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:997d9ce59e63151d5e444a6fa7c863986d0e56d515f67321bd705ac3b01bc38c
3
+ size 378302360
model/config.json ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "as1605/Deepfake-audio-detection-V2",
4
+ "base_model": "MelodyMachine/Deepfake-audio-detection-V2",
5
+ "activation_dropout": 0.0,
6
+ "adapter_attn_dim": null,
7
+ "adapter_kernel_size": 3,
8
+ "adapter_stride": 2,
9
+ "add_adapter": false,
10
+ "apply_spec_augment": true,
11
+ "architectures": [
12
+ "Wav2Vec2ForSequenceClassification"
13
+ ],
14
+ "attention_dropout": 0.1,
15
+ "bos_token_id": 1,
16
+ "classifier_proj_size": 256,
17
+ "codevector_dim": 256,
18
+ "contrastive_logits_temperature": 0.1,
19
+ "conv_bias": false,
20
+ "conv_dim": [
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512,
26
+ 512,
27
+ 512
28
+ ],
29
+ "conv_kernel": [
30
+ 10,
31
+ 3,
32
+ 3,
33
+ 3,
34
+ 3,
35
+ 2,
36
+ 2
37
+ ],
38
+ "conv_stride": [
39
+ 5,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2,
44
+ 2,
45
+ 2
46
+ ],
47
+ "ctc_loss_reduction": "sum",
48
+ "ctc_zero_infinity": false,
49
+ "diversity_loss_weight": 0.1,
50
+ "do_stable_layer_norm": false,
51
+ "eos_token_id": 2,
52
+ "feat_extract_activation": "gelu",
53
+ "feat_extract_norm": "group",
54
+ "feat_proj_dropout": 0.1,
55
+ "feat_quantizer_dropout": 0.0,
56
+ "final_dropout": 0.0,
57
+ "freeze_feat_extract_train": true,
58
+ "hidden_act": "gelu",
59
+ "hidden_dropout": 0.1,
60
+ "hidden_size": 768,
61
+ "id2label": {
62
+ "0": "fake",
63
+ "1": "real"
64
+ },
65
+ "initializer_range": 0.02,
66
+ "intermediate_size": 3072,
67
+ "label2id": {
68
+ "fake": "0",
69
+ "real": "1"
70
+ },
71
+ "layer_norm_eps": 1e-05,
72
+ "layerdrop": 0.0,
73
+ "mask_channel_length": 10,
74
+ "mask_channel_min_space": 1,
75
+ "mask_channel_other": 0.0,
76
+ "mask_channel_prob": 0.0,
77
+ "mask_channel_selection": "static",
78
+ "mask_feature_length": 10,
79
+ "mask_feature_min_masks": 0,
80
+ "mask_feature_prob": 0.0,
81
+ "mask_time_length": 10,
82
+ "mask_time_min_masks": 2,
83
+ "mask_time_min_space": 1,
84
+ "mask_time_other": 0.0,
85
+ "mask_time_prob": 0.05,
86
+ "mask_time_selection": "static",
87
+ "model_type": "wav2vec2",
88
+ "no_mask_channel_overlap": false,
89
+ "no_mask_time_overlap": false,
90
+ "num_adapter_layers": 3,
91
+ "num_attention_heads": 12,
92
+ "num_codevector_groups": 2,
93
+ "num_codevectors_per_group": 320,
94
+ "num_conv_pos_embedding_groups": 16,
95
+ "num_conv_pos_embeddings": 128,
96
+ "num_feat_extract_layers": 7,
97
+ "num_hidden_layers": 12,
98
+ "num_negatives": 100,
99
+ "output_hidden_size": 768,
100
+ "pad_token_id": 0,
101
+ "proj_codevector_dim": 256,
102
+ "tdnn_dilation": [
103
+ 1,
104
+ 2,
105
+ 3,
106
+ 1,
107
+ 1
108
+ ],
109
+ "tdnn_dim": [
110
+ 512,
111
+ 512,
112
+ 512,
113
+ 512,
114
+ 1500
115
+ ],
116
+ "tdnn_kernel": [
117
+ 5,
118
+ 3,
119
+ 3,
120
+ 1,
121
+ 1
122
+ ],
123
+ "transformers_version": "4.46.3",
124
+ "use_weighted_layer_sum": false,
125
+ "vocab_size": 32,
126
+ "xvector_output_dim": 512
127
+ }
model/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc1847c0e6f294572d38b04ab9715c4af1edde45ce1ff7b1523220615d1903e2
3
+ size 378551083
model/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd510614d7665c56a341d9e306c4baa816023771f32f7b900c2a947e42fc02f3
3
+ size 5176