laurievb committed on
Commit
664c399
1 Parent(s): 895b334

Upload scripts/prepare_openlid_v2_for_model_training.sh with huggingface_hub

Browse files
scripts/prepare_openlid_v2_for_model_training.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# author: laurie
# script to sample OpenLID-v2 prior to training
# usage: bash prepare_openlid_v2_for_model_training.sh PATH_TO_OPENLID-V2
#
# Reads the dataset given as the first argument, writes the counts of its
# second tab-separated column to stats/openlid-v2-unsampled.counts, then passes
# the dataset and that counts file to scripts/sample_with_temperature.py and
# saves the script's stdout to openlid-v2-sampled.tsv.
# Every path except the input is relative to the current directory, so run
# this from the directory that contains scripts/.
set -euo pipefail

# Fail fast when the dataset path is missing: without a file operand `cut`
# would silently wait on stdin instead of reporting a problem.
if [[ $# -lt 1 || -z "${1:-}" ]]; then
  echo "usage: bash ${0##*/} PATH_TO_OPENLID-V2" >&2
  exit 2
fi

START_DIR=${PWD}
echo "starting dir is ${START_DIR}"
INPUT_DATA=$1 # should be openlid-v2 dataset

# The input is read twice below (by cut and by the sampler), so it has to be a
# regular file rather than a pipe or stream.
if [[ ! -f "${INPUT_DATA}" ]]; then
  echo "error: input data file not found: ${INPUT_DATA}" >&2
  exit 1
fi
echo "using openlid-v2 data from ${INPUT_DATA}"

echo "generating counts in stats/"
mkdir -p stats
# NOTE(review): `uniq -c` only merges adjacent identical lines, so this
# presumably relies on the dataset already being grouped by its second column.
# Confirm that, or sort before counting if it is not guaranteed.
cut -f2 -d$'\t' -- "${INPUT_DATA}" | uniq -c > stats/openlid-v2-unsampled.counts

echo "applying temperature sampling..."
python scripts/sample_with_temperature.py "${INPUT_DATA}" stats/openlid-v2-unsampled.counts > openlid-v2-sampled.tsv