Upload scripts/prepare_openlid_v2_for_model_training.sh with huggingface_hub
Browse files
scripts/prepare_openlid_v2_for_model_training.sh
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# author: laurie
#
# Script to sample OpenLID-v2 prior to training.
#
# Usage: bash prepare_openlid_v2_for_model_training.sh PATH_TO_OPENLID-V2
#
# Arguments:
#   $1 - path to the OpenLID-v2 data file (tab-separated; it is read twice,
#        so it must be a regular, re-readable file rather than a stream).
#
# Outputs (all relative to the current working directory):
#   stats/openlid-v2-unsampled.counts - `uniq -c` counts of column 2
#   openlid-v2-sampled.tsv            - stdout of sample_with_temperature.py
#
# The sampler is invoked as scripts/sample_with_temperature.py, so the script
# must be run from the directory that contains scripts/.
#
# Exit status: 2 on a usage error, 1 if the input is unreadable, otherwise the
# status of the first failing command (set -e / pipefail).
set -euo pipefail

# Fail fast on a missing argument: without a file operand `cut` would read
# stdin and the script would appear to hang.
if [[ $# -lt 1 || -z "${1:-}" ]]; then
  printf 'usage: bash %s PATH_TO_OPENLID-V2\n' "${0##*/}" >&2
  exit 2
fi

START_DIR=${PWD}
echo "starting dir is ${START_DIR}"
INPUT_DATA=$1 # should be openlid-v2 dataset
echo "using openlid-v2 data from ${1}"

if [[ ! -r "${INPUT_DATA}" ]]; then
  printf 'error: cannot read input data: %s\n' "${INPUT_DATA}" >&2
  exit 1
fi

echo "generating counts in stats/"
mkdir -p stats
# NOTE(review): `uniq -c` only collapses *adjacent* identical lines, so this
# assumes the input is already grouped by the value in column 2 (presumably
# the language label - confirm). If it is not grouped, a label will appear on
# several lines of the counts file.
cut -f2 -d$'\t' -- "${INPUT_DATA}" | uniq -c > stats/openlid-v2-unsampled.counts

echo "applying temperature sampling..."
python scripts/sample_with_temperature.py "${INPUT_DATA}" stats/openlid-v2-unsampled.counts > openlid-v2-sampled.tsv
|