File size: 1,294 Bytes
d08dd00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/bin/bash
#
# gen_mtxt.sh - assemble multilingual training text from per-language corpora.
#
# Usage: ./gen_mtxt.sh <data dir>
#
# For every language code in `langs`, reads "<data dir>/<lang>.txt" and
# appends the first S lines of that corpus (concatenated REPEATS times, so S
# may exceed the corpus length) to an output file, where
#
#     S = floor((lines * multiplier) ^ EXPONENT)
#
# Two outputs are produced:
#     <data dir>/train_small.txt   (multiplier 100)
#     <data dir>/train.txt         (multiplier 2100)
#
# NOTE(review): the sub-linear exponent presumably flattens the size gap
# between large and small corpora - confirm against the training recipe.
#
# Requires: bc (with the -l math library).
# Exit status: 0 on success, 1 on usage errors or failed preconditions.

# pipefail is deliberately NOT enabled: `head` closes the pipe early in
# build_split, so `cat` routinely terminates with SIGPIPE and that is expected.
set -eu

declare -ra langs=("as" "or" "kn" "ml" "ta" "te" "gu" "mr" "en" "hi" "pa" "bn")
readonly REPEATS=5     # how many copies of a corpus are concatenated before head
readonly EXPONENT=0.7  # power applied to (lines * multiplier)

# Print a message to stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage line to stderr and abort with a failure status.
usage() {
  echo "USAGE: ./gen_mtxt.sh <data dir>" >&2
  exit 1
}

#######################################
# Compute floor((lines * multiplier) ^ EXPONENT) using bc.
# Arguments: $1 - line count (positive integer)
#            $2 - multiplier (positive integer)
# Outputs:   the integer sample size on stdout
# Returns:   non-zero if bc fails
#######################################
sample_size() {
  local -r lines=$1 multiplier=$2
  local size
  # x^y is computed as e(y * ln x); bc -l supplies e() and l().
  size=$(printf 'e(l(%s*%s)*%s)/1\n' "$lines" "$multiplier" "$EXPONENT" | bc -l) \
    || return
  # bc -l keeps 20 decimal places; dropping the fraction floors the value.
  printf '%s\n' "${size%.*}"
}

#######################################
# Append a sample of every language corpus to one output file.
# Globals:   langs, REPEATS (read)
# Arguments: $1 - data directory containing <lang>.txt files
#            $2 - output file to append to
#            $3 - multiplier passed to sample_size
# Outputs:   progress on stdout, warnings on stderr
#######################################
build_split() {
  local -r data_dir=$1 output=$2 multiplier=$3
  local lang corpus lines target i
  local -a copies

  for lang in "${langs[@]}"; do
    corpus="$data_dir/$lang.txt"
    echo "Processing $lang"

    if [[ ! -r "$corpus" ]]; then
      echo "Skipping $lang: cannot read $corpus" >&2
      continue
    fi

    # Reading from stdin makes wc print only the number; some wc
    # implementations left-pad it, so strip all whitespace.
    lines=$(wc -l < "$corpus")
    lines=${lines//[[:space:]]/}

    # ln(0) is undefined, so an empty corpus cannot be sized; skip it.
    if (( lines == 0 )); then
      echo "Skipping $lang: $corpus has no lines" >&2
      continue
    fi

    target=$(sample_size "$lines" "$multiplier") \
      || die "bc failed while sizing the sample for $lang"
    [[ "$target" =~ ^[0-9]+$ ]] \
      || die "Unexpected sample size '$target' for $lang"

    echo "Sampling $target from $lines lines"
    if (( target > lines * REPEATS )); then
      echo "Warning: $lang has only $((lines * REPEATS)) lines available" \
        "($REPEATS copies); fewer than $target will be written" >&2
    fi

    copies=()
    for (( i = 0; i < REPEATS; i++ )); do
      copies+=("$corpus")
    done
    cat -- "${copies[@]}" | head -n "$target" >> "$output"
  done
}

main() {
  # The original check demanded two arguments although only the first is
  # used; a second argument is still tolerated (and ignored) so that
  # existing invocations keep working.
  if (( $# < 1 || $# > 2 )); then
    usage
  fi

  command -v bc > /dev/null || die "bc is required but was not found in PATH"

  local -r data_dir=$1
  [[ -d "$data_dir" ]] || die "Data directory not found: $data_dir"

  local -r small_output="$data_dir/train_small.txt"
  local -r full_output="$data_dir/train.txt"

  # Check both outputs before writing anything so the run never stops
  # half-way with one file generated and the other refused.
  local out
  for out in "$small_output" "$full_output"; do
    if [[ -e "$out" ]]; then
      die "Output file already exists. Please remove it first: $out"
    fi
  done

  # Generate train small file
  build_split "$data_dir" "$small_output" 100

  # Generate train file
  build_split "$data_dir" "$full_output" 2100
}

main "$@"