File size: 1,798 Bytes
03f6091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
r"""
Corpora
==============
    Available corpora to train/test Polos models.
"""
import os

import click

from torchnlp.download import download_file_maybe_extract

# Download locations of the supported corpora, keyed by corpus name.
# Every archive is a ``<name>.zip`` file stored in a sub-folder of the same
# S3 bucket, so the URLs are assembled from (name, sub-folder) pairs.
corpus2download = {
    name: (
        "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com"
        f"/polos/{folder}/{name}.zip"
    )
    for name, folder in (
        ("apequest", "hter"),
        ("qt21", "hter"),
        ("wmt-metrics", "da"),
        ("doc-wmt19", "da"),
    )
}


def download_corpus(corpus: str, saving_directory: str = None) -> None:
    """Function that downloads a corpus from AWS.

    The corpus name is lower-cased before use. If a directory named after the
    corpus already exists inside ``saving_directory`` nothing is downloaded.
    Otherwise the archive registered in ``corpus2download`` is fetched with
    ``download_file_maybe_extract`` and the leftover ``.zip`` / ``.tar.gz``
    file is removed afterwards.

    :param corpus: Name of the corpus to be loaded (must be a key of
        ``corpus2download`` once lower-cased).
    :param saving_directory: RELATIVE path to the saving folder. Falls back to
        ``data/`` when ``None`` or empty. A trailing path separator is not
        required.

    :raises ValueError: if ``corpus`` is not cached and is not a known corpus.
    """
    corpus = corpus.lower()
    if not saving_directory:
        saving_directory = "data/"

    # os.path.join keeps the paths correct whether or not the caller supplied
    # a trailing separator ("data" and "data/" behave the same).
    if os.path.isdir(os.path.join(saving_directory, corpus)):
        click.secho(f"{corpus} is already in cache.", fg="yellow")
        return

    # Validate before touching the filesystem so an invalid name does not
    # leave an empty saving directory behind.
    if corpus not in corpus2download:
        raise ValueError(f"{corpus} is not a valid corpus!")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(saving_directory, exist_ok=True)

    download_file_maybe_extract(
        corpus2download[corpus],
        directory=saving_directory,
    )
    click.secho("Download succeeded.", fg="yellow")

    # Remove the first leftover archive found; warn when there is none.
    for extension in (".zip", ".tar.gz"):
        archive = os.path.join(saving_directory, corpus + extension)
        if os.path.exists(archive):
            os.remove(archive)
            break
    else:
        click.secho("Fail to delete compressed file.", fg="red")