Spaces:
Running
Running
File size: 1,798 Bytes
03f6091 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# -*- coding: utf-8 -*-
r"""
Corpora
==============
Available corpora to train/test Polos models.
"""
import os
import click
from torchnlp.download import download_file_maybe_extract
# Maps each supported corpus name to the URL of its downloadable zip archive.
# Keys are lowercase because download_corpus() lowercases the requested name
# before looking it up here.
corpus2download = {
"apequest": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/hter/apequest.zip",
"qt21": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/hter/qt21.zip",
"wmt-metrics": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/da/wmt-metrics.zip",
"doc-wmt19": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/da/doc-wmt19.zip",
}
def download_corpus(corpus: str, saving_directory: str = None) -> None:
    """Download one of the known corpora into a local folder.

    The corpus name is lowercased and looked up in ``corpus2download``. If a
    folder named after the corpus already exists inside ``saving_directory``
    nothing is downloaded. After a download, the leftover ``.zip`` or
    ``.tar.gz`` archive is removed from ``saving_directory`` when present.

    :param corpus: Name of the corpus to be loaded (case-insensitive).
    :param saving_directory: RELATIVE path to the saving folder; defaults to
        ``data/`` when omitted or empty. A trailing separator is optional.
    :raises ValueError: If ``corpus`` is not cached and is not a key of
        ``corpus2download``.
    """
    corpus = corpus.lower()
    if not saving_directory:
        saving_directory = "data/"

    # Join path components instead of concatenating strings, so a directory
    # given without a trailing separator ("data") resolves to "data/<corpus>"
    # rather than "data<corpus>".
    corpus_path = os.path.join(saving_directory, corpus)

    if os.path.isdir(corpus_path):
        click.secho(f"{corpus} is already in cache.", fg="yellow")
        return

    # Validate before touching the filesystem so that an unknown name does
    # not leave an empty saving directory behind.
    if corpus not in corpus2download:
        raise ValueError(f"{corpus} is not a valid corpus!")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(saving_directory, exist_ok=True)
    download_file_maybe_extract(
        corpus2download[corpus],
        directory=saving_directory,
    )
    click.secho("Download succeeded.", fg="yellow")

    # Remove the first compressed archive found; warn if neither exists.
    for extension in (".zip", ".tar.gz"):
        archive_path = corpus_path + extension
        if os.path.exists(archive_path):
            os.remove(archive_path)
            break
    else:
        click.secho("Fail to delete compressed file.", fg="red")
|