Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
6 |
from huggingface_hub import CommitScheduler
|
7 |
from uuid import uuid4
|
8 |
from datasets import load_dataset
|
|
|
9 |
|
10 |
DATASET_DIR = Path("json_dataset")
|
11 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
@@ -17,19 +18,52 @@ scheduler = CommitScheduler(
|
|
17 |
path_in_repo="data"
|
18 |
)
|
19 |
|
20 |
-
def download_dataset_file(dataset_id,
|
21 |
-
dataset = load_dataset(dataset_id
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
return local_file_path
|
27 |
|
28 |
DATASET_ID = "350016z/flores_plus_Taiwanese"
|
29 |
data_path = "test.csv"
|
30 |
current_dir = os.getcwd()
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
33 |
|
34 |
# data_path = "test.csv"
|
35 |
# current_dir = os.path.dirname(os.path.abspath(data_path))
|
|
|
6 |
from huggingface_hub import CommitScheduler
|
7 |
from uuid import uuid4
|
8 |
from datasets import load_dataset
|
9 |
+
import shutil
|
10 |
|
11 |
DATASET_DIR = Path("json_dataset")
|
12 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
18 |
path_in_repo="data"
|
19 |
)
|
20 |
|
21 |
+
def download_dataset_file(dataset_id, local_dir, split='test'):
    """Copy the dataset's CSV file(s) from the local hub cache into ``local_dir``.

    The dataset is loaded with ``load_dataset`` and the path of the first
    cache file of ``split`` is used to work out where the matching hub
    snapshot directory lives. Every ``.csv`` entry found in that snapshot
    directory is copied into ``local_dir``.

    Args:
        dataset_id: Identifier handed to ``load_dataset``.
        local_dir: Directory that receives the CSV file(s).
        split: Key looked up in ``dataset.cache_files``. Defaults to
            ``'test'``, the value that used to be hard-coded.

    Returns:
        Path of the last CSV file copied into ``local_dir``.

    Raises:
        ValueError: If the cache file path has no ``huggingface/datasets``
            segment followed by a dataset folder and a snapshot folder.
        FileNotFoundError: If the snapshot directory holds no ``.csv`` file.
    """
    dataset = load_dataset(dataset_id)
    cache_file_info = dataset.cache_files
    print(f"Cache File Info: {cache_file_info}\n")
    filename = cache_file_info[split][0]['filename']
    print(f"Filename: {filename}")

    # Normalise separators first: the path uses backslashes on Windows and
    # forward slashes elsewhere, and splitting on '\\' alone yields a single
    # element on POSIX hosts.
    parts = filename.replace('\\', '/').split('/')

    # Locate the ".../huggingface/datasets/<name>/..." segment by content
    # rather than by fixed index, so the depth of the home directory does
    # not matter.
    marker = next(
        (
            index
            for index, pair in enumerate(zip(parts, parts[1:]))
            if pair == ('huggingface', 'datasets')
        ),
        None,
    )
    # Needs at least <name>/<snapshot>/<file> after the marker pair.
    if marker is None or marker + 4 >= len(parts):
        raise ValueError(
            f"Unexpected cache file layout, cannot locate hub snapshot: {filename}"
        )

    # NOTE(review): assumes the parent folder of the cache file is named
    # after the hub snapshot (revision) id -- confirm for the installed
    # `datasets` version.
    snapshot_id = parts[-2]
    print(f"Snapshot ID: {snapshot_id}")

    # NOTE(review): the cache folder name may differ in letter case from the
    # hub repo folder; that only matters on case-sensitive file systems.
    dataset_name = parts[marker + 2].replace('___', '--')
    print(f"Dataset Name: {dataset_name}")

    hf_root = '/'.join(parts[:marker + 1])
    base_path = os.path.normpath(
        os.path.join(hf_root, 'hub', 'datasets--' + dataset_name)
    )
    print("Base Path: ", base_path)
    snapshot_path = os.path.join(base_path, "snapshots", snapshot_id)
    print(f"snapshot_path: {snapshot_path}")

    contents = os.listdir(snapshot_path)
    print("Contents of snapshot path:")
    print(contents)

    local_file_path = None
    for file_name in contents:
        print("Checking file: ", file_name)
        if file_name.endswith(".csv"):
            print("Found CSV file: ", file_name)
            source_file = os.path.join(snapshot_path, file_name)
            local_file_path = os.path.join(local_dir, file_name)
            # Copy instead of move: the cache stays intact for later loads,
            # and copy2 follows a symlinked snapshot entry to the real data
            # instead of relocating the link itself.
            shutil.copy2(source_file, local_file_path)

    if local_file_path is None:
        # Previously this fell through to an UnboundLocalError at `return`.
        raise FileNotFoundError(f"No CSV file found in {snapshot_path}")

    return local_file_path
|
58 |
|
59 |
# Dataset identifier passed to download_dataset_file below.
DATASET_ID = "350016z/flores_plus_Taiwanese"

# Initial value only; reassigned below with the path that
# download_dataset_file returns.
data_path = "test.csv"

# The CSV is placed in the process's current working directory.
current_dir = os.getcwd()
data_path = download_dataset_file(DATASET_ID, current_dir)
print(f"Data path: {data_path}")

# Names of all CSV files present in the working directory after the download.
csv_files = [
    entry
    for entry in os.listdir(current_dir)
    if entry.endswith('.csv')
]
|
67 |
|
68 |
# data_path = "test.csv"
|
69 |
# current_dir = os.path.dirname(os.path.abspath(data_path))
|