350016z committed on
Commit
5651817
·
verified ·
1 Parent(s): a82b5c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -8
app.py CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
6
  from huggingface_hub import CommitScheduler
7
  from uuid import uuid4
8
  from datasets import load_dataset
 
9
 
10
  DATASET_DIR = Path("json_dataset")
11
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
@@ -17,19 +18,52 @@ scheduler = CommitScheduler(
17
  path_in_repo="data"
18
  )
19
 
20
- def download_dataset_file(dataset_id, file_name, local_dir):
21
- dataset = load_dataset(dataset_id, split="test")
22
- local_file_path = os.path.join(local_dir, file_name)
23
-
24
- df = pd.DataFrame(dataset)
25
- df.to_csv(local_file_path, index=False, encoding="utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  return local_file_path
27
 
28
  DATASET_ID = "350016z/flores_plus_Taiwanese"
29
  data_path = "test.csv"
30
  current_dir = os.getcwd()
31
- csv_files = [data_path]
32
- data_path = download_dataset_file(DATASET_ID, data_path, current_dir)
 
 
 
33
 
34
  # data_path = "test.csv"
35
  # current_dir = os.path.dirname(os.path.abspath(data_path))
 
6
  from huggingface_hub import CommitScheduler
7
  from uuid import uuid4
8
  from datasets import load_dataset
9
+ import shutil
10
 
11
  DATASET_DIR = Path("json_dataset")
12
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
 
18
  path_in_repo="data"
19
  )
20
 
21
def download_dataset_file(dataset_id, local_dir):
    """Materialize the "test" split of a Hub dataset as a CSV file in *local_dir*.

    The previous implementation reconstructed the Hugging Face cache path by
    splitting the cached filename on backslash separators with hard-coded
    directory depths (Windows-only; raises IndexError on Linux, where HF
    Spaces run) and then ``shutil.move``-d the CSV out of the cache, which
    corrupts the cache for later loads. Exporting the split directly is
    portable and leaves the cache intact.

    Args:
        dataset_id: Hub dataset repo id, e.g. "350016z/flores_plus_Taiwanese".
        local_dir: Existing directory that receives the CSV file.

    Returns:
        Path of the written CSV file, ``<local_dir>/test.csv``.
    """
    dataset = load_dataset(dataset_id)
    # Kept for parity with the original debug output.
    print(f"Cache File Info: {dataset.cache_files}\n")

    local_file_path = os.path.join(local_dir, "test.csv")
    # Dataset.to_csv forwards extra kwargs to pandas.DataFrame.to_csv;
    # index=False matches the pre-existing export behavior of this app.
    dataset["test"].to_csv(local_file_path, index=False)
    print(f"Saved test split to: {local_file_path}")
    return local_file_path
58
 
59
# Hub dataset to mirror locally, and the default CSV name of its test split.
DATASET_ID = "350016z/flores_plus_Taiwanese"
data_path = "test.csv"
current_dir = os.getcwd()

# Fetch the dataset's CSV into the working directory and remember its path.
data_path = download_dataset_file(DATASET_ID, current_dir)
print(f"Data path: {data_path}")


# Collect every CSV now present in the working directory.
csv_files = [name for name in os.listdir(current_dir) if name.endswith('.csv')]
67
 
68
  # data_path = "test.csv"
69
  # current_dir = os.path.dirname(os.path.abspath(data_path))