350016z committed on
Commit
88ed347
·
verified ·
1 Parent(s): 847e429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import time
5
  import json
6
  from pathlib import Path
7
- from huggingface_hub import CommitScheduler
8
  from uuid import uuid4
9
  from datasets import load_dataset
10
  import shutil
@@ -21,19 +21,20 @@ scheduler = CommitScheduler(
21
 
22
  # Loading dataset from HuggingFace -------------------------------------------------------------------------------------
23
  def download_dataset_file(dataset_id, local_dir):
24
- dataset = load_dataset(dataset_id)
25
- cache_file_info = dataset.cache_files
26
- print(f"Cache File Info: {cache_file_info}\n")
27
-
28
- filename = cache_file_info['test'][0]['filename']
29
- snapshot_id = filename.split('/')[-2]
30
- dataset_name = "350016z--Taiwanese_dataset" # change there
31
- snapshot_path = os.path.join("/home/user/.cache/huggingface/hub", "datasets--"+dataset_name, "snapshots", snapshot_id)
 
 
32
  contents = os.listdir(snapshot_path)
33
 
34
  print("---------------------------------------")
35
  print(contents)
36
- print(os.listdir("/home/user/.cache/huggingface/hub/datasets--350016z--Taiwanese_dataset/snapshots/"))
37
  print("---------------------------------------")
38
 
39
  for file_name in contents:
 
4
  import time
5
  import json
6
  from pathlib import Path
7
+ from huggingface_hub import CommitScheduler, snapshot_download
8
  from uuid import uuid4
9
  from datasets import load_dataset
10
  import shutil
 
21
 
22
  # Loading dataset from HuggingFace -------------------------------------------------------------------------------------
23
  def download_dataset_file(dataset_id, local_dir):
24
+ # dataset = load_dataset(dataset_id)
25
+ # cache_file_info = dataset.cache_files
26
+ # print(f"Cache File Info: {cache_file_info}\n")
27
+
28
+ # filename = cache_file_info['test'][0]['filename']
29
+ # snapshot_id = filename.split('/')[-2]
30
+ # dataset_name = "350016z--Taiwanese_dataset" # change there
31
+ # snapshot_path = os.path.join("/home/user/.cache/huggingface/hub", "datasets--"+dataset_name, "snapshots", snapshot_id)
32
+ # contents = os.listdir(snapshot_path)
33
+ snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
34
  contents = os.listdir(snapshot_path)
35
 
36
  print("---------------------------------------")
37
  print(contents)
 
38
  print("---------------------------------------")
39
 
40
  for file_name in contents: