henry000 committed on
Commit
d3c8b75
·
1 Parent(s): ac1aadb

🔨 [Add] the automatic dataset download script

Browse files
config/data_config/coco.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ path: data/coco
requirements.txt CHANGED
@@ -1,4 +1,7 @@
1
  loguru
2
  pyyaml
3
  pytest
4
- torch
 
 
 
 
1
  loguru
2
  pyyaml
3
  pytest
4
+ torch
5
+ requests
6
+ tqdm
7
+ rich
utils/get_dataset.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import zipfile
3
+ import os
4
+ from tqdm.rich import tqdm
5
+ from loguru import logger
6
+
7
+
8
def download_file(url, dest_path):
    """
    Stream a file from *url* to *dest_path*, showing a rich progress bar.

    Logs the start and completion of the transfer via loguru and raises
    ``requests.HTTPError`` if the server responds with an error status.
    """
    file_label = os.path.basename(dest_path)
    logger.info(f"Downloading {file_label}...")
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # Content-Length may be absent; tqdm treats a total of 0 as "unknown".
        size_bytes = int(response.headers.get("content-length", 0))
        progress = tqdm(total=size_bytes, unit="iB", unit_scale=True, desc=file_label, leave=True)
        with open(dest_path, "wb") as out_file, progress as bar:
            # 1 MiB chunks keep syscall overhead low without hogging memory.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
                bar.update(len(chunk))
    logger.info("Download complete!")
23
+
24
+
25
def unzip_file(zip_path, extract_to):
    """
    Extract every member of the ZIP archive at *zip_path* into the
    directory *extract_to*, then delete the archive itself.
    """
    archive_name = os.path.basename(zip_path)
    logger.info(f"Unzipping {archive_name}...")
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(extract_to)
    # The archive is no longer needed once its contents are on disk.
    os.remove(zip_path)
    logger.info(f"Removed {zip_path}")
34
+
35
+
36
def check_files(directory, expected_count):
    """
    Return True when *directory* contains exactly *expected_count*
    regular files (subdirectories are not counted).
    """
    # scandir yields entries lazily; is_file() matches os.path.isfile semantics.
    with os.scandir(directory) as entries:
        file_total = sum(1 for entry in entries if entry.is_file())
    return file_total == expected_count
42
+
43
+
44
def download_coco_dataset(data_dir: str = "./data/coco"):
    """
    Download and extract the COCO 2017 train/test/val image archives
    into *data_dir*.

    For each split the function creates ``<data_dir>/<split>/images``,
    skips the split entirely if it already holds the expected number of
    image files, reuses an already-downloaded ZIP when present, and logs
    an error if the post-extraction file count does not match.

    Args:
        data_dir: Root directory for the dataset (created as needed).
    """
    base_url = "http://images.cocodataset.org/zips/"
    # archive name -> (split directory, expected number of image files)
    datasets = {"train2017.zip": ("train", 118287), "test2017.zip": ("test", 40670), "val2017.zip": ("val", 5000)}

    for file_name, (dataset_type, expected_files) in datasets.items():
        url = f"{base_url}{file_name}"
        local_zip_path = os.path.join(data_dir, file_name)
        extract_to = os.path.join(data_dir, dataset_type, "images")

        # Ensure the extraction directory (and data_dir itself) exists
        os.makedirs(extract_to, exist_ok=True)

        # Skip this split if the correct number of files is already present
        if check_files(extract_to, expected_files):
            logger.info(f"Dataset {dataset_type} already verified.")
            continue

        if os.path.exists(local_zip_path):
            logger.info(f"Dataset {dataset_type} already downloaded.")
        else:
            download_file(url, local_zip_path)

        # unzip_file also removes the ZIP once extraction succeeds
        unzip_file(local_zip_path, extract_to)

        # Additional verification post extraction
        if not check_files(extract_to, expected_files):
            logger.error(f"Error in verifying the {dataset_type} dataset after extraction.")
73
+
74
+
75
if __name__ == "__main__":
    # Project-local logging setup; imported lazily so the module can be
    # imported without pulling in the tools package.
    from tools import custom_logger

    # Configure loguru output, then fetch COCO into the default ./data/coco.
    custom_logger()
    download_coco_dataset()