File size: 4,466 Bytes
2564dd3
 
 
 
 
 
 
 
 
 
 
 
28b0783
2564dd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28b0783
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


# Paths
# Input manifest: parsed with json.load below and iterated item by item.
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage3/filtered_video_image_asr_caption_stage3.json'
# Base directories; process_item joins the value stored under an item's
# 'audio_asr' / 'audio_caption' / 'video' / 'image' key onto the matching
# folder and checks that the resulting path exists.
# NOTE(review): audio_asr_folder is the data root rather than a dedicated
# sub-directory -- presumably 'audio_asr' values carry their own sub-path; confirm.
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/audio_caption'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
# NOTE(review): images are looked up underneath the video directory -- confirm this is intended.
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
# Output manifest: receives only the items that passed every file check.
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/video_image_asr_caption_pre_1211.json'

# Load JSON data
# Read explicitly as UTF-8 (the same encoding the filtered output is written
# with) instead of relying on the platform's locale default, which can fail
# or mis-decode non-ASCII text in the manifest.
with open(data_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to check if a file exists in a folder
def file_exists(folder, filename):
    """Return True when `filename`, joined onto `folder`, names an existing path.

    Any existing path counts (regular file or directory), because the check
    is a plain ``os.path.exists`` on the joined path.
    """
    candidate = os.path.join(folder, filename)
    return os.path.exists(candidate)

# Per-type tallies: "total" counts the items that carry a given file key and
# "missing" counts those whose referenced file was empty or not found.
# "unknown" is the bucket for items that carry none of the four file keys.
# Each type gets its own fresh inner dict so the counters stay independent.
file_counts = {
    kind: {"total": 0, "missing": 0}
    for kind in ("video", "audio_asr", "audio_caption", "image", "unknown")
}

# Serializes updates to the shared `file_counts` tallies. process_item runs on
# many worker threads at once and `counter += 1` on a dict entry is a
# read-modify-write that is not guaranteed to be atomic, so unsynchronised
# updates can be lost.
_file_counts_lock = threading.Lock()


# Helper function to process each item in the dataset
def process_item(item):
    """Check that every media file referenced by `item` exists on disk.

    For each of the keys 'video', 'audio_asr', 'audio_caption' and 'image'
    that is present in `item`, the stored file name is joined onto the
    matching base folder and tested with `file_exists`. An empty/falsy file
    name counts as missing.

    Args:
        item: One entry of the loaded manifest (supports ``key in item`` and
            ``item[key]``).

    Returns:
        A dict with:
          * "item": the input item, unchanged;
          * "valid": True only if at least one media key is present and all
            referenced files were found;
          * "missing": one human-readable message per missing file.

    Side effects:
        Updates the module-level `file_counts` tallies (under a lock). Items
        with none of the four keys are tallied under "unknown".
    """
    # (key in the item, folder the file is looked up in, label for the message)
    checks = (
        ('video', video_folder, "Video"),
        ('audio_asr', audio_asr_folder, "Audio ASR"),
        ('audio_caption', audio_caption_folder, "Audio caption"),
        ('image', image_folder, "Image"),
    )

    result = {"item": item, "valid": True, "missing": []}
    present = []  # media types this item refers to
    absent = []   # subset of `present` whose file is empty or not found

    for key, folder, label in checks:
        if key not in item:
            continue
        present.append(key)
        filename = item[key]
        if not filename or not file_exists(folder, filename):
            result['missing'].append(f"{label} file missing or not found: {filename}")
            absent.append(key)

    if not present:
        # No recognised media key at all: reject the item and tally it as
        # "unknown" (counted as both one total and one missing).
        present.append("unknown")
        absent.append("unknown")

    result['valid'] = not absent

    # Apply all tallies in one short critical section; the (slow) file-system
    # checks above are deliberately done outside the lock.
    with _file_counts_lock:
        for key in present:
            file_counts[key]["total"] += 1
        for key in absent:
            file_counts[key]["missing"] += 1

    return result

# List to store results
new_items = []  # items whose referenced files were all found
texts = []      # rejected items, kept for the per-type breakdown further down

# Use ThreadPoolExecutor for multithreaded processing
# The per-item work is file-system existence checks, so threads overlap the I/O waits.
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    # executor.map yields results in input order, so the filtered manifest
    # keeps the ordering of the source manifest and is reproducible between
    # runs. (Collecting with as_completed ordered the output by whichever
    # thread happened to finish first.)
    for result in tqdm(executor.map(process_item, data), total=len(data)):
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)

# Save new_items to a JSON file
# Make sure the destination directory exists first, so a missing folder does
# not throw away the whole run at the very last step. The guard skips
# makedirs when the path has no directory component.
output_dir = os.path.dirname(new_json_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)

# Print the summary of missing and total files by type
print(f"Saved {len(new_items)} valid items to {new_json_path}")
# Plain string: this line has no placeholders, so it needs no f-prefix.
print("Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")

miss = {'image':[], 'video':[], 'audio_caption':[], 'audio_asr':[]}
for text in texts:
    if 'image' in text:
        miss['image'].append(text)
    if 'video' in text:
        miss['video'].append(text)