import json
import pickle
import pandas as pd
import os
import csv
import hashlib
import os.path as osp
import time
import numpy as np
import validators
import mimetypes
import multiprocessing as mp
from .misc import toliststr
from .vlm import decode_base64_to_image_file


def decode_img_omni(tup):
    # Decode one record: write each base64-encoded image to a local file (skipping
    # files that already exist) and return the list of local image paths.
    root, im, p = tup
    images = toliststr(im)
    paths = toliststr(p)
    if len(images) > 1 and len(paths) == 1:
        paths = [osp.splitext(p)[0] + f'_{i}' + osp.splitext(p)[1] for i in range(len(images))]

    assert len(images) == len(paths)
    paths = [osp.join(root, p) for p in paths]
    for p, im in zip(paths, images):
        if osp.exists(p):
            continue
        if isinstance(im, str) and len(im) > 64:
            decode_base64_to_image_file(im, p)
    return paths


def localize_df(data, dname, nproc=32):
    # Dump the base64 images of a dataset DataFrame to files under
    # $LMUData/images/<dname>, replacing the 'image' column with 'image_path'.
    assert 'image' in data
    indices = list(data['index'])
    indices_str = [str(x) for x in indices]
    images = list(data['image'])
    image_map = {x: y for x, y in zip(indices_str, images)}

    root = LMUDataRoot()
    root = osp.join(root, 'images', dname)
    os.makedirs(root, exist_ok=True)

    if 'image_path' in data:
        img_paths = list(data['image_path'])
    else:
        img_paths = []
        for i in indices_str:
            if len(image_map[i]) <= 64:
                # Short entries are cross-references to another index holding the real image.
                idx = image_map[i]
                assert idx in image_map and len(image_map[idx]) > 64
                img_paths.append(f'{idx}.jpg')
            else:
                img_paths.append(f'{i}.jpg')

    tups = [(root, im, p) for p, im in zip(img_paths, images)]
    pool = mp.Pool(nproc)
    ret = pool.map(decode_img_omni, tups)
    pool.close()

    data.pop('image')
    if 'image_path' not in data:
        data['image_path'] = [x[0] if len(x) == 1 else x for x in ret]
    return data
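
# Example (illustrative sketch): materialize the base64 images of a benchmark
# DataFrame on disk. The dataset name and TSV path below are hypothetical.
#     data = load(osp.join(LMUDataRoot(), 'MMBench_DEV_EN.tsv'))
#     data = localize_df(data, 'MMBench_DEV_EN', nproc=16)   # adds 'image_path', drops 'image'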


def LMUDataRoot():
    if 'LMUData' in os.environ and osp.exists(os.environ['LMUData']):
        return os.environ['LMUData']
    home = osp.expanduser('~')
    root = osp.join(home, 'LMUData')
    os.makedirs(root, exist_ok=True)
    return root


def MMBenchOfficialServer(dataset_name):
    # Return True if a local TSV for the dataset exists and carries a complete 'answer' column.
    root = LMUDataRoot()

    if dataset_name in ['MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11']:
        ans_file = f'{root}/{dataset_name}.tsv'
        if osp.exists(ans_file):
            data = load(ans_file)
            if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
                return True

    if dataset_name in ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']:
        ans_file1 = f'{root}/{dataset_name}.tsv'
        mapp = {
            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_CN': 'MMBench_CN',
            'MMBench_TEST_EN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11',
        }
        ans_file2 = f'{root}/{mapp[dataset_name]}.tsv'
        for f in [ans_file1, ans_file2]:
            if osp.exists(f):
                data = load(f)
                if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
                    return True

    return False


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy scalar and array types."""

    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                            np.int16, np.int32, np.int64, np.uint8,
                            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
            return {'real': obj.real, 'imag': obj.imag}
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        elif isinstance(obj, (np.bool_)):
            return bool(obj)
        elif isinstance(obj, (np.void)):
            return None
        return json.JSONEncoder.default(self, obj)


# LOAD & DUMP
def dump(data, f, **kwargs):
    def dump_pkl(data, pth, **kwargs):
        pickle.dump(data, open(pth, 'wb'))

    def dump_json(data, pth, **kwargs):
        json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)

    def dump_jsonl(data, f, **kwargs):
        lines = [json.dumps(x, ensure_ascii=False, cls=NumpyEncoder) for x in data]
        with open(f, 'w', encoding='utf8') as fout:
            fout.write('\n'.join(lines))

    def dump_xlsx(data, f, **kwargs):
        data.to_excel(f, index=False, engine='xlsxwriter')

    def dump_csv(data, f, quoting=csv.QUOTE_ALL):
        data.to_csv(f, index=False, encoding='utf-8', quoting=quoting)

    def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
        data.to_csv(f, sep='\t', index=False, encoding='utf-8', quoting=quoting)

    handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv)
    suffix = f.split('.')[-1]
    return handlers[suffix](data, f, **kwargs)


def load(f, fmt=None):
    def load_pkl(pth):
        return pickle.load(open(pth, 'rb'))

    def load_json(pth):
        return json.load(open(pth, 'r', encoding='utf-8'))

    def load_jsonl(f):
        lines = open(f, encoding='utf-8').readlines()
        lines = [x.strip() for x in lines]
        if lines[-1] == '':
            lines = lines[:-1]
        data = [json.loads(x) for x in lines]
        return data

    def load_xlsx(f):
        return pd.read_excel(f)

    def load_csv(f):
        return pd.read_csv(f)

    def load_tsv(f):
        return pd.read_csv(f, sep='\t')

    handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv)
    if fmt is not None:
        return handlers[fmt](f)

    suffix = f.split('.')[-1]
    return handlers[suffix](f)
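
# Example (illustrative sketch): the handler is picked from the file suffix, so the same
# two calls cover pkl / json / jsonl / xlsx / csv / tsv. The path below is hypothetical.
#     df = pd.DataFrame({'index': [0, 1], 'question': ['Q0', 'Q1']})
#     dump(df, '/tmp/sample.tsv')      # written via pandas.to_csv(sep='\t')
#     df2 = load('/tmp/sample.tsv')    # read back via pandas.read_csv(sep='\t')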


def download_file(url, filename=None):
    import urllib.request
    from tqdm import tqdm

    class DownloadProgressBar(tqdm):
        def update_to(self, b=1, bsize=1, tsize=None):
            if tsize is not None:
                self.total = tsize
            self.update(b * bsize - self.n)

    if filename is None:
        filename = url.split('/')[-1]

    # If HF_ENDPOINT is set, replace huggingface.co with it
    if 'huggingface.co' in url and os.environ.get('HF_ENDPOINT', '') != '':
        url = url.replace('huggingface.co', os.environ['HF_ENDPOINT'].split('://')[1])

    try:
        with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
            urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to)
    except Exception:
        # Handle failed downloads from huggingface.co by falling back to the mirror
        if 'huggingface.co' in url:
            url_new = url.replace('huggingface.co', 'hf-mirror.com')
            try:
                os.system(f'wget {url_new} -O {filename}')
            except Exception:
                raise Exception(f'Failed to download {url}')
        else:
            raise Exception(f'Failed to download {url}')
    return filename


def ls(dirname='.', match=[], mode='all', level=1):
    # List entries under `dirname`. `match` keeps entries containing a substring
    # (or excludes them when prefixed with '!'); `mode` selects files, dirs, or both;
    # `level` controls recursion depth, and a string like '2+' lists files at every
    # depth from 1 to 2.
    if isinstance(level, str):
        assert '+' in level
        level = int(level[:-1])
        res = []
        for i in range(1, level + 1):
            res.extend(ls(dirname, match=match, mode='file', level=i))
        return res

    if dirname == '.':
        ans = os.listdir(dirname)
    else:
        ans = [osp.join(dirname, x) for x in os.listdir(dirname)]
    assert mode in ['all', 'dir', 'file']
    assert level >= 1 and isinstance(level, int)

    if level == 1:
        if isinstance(match, str):
            match = [match]
        for m in match:
            if len(m) == 0:
                continue
            if m[0] != '!':
                ans = [x for x in ans if m in x]
            else:
                ans = [x for x in ans if m[1:] not in x]
        if mode == 'dir':
            ans = [x for x in ans if osp.isdir(x)]
        elif mode == 'file':
            ans = [x for x in ans if not osp.isdir(x)]
        return ans
    else:
        dirs = [x for x in ans if osp.isdir(x)]
        res = []
        for d in dirs:
            res.extend(ls(d, match=match, mode=mode, level=level - 1))
        return res
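
# Example (illustrative sketch, hypothetical directory): list all *.tsv files up to
# two levels deep, excluding names that contain 'backup'.
#     tsv_files = ls('/data/LMUData', match=['.tsv', '!backup'], mode='file', level='2+')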


def mrlines(fname, sp='\n'):
    f = open(fname).read().split(sp)
    while f != [] and f[-1] == '':
        f = f[:-1]
    return f


def mwlines(lines, fname):
    with open(fname, 'w') as fout:
        fout.write('\n'.join(lines))


def md5(s):
    # If `s` is an existing file path, hash the file contents in 1 MiB chunks;
    # otherwise hash the string itself.
    hash = hashlib.new('md5')
    if osp.exists(s):
        with open(s, 'rb') as f:
            for chunk in iter(lambda: f.read(2**20), b''):
                hash.update(chunk)
    else:
        hash.update(s.encode('utf-8'))
    return str(hash.hexdigest())


def last_modified(pth):
    # Return the modification time of `pth` as a 'YYMMDDHHMMSS' string.
    stamp = osp.getmtime(pth)
    m_ti = time.ctime(stamp)
    t_obj = time.strptime(m_ti)
    t = time.strftime('%Y%m%d%H%M%S', t_obj)[2:]
    return t


def parse_file(s):
    # Classify `s` as a local file, a downloadable URL, or plain text, and return
    # a (mime_type, path_or_text) tuple.
    if osp.exists(s) and s != '.':
        assert osp.isfile(s)
        suffix = osp.splitext(s)[1].lower()
        mime = mimetypes.types_map.get(suffix, 'unknown')
        return (mime, s)
    elif validators.url(s):
        suffix = osp.splitext(s)[1].lower()
        if suffix in mimetypes.types_map:
            mime = mimetypes.types_map[suffix]
            dname = osp.join(LMUDataRoot(), 'files')
            os.makedirs(dname, exist_ok=True)
            tgt = osp.join(dname, md5(s) + suffix)
            download_file(s, tgt)
            return (mime, tgt)
        else:
            return ('url', s)
    else:
        return (None, s)
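
# Example (illustrative sketch, hypothetical URL): a URL with a recognized suffix is
# downloaded into $LMUData/files and returned as a local path plus its mime type.
#     mime, path = parse_file('https://example.com/images/demo.jpg')
#     # -> ('image/jpeg', '<LMUDataRoot>/files/<md5-of-url>.jpg')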


def file_size(f, unit='GB'):
    stats = os.stat(f)
    div_map = {
        'GB': 2 ** 30,
        'MB': 2 ** 20,
        'KB': 2 ** 10,
    }
    return stats.st_size / div_map[unit]


def parquet_to_tsv(file_path):
    # Convert a parquet file to a TSV with the same stem in the same directory.
    data = pd.read_parquet(file_path)
    pth = '/'.join(file_path.split('/')[:-1])
    data_name = file_path.split('/')[-1].split('.')[0]
    data.to_csv(osp.join(pth, f'{data_name}.tsv'), sep='\t', index=False)
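
# Example (illustrative, hypothetical path):
#     parquet_to_tsv('/data/mmbench/dev.parquet')   # writes /data/mmbench/dev.tsv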