lalalalalalalalalala committed on
Commit 6bf53fb · verified · 1 Parent(s): ef9fe17

Update run.py

Files changed (1)
  1. run.py +29 -20
run.py CHANGED
@@ -11,6 +11,13 @@ import hashlib
 import os
 import csv
 
+# pip install --no-cache-dir huggingface_hub[hf_transfer]
+def single_download(repo, fname, token, endpoint):
+    os.environ["TOKIO_WORKER_THREADS"] = "32"
+    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+    file = hf_hub_download(repo_id=repo, filename=fname, token=token, endpoint=endpoint, repo_type="dataset")
+    return file
+
 def load_hf_dataset(dataset_path, auth_token):
     dataset = load_dataset(dataset_path, token=auth_token)
     video_paths = dataset
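
Aside: the single_download helper added above opts in to the optional Rust-based hf_transfer backend (installed via the pip command in the comment), and HF_HUB_ENABLE_HF_TRANSFER must be set before hf_hub_download runs. A minimal usage sketch, with a hypothetical repo name and the shard naming scheme this script uses:

# Usage sketch for single_download (the repo name is a hypothetical placeholder;
# assumes `pip install huggingface_hub[hf_transfer]` as in the comment above).
shard_path = single_download(
    repo="user/my-video-dataset",       # hypothetical dataset repo
    fname="data/000000.parquet",        # shard naming used in fast_caption below
    token=None,                         # or an HF access token for private/gated repos
    endpoint="https://huggingface.co",  # default Hub endpoint
)
print(shard_path)  # local path to the cached parquet shard
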
@@ -29,27 +36,29 @@ def fast_caption(sys_prompt, usr_prompt, temp, top_p, max_tokens, model, key, en
 
     if video_hf and video_hf_auth:
         progress_info.append('Begin processing Hugging Face dataset.')
-        temp_parquet_file = hf_hub_download(
-            repo_id=video_hf,
-            filename='data/' + str(parquet_index).zfill(6) + '.parquet',
-            repo_type="dataset",
-            token=video_hf_auth,
-        )
-        parquet_file = pq.ParquetFile(temp_parquet_file)
-        for batch in parquet_file.iter_batches(batch_size=1):
+        os.environ["TOKIO_WORKER_THREADS"] = "8"
+        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+        pqfile = hf_hub_download(repo_id=video_hf, filename='data/' + str(parquet_index).zfill(6) + '.parquet', repo_type="dataset", local_dir="/dev/shm", token=video_hf_auth)
+
+        pf = pq.ParquetFile(pqfile)
+        for batch in pf.iter_batches(batch_size=1):
+            _chunk = []
             df = batch.to_pandas()
-            video = df['video'][0]
-            md5 = hashlib.md5(video).hexdigest()
-            with tempfile.NamedTemporaryFile(dir=temp_dir) as temp_file:
-                temp_file.write(video)
-                video_path = temp_file.name
-                processor = VideoProcessor(frame_format=frame_format, frame_limit=frame_limit)
-                frames = processor._decode(video_path)
-                base64_list = processor.to_base64_list(frames)
-                api = AzureAPI(key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
-                caption = api.get_caption(sys_prompt, usr_prompt, base64_list)
-                writer.writerow({'md5': md5, 'caption': caption})
-                progress_info.append(f"Processed video with MD5: {md5}")
+            for binary in df["video"]:
+                if binary:
+                    md5 = hashlib.md5(binary).hexdigest()
+                    _v = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+                    with open(_v.name, "wb") as f:
+                        f.write(binary)
+                    _chunk.append(_v.name)
+
+            processor = VideoProcessor(frame_format=frame_format, frame_limit=frame_limit)
+            frames = processor._decode(_v.name)
+            base64_list = processor.to_base64_list(frames)
+            api = AzureAPI(key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
+            caption = api.get_caption(sys_prompt, usr_prompt, base64_list)
+            writer.writerow({'md5': md5, 'caption': caption})
+            progress_info.append(f"Processed video with MD5: {md5}")
         return csv_filename, "\n".join(progress_info), None
     else:
         return "", "No video source selected.", None
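
For reference, the read-hash-spill pattern inside the new loop can be exercised on its own. A minimal, self-contained sketch, assuming a local parquet shard with a binary video column (the shard path is a placeholder):

import hashlib
import tempfile
import pyarrow.parquet as pq

pf = pq.ParquetFile("000000.parquet")        # placeholder shard path
for batch in pf.iter_batches(batch_size=1):  # one row per batch, as in run.py
    df = batch.to_pandas()
    for binary in df["video"]:
        if not binary:
            continue
        md5 = hashlib.md5(binary).hexdigest()  # content hash, used as the CSV key
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            tmp.write(binary)                  # spill raw bytes so a video decoder can open the file
        print(md5, tmp.name)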
 
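One caveat with the new loop: NamedTemporaryFile(delete=False) leaves the spilled .mp4 files behind, and the shard downloaded to the RAM-backed /dev/shm persists until removed. A hedged cleanup sketch (not part of this commit) that could run after each batch:

import os

def cleanup(paths, shard=None):
    # Best-effort removal of spilled temp videos and, optionally, the shard itself.
    for p in paths:
        try:
            os.remove(p)
        except FileNotFoundError:
            pass
    if shard and os.path.exists(shard):
        os.remove(shard)

# e.g. cleanup(_chunk) at the end of each batch, then cleanup([], shard=pqfile) when done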