import pandas as pd
from dotenv import load_dotenv
import os
import base64
from PIL import Image
import io
from datasets import Dataset, Image as HFImage
from pathlib import Path
from tqdm import tqdm

# Load environment variables (e.g. the Hugging Face token used by push_to_hub)
load_dotenv()

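# Step 1: Load the Vespa feed and flatten the nested "fields" objects into columns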
df = pd.read_json("output/vespa_feed_full.jsonl", lines=True)
df = pd.json_normalize(df["fields"].tolist())

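# Step 2: Decode the base64-encoded images and write them to disk as JPEG files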
dataset_dir = Path("hf_dataset")
image_dir = dataset_dir / "images"
os.makedirs(image_dir, exist_ok=True)


def save_image(image_data, filename):
    # Decode a base64-encoded image string and write it to the given path
    img_data = base64.b64decode(image_data)
    img = Image.open(io.BytesIO(img_data))
    img.save(filename)


# Write each image pair to disk and replace the base64 payloads with file paths
for idx, row in tqdm(df.iterrows(), total=len(df)):
    blur_filename = os.path.join(image_dir, f"blur_{idx}.jpg")
    full_filename = os.path.join(image_dir, f"full_{idx}.jpg")
    save_image(row["blur_image"], blur_filename)
    save_image(row["full_image"], full_filename)
    df.at[idx, "blur_image"] = blur_filename
    df.at[idx, "full_image"] = full_filename


# Step 3: Convert to a Hugging Face Dataset and push it to the Hub
dataset = (
    Dataset.from_dict(df.to_dict(orient="list"))
    .cast_column("blur_image", HFImage())
    .cast_column("full_image", HFImage())
)
dataset.push_to_hub("vespa-engine/gpfg-QA", private=True)
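
# Optional sanity check (a minimal sketch, assuming the push above succeeded and the
# token loaded by load_dotenv() also grants read access to the private repo): reload
# one example from the Hub and confirm the images decode back into PIL objects.
from datasets import load_dataset

reloaded = load_dataset("vespa-engine/gpfg-QA", split="train")
print(reloaded[0]["blur_image"])  # expected: a PIL.Image.Image instance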