File size: 5,043 Bytes
a47b2ba
 
6c33596
632758a
 
 
798ee13
 
 
d03f6fb
632758a
d03f6fb
a47b2ba
632758a
 
 
 
 
798ee13
632758a
798ee13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632758a
 
 
 
 
 
 
 
 
 
 
 
a47b2ba
632758a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798ee13
 
 
 
 
c384cc2
632758a
 
 
 
 
 
 
 
 
 
ef2d262
c384cc2
632758a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a47b2ba
 
632758a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import io
import json
import os
import random
from datetime import datetime, timezone

import gradio as gr
import numpy as np
from datasets import load_dataset
from PIL import Image

# Hugging Face access token read from the environment (None when unset);
# passed to load_dataset() below to authenticate the dataset download.
access_token = os.environ.get("HUGGINGFACE_TOKEN")

class DatasetViewer:
    """In-memory viewer over the PhotoshopRequest random-sample dataset.

    Loads the full ``train`` split once at construction and serves random
    samples — a markdown block describing the post plus the source/edited
    image pair — for display in the Gradio UI.
    """

    def __init__(self):
        self.dataset = None             # datasets.Dataset once load_dataset() runs
        self.dataset_size = 0           # number of rows in the loaded split
        self.last_refresh_time = None   # aware UTC datetime of the last load
        self.max_display_size = (800, 600)  # max (width, height) for displayed images
        self.load_dataset()

    def resize_image(self, image):
        """Return *image* as a numpy array scaled to fit ``max_display_size``.

        Accepts a PIL Image, a numpy array, or raw encoded bytes. Aspect
        ratio is preserved; images already within bounds are returned at
        their original size.
        """
        if isinstance(image, np.ndarray):
            # Convert numpy array to PIL Image for high-quality resampling.
            image = Image.fromarray(image)
        elif isinstance(image, bytes):
            # Decode raw image bytes into a PIL Image.
            image = Image.open(io.BytesIO(image))

        # Scale factor that fits both dimensions inside the display bounds.
        width_ratio = self.max_display_size[0] / image.width
        height_ratio = self.max_display_size[1] / image.height
        scale_factor = min(width_ratio, height_ratio)

        # Only shrink; never upscale images smaller than the bounds.
        if scale_factor < 1:
            new_width = int(image.width * scale_factor)
            new_height = int(image.height * scale_factor)
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Gradio's Image component accepts numpy arrays directly.
        return np.array(image)

    def load_dataset(self):
        """Load the complete dataset split into memory and stamp refresh time."""
        # Non-streaming load: the whole split is materialized so that
        # random indexing in get_next_samples() is cheap.
        self.dataset = load_dataset(
            "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
            split="train",
            token=access_token
        )

        self.dataset_size = len(self.dataset)
        # Aware UTC timestamp — get_info() labels it "UTC", so it must
        # actually be UTC rather than naive local time.
        self.last_refresh_time = datetime.now(timezone.utc)

    def get_next_samples(self, num_samples=5):
        """Return a flat tuple of (markdown, source_image, edited_image) per sample.

        Picks ``num_samples`` distinct random rows (fewer if the dataset is
        smaller) and resizes both images for display.
        """
        # Distinct random indices; min() guards against oversized requests.
        indices = random.sample(range(self.dataset_size), min(num_samples, self.dataset_size))

        results = []
        for idx in indices:
            sample = self.dataset[idx]

            # Post identity and canonical Reddit permalink.
            post_id = sample["post_id"]
            title = sample["title"]
            reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

            # Best-effort extraction of the post body; missing/malformed
            # json_data simply leaves the body empty.
            selftext = ""
            try:
                selftext = json.loads(sample["json_data"])["post"]["selftext"]
            except (KeyError, TypeError, json.JSONDecodeError):
                print(f"No selftext found for post {post_id}")

            # Markdown block shown above the image pair.
            markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"

            # Append the triple (post_info, source_image, edited_image)
            # in the exact order the UI output components expect.
            results.append(markdown_text)
            results.append(self.resize_image(sample["source_image"]))
            results.append(self.resize_image(sample["edited_image"]))

        return tuple(results)

    def get_info(self):
        """Return an HTML snippet with dataset size and last refresh time."""
        return f"""
        <div style="text-align: center;">
            <hr>
            Dataset Size: {self.dataset_size} items<br>
            Last Refreshed: {self.last_refresh_time.strftime('%Y-%m-%d %H:%M:%S UTC')}
        </div>
        """

def create_interface():
    """Assemble and return the Gradio Blocks UI backed by a DatasetViewer."""
    viewer = DatasetViewer()

    with gr.Blocks() as demo:
        gr.Markdown("# PhotoshopRequest Dataset Viewer")

        gr.Markdown("""
            This is a viewer for the PhotoshopRequest dataset. Each sample shows a Photoshop editing request post.
            Click the 'Show New Samples' button to see **5 random samples** from the dataset.
            
            **Layout**: For each sample, you'll see:
            1. The post title and description
            2. The source image (left) and edited result (right)
        """)

        # Five sample slots, each a markdown header followed by a
        # side-by-side image pair. Component order must mirror the flat
        # tuple returned by viewer.get_next_samples().
        output_components = []
        for slot in range(1, 6):
            output_components.append(gr.Markdown())
            with gr.Row():
                left = gr.Image(label=f"Source Image {slot}")
                right = gr.Image(label=f"Edited Image {slot}")
            output_components.extend([left, right])

        refresh_btn = gr.Button("Show New Samples")
        dataset_info = gr.Markdown()

        # Refresh the samples first, then update the dataset info footer.
        click_event = refresh_btn.click(
            viewer.get_next_samples,
            outputs=output_components,
        )
        click_event.then(
            viewer.get_info,
            outputs=[dataset_info],
        )

    return demo

if __name__ == "__main__":
    # Build the Blocks app and start the local server when run as a script.
    create_interface().launch()