Raymond Weitekamp committed on
Commit
7ca2071
·
1 Parent(s): 0a75f20

private works

Browse files
Files changed (1) hide show
  1. app.py +94 -16
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  from datetime import datetime
5
  from huggingface_hub import HfApi
6
  from typing import Optional
 
7
 
8
  # The list of sentences from our previous conversation.
9
  sentences = [
@@ -85,6 +86,18 @@ class OCRDataCollector:
85
  def skip_text(self, text_block, username: Optional[str] = None):
86
  return self.get_random_text_block()
87
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def create_gradio_interface():
89
  collector = OCRDataCollector()
90
 
@@ -92,44 +105,109 @@ def create_gradio_interface():
92
  gr.Markdown("## Crowdsourcing Handwriting OCR Dataset")
93
  gr.LoginButton()
94
  user_info = gr.Markdown()
 
 
95
 
96
- gr.Markdown("You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. If you wish to skip the current text, click 'Skip'.")
 
 
 
97
 
98
  text_box = gr.Textbox(value=collector.current_text_block, label="Text to Handwrite", interactive=False, visible=False)
99
  image_input = gr.Image(type="pil", label="Upload Handwritten Image", sources=["upload"], visible=False)
 
 
100
 
101
  with gr.Row(visible=False) as button_row:
102
  submit_btn = gr.Button("Submit")
103
  skip_btn = gr.Button("Skip")
104
-
105
- def update_user_info(profile: gr.OAuthProfile | None) -> str:
106
- if profile is None:
107
- return "Please log in with your Hugging Face account to contribute to the dataset."
108
- return f"Logged in as: {profile.name}"
109
-
110
- def handle_submit(profile: gr.OAuthProfile | None) -> str:
111
  if profile is None:
 
 
 
 
 
 
 
 
 
 
 
 
112
  raise gr.Error("Please log in to use this application")
113
- return collector.submit_image(image_input.value, text_box.value, profile.name)
 
 
114
 
115
- def handle_skip(profile: gr.OAuthProfile | None) -> str:
116
- if profile is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  raise gr.Error("Please log in to use this application")
118
- return collector.skip_text(text_box.value, profile.name)
119
 
120
  def update_visibility(profile: gr.OAuthProfile | None):
121
  is_visible = profile is not None
 
122
  return [
 
123
  gr.update(visible=is_visible),
124
  gr.update(visible=is_visible),
125
  gr.update(visible=is_visible)
126
  ]
127
 
128
- demo.load(update_user_info, inputs=None, outputs=user_info)
129
- demo.load(update_visibility, inputs=None, outputs=[text_box, image_input, button_row])
 
130
 
131
- submit_btn.click(handle_submit, inputs=None, outputs=[image_input, text_box])
132
- skip_btn.click(handle_skip, inputs=None, outputs=text_box)
 
133
 
134
  return demo
135
 
 
4
  from datetime import datetime
5
  from huggingface_hub import HfApi
6
  from typing import Optional
7
+ from PIL import Image # Needed for working with PIL images
8
 
9
  # The list of sentences from our previous conversation.
10
  sentences = [
 
86
  def skip_text(self, text_block, username: Optional[str] = None):
87
  return self.get_random_text_block()
88
 
89
+
90
def strip_metadata(image: Image.Image) -> Image.Image:
    """
    Return a copy of ``image`` that carries only pixel data.

    The copy is rebuilt from the raw pixel bytes, so nothing stored on the
    source image object (its ``info`` dictionary, EXIF block, text chunks,
    ...) is carried over to the result.

    Args:
        image: The PIL image to sanitise. It is not modified.

    Returns:
        A new PIL image with the same mode, size and pixel values.
    """
    # Rebuilding from the raw byte buffer avoids materialising one Python
    # object per pixel, which is what list(image.getdata()) would do.
    stripped_image = Image.frombytes(image.mode, image.size, image.tobytes())

    # Palette-based images keep their colours in a separate table; without
    # copying it the pixel indices would be rendered with a default palette.
    palette = image.getpalette()
    if palette is not None:
        stripped_image.putpalette(palette)

    return stripped_image
99
+
100
+
101
  def create_gradio_interface():
102
  collector = OCRDataCollector()
103
 
 
105
  gr.Markdown("## Crowdsourcing Handwriting OCR Dataset")
106
  gr.LoginButton()
107
  user_info = gr.Markdown()
108
+ # Hidden state to hold the user's OAuth profile as JSON.
109
+ profile_state = gr.JSON(visible=False)
110
 
111
+ gr.Markdown(
112
+ "You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. "
113
+ "If you wish to skip the current text, click 'Skip'."
114
+ )
115
 
116
  text_box = gr.Textbox(value=collector.current_text_block, label="Text to Handwrite", interactive=False, visible=False)
117
  image_input = gr.Image(type="pil", label="Upload Handwritten Image", sources=["upload"], visible=False)
118
+ # Toggle (using a radio button) for dataset choice.
119
+ dataset_radio = gr.Radio(choices=["Private", "Public"], label="Select Dataset", value="Private", visible=False)
120
 
121
  with gr.Row(visible=False) as button_row:
122
  submit_btn = gr.Button("Submit")
123
  skip_btn = gr.Button("Skip")
124
+
125
def update_user_info(profile: gr.OAuthProfile | None):
    """
    Build the login banner and the hidden profile payload.

    Returns a ``(message, state)`` pair: the message is shown in the
    ``user_info`` Markdown and the state dict is stored in ``profile_state``.
    The state is empty when nobody is logged in, otherwise it holds the
    account's username under the ``"username"`` key.
    """
    if profile is not None:
        # The username comes straight from the OAuth profile.
        username = profile.username
        return f"Logged in as: {username}", {"username": username}

    return "Please log in with your Hugging Face account to contribute to the dataset.", {}
130
+
131
def handle_submit(profile, dataset_choice, image, text):
    """
    Store one handwriting sample and return the next text to write.

    Depending on the dataset toggle selection:
      - 'Private': strips metadata from the image, makes sure the user's
        private HF dataset exists, uploads the image to it, and logs the
        submission in ``collector.collected_pairs``.
      - anything else ('Public'): delegates to ``collector.submit_image``.

    Args:
        profile: Dict held in the hidden profile state; must contain a
            ``"username"`` key for a logged-in user.
        dataset_choice: Value of the dataset radio ("Private" or "Public").
        image: The uploaded PIL image, or ``None`` if nothing was uploaded.
        text: The text block the user was asked to handwrite.

    Returns:
        ``(None, new_text)`` - ``None`` clears the image input and
        ``new_text`` replaces the displayed text block.

    Raises:
        gr.Error: If the user is not logged in, or if a private submission
            is attempted without an image.
    """
    import io  # Local import: only the private path needs an in-memory buffer.

    if not profile or "username" not in profile:
        raise gr.Error("Please log in to use this application")
    username = profile["username"]

    if dataset_choice != "Private":
        # Fallback to public submission using the existing logic; validation
        # of the image is left to submit_image on this path.
        new_text = collector.submit_image(image, text, username)
        return None, new_text

    # Without this guard a click on Submit with no upload would crash inside
    # strip_metadata with an AttributeError instead of a readable message.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    # Use the username from the OAuth profile directly.
    repo_id = f"{username}/handwriting-ocr-private"

    # Remove all metadata for privacy.
    stripped_image = strip_metadata(image)

    # Create the private dataset if it is missing. exist_ok=True makes this a
    # no-op for an existing repo, and real failures (network, permissions)
    # surface instead of being mistaken for "repo not found".
    # NOTE(review): this runs with collector.hf_api's credentials - confirm
    # they are allowed to create repos under the user's namespace.
    collector.hf_api.create_repo(
        repo_id,
        repo_type="dataset",
        private=True,
        exist_ok=True,
    )

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{timestamp}.png"

    # Encode in memory rather than through a shared temp directory: nothing
    # is left on disk if the upload fails, and concurrent submissions cannot
    # overwrite each other's temporary file.
    buffer = io.BytesIO()
    stripped_image.save(buffer, format="PNG")

    # Upload the encoded image to the Hugging Face dataset repository.
    collector.hf_api.upload_file(
        path_or_fileobj=buffer.getvalue(),
        path_in_repo=filename,
        repo_id=repo_id,
        repo_type="dataset",
    )

    # Log the submission locally with an indicator for the private dataset.
    collector.collected_pairs.append({
        "text": text,
        "image": image,  # The original image is recorded locally.
        "timestamp": timestamp,
        "username": username,
        "dataset": "private",
    })

    # Return a tuple to clear the image input (set to None) and update the text.
    return None, collector.get_random_text_block()
188
+
189
def handle_skip(profile, text):
    """
    Skip the current text block on behalf of the logged-in user.

    ``profile`` is the dict from the hidden profile state. A ``gr.Error`` is
    raised when it is empty or lacks a ``"username"`` entry; otherwise the
    result of ``collector.skip_text`` is returned for the text box.
    """
    logged_in = bool(profile) and "username" in profile
    if not logged_in:
        raise gr.Error("Please log in to use this application")

    username = profile["username"]
    return collector.skip_text(text, username)
193
 
194
  def update_visibility(profile: gr.OAuthProfile | None):
195
  is_visible = profile is not None
196
+ # Update the visibility of text_box, image_input, button_row, and dataset_radio.
197
  return [
198
+ gr.update(visible=is_visible),
199
  gr.update(visible=is_visible),
200
  gr.update(visible=is_visible),
201
  gr.update(visible=is_visible)
202
  ]
203
 
204
+ # On load, update both the display message and the hidden profile state.
205
+ demo.load(update_user_info, inputs=None, outputs=[user_info, profile_state])
206
+ demo.load(update_visibility, inputs=None, outputs=[text_box, image_input, button_row, dataset_radio])
207
 
208
+ # Bind the submit and skip actions with updated inputs.
209
+ submit_btn.click(handle_submit, inputs=[profile_state, dataset_radio, image_input, text_box], outputs=[image_input, text_box])
210
+ skip_btn.click(handle_skip, inputs=[profile_state, text_box], outputs=text_box)
211
 
212
  return demo
213