Raymond Weitekamp committed on
Commit
e3d7a16
·
1 Parent(s): 7ca2071

private dataset structure

Browse files
Files changed (2) hide show
  1. app.py +22 -19
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  from huggingface_hub import HfApi
6
  from typing import Optional
7
  from PIL import Image # Needed for working with PIL images
 
8
 
9
  # The list of sentences from our previous conversation.
10
  sentences = [
@@ -129,23 +130,16 @@ def create_gradio_interface():
129
  return f"Logged in as: {profile.username}", {"username": profile.username}
130
 
131
  def handle_submit(profile, dataset_choice, image, text):
132
- """
133
- Depending on the dataset toggle selection, this function either:
134
- - For 'Private': strips metadata from the image, checks/creates a private HF dataset,
135
- pushes the image to it, and logs the submission.
136
- - For 'Public': uses the existing `submit_image` logic.
137
- """
138
  if not profile or "username" not in profile:
139
  raise gr.Error("Please log in to use this application")
140
  username = profile["username"]
141
- # Use the username from the OAuth profile directly
142
  repo_id = f"{username}/handwriting-ocr-private"
143
 
144
  if dataset_choice == "Private":
145
  # Remove all metadata for privacy
146
  stripped_image = strip_metadata(image)
147
 
148
- # Check if the dataset exists; if not, create it as private.
149
  try:
150
  collector.hf_api.dataset_info(repo_id)
151
  except Exception as e:
@@ -159,30 +153,39 @@ def create_gradio_interface():
159
  temp_path = os.path.join(temp_dir, filename)
160
  stripped_image.save(temp_path)
161
 
162
- # Upload the file to the Hugging Face dataset repository
163
- collector.hf_api.upload_file(
164
- path_or_fileobj=temp_path,
165
- path_in_repo=filename,
166
- repo_id=repo_id,
167
- repo_type="dataset",
168
- )
 
 
 
 
 
 
 
 
 
169
 
170
  # Remove the temporary file
171
  os.remove(temp_path)
172
 
173
- # Log the submission locally with an indicator for the private dataset.
174
  collector.collected_pairs.append({
175
  "text": text,
176
- "image": image, # The original image is recorded locally.
177
  "timestamp": timestamp,
178
  "username": username,
179
  "dataset": "private"
180
  })
 
181
  new_text = collector.get_random_text_block()
182
- # Return a tuple to clear the image input (set to None) and update the text.
183
  return None, new_text
184
  else:
185
- # Fallback to public submission using the existing logic.
186
  new_text = collector.submit_image(image, text, username)
187
  return None, new_text
188
 
 
5
  from huggingface_hub import HfApi
6
  from typing import Optional
7
  from PIL import Image # Needed for working with PIL images
8
+ import datasets
9
 
10
  # The list of sentences from our previous conversation.
11
  sentences = [
 
130
  return f"Logged in as: {profile.username}", {"username": profile.username}
131
 
132
  def handle_submit(profile, dataset_choice, image, text):
 
 
 
 
 
 
133
  if not profile or "username" not in profile:
134
  raise gr.Error("Please log in to use this application")
135
  username = profile["username"]
 
136
  repo_id = f"{username}/handwriting-ocr-private"
137
 
138
  if dataset_choice == "Private":
139
  # Remove all metadata for privacy
140
  stripped_image = strip_metadata(image)
141
 
142
+ # Check if the dataset exists; if not, create it as private
143
  try:
144
  collector.hf_api.dataset_info(repo_id)
145
  except Exception as e:
 
153
  temp_path = os.path.join(temp_dir, filename)
154
  stripped_image.save(temp_path)
155
 
156
+ # Create a dataset dictionary with the image-text pair
157
+ features = datasets.Features({
158
+ 'text': datasets.Value('string'),
159
+ 'image': datasets.Image(),
160
+ 'timestamp': datasets.Value('string')
161
+ })
162
+
163
+ dataset_dict = {
164
+ 'text': [text],
165
+ 'image': [temp_path],
166
+ 'timestamp': [timestamp]
167
+ }
168
+
169
+ # Create the dataset and push to hub
170
+ dataset = datasets.Dataset.from_dict(dataset_dict, features=features)
171
+ dataset.push_to_hub(repo_id)
172
 
173
  # Remove the temporary file
174
  os.remove(temp_path)
175
 
176
+ # Log the submission locally
177
  collector.collected_pairs.append({
178
  "text": text,
179
+ "image": image,
180
  "timestamp": timestamp,
181
  "username": username,
182
  "dataset": "private"
183
  })
184
+
185
  new_text = collector.get_random_text_block()
 
186
  return None, new_text
187
  else:
188
+ # Fallback to public submission
189
  new_text = collector.submit_image(image, text, username)
190
  return None, new_text
191
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ Pillow>=10.0.0
4
  pytest>=7.0.0
5
  pytest-playwright>=0.4.0
6
  pytest-asyncio>=0.23.0
7
- playwright>=1.40.0
 
 
4
  pytest>=7.0.0
5
  pytest-playwright>=0.4.0
6
  pytest-asyncio>=0.23.0
7
+ playwright>=1.40.0
8
+ datasets>=2.16.0