Raymond Weitekamp committed on
Commit
47b06ca
·
1 Parent(s): 491aa62

refactor: improve leaderboard layout and heading text

Browse files
Files changed (2) hide show
  1. app.py +77 -25
  2. requirements.txt +2 -1
app.py CHANGED
@@ -11,6 +11,7 @@ if gr.NO_RELOAD:
11
  from PIL import Image # Needed for working with PIL images
12
  import datasets
13
  import numpy as np # Added to help handle numpy array images
 
14
 
15
  # Load environment variables from .env if available.
16
  from dotenv import load_dotenv
@@ -84,19 +85,43 @@ class SubmissionData(BaseModel):
84
  class OCRDataCollector:
85
  def __init__(self):
86
  self.collected_pairs = []
 
87
  self.current_text_block = self.get_random_text_block(201) # Default max words
88
  self.hf_api = HfApi()
89
 
90
  def get_random_text_block(self, max_words: int):
91
- block_length = random.randint(1, 5)
92
- start_index = random.randint(0, len(sentences) - block_length)
93
- block = " ".join(sentences[start_index:start_index + block_length])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # Truncate to max_words if necessary
96
  words = block.split()
97
  if len(words) > max_words:
98
  block = " ".join(words[:max_words])
99
-
 
100
  return block
101
 
102
  def submit_image(self, image, text_block, username: Optional[str] = None):
@@ -122,12 +147,43 @@ class OCRDataCollector:
122
  if item['user'] != 'anonymous':
123
  user_counts[item['user']] = user_counts.get(item['user'], 0) + 1
124
 
125
- # Sort by count (descending) and format for display
126
- leaderboard = sorted(user_counts.items(), key=lambda x: x[1], reverse=True)
127
- return [(f"🏆 {i+1}. {user}", count) for i, (user, count) in enumerate(leaderboard)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  except Exception as e:
129
  print(f"Error fetching leaderboard: {e}")
130
- return []
131
 
132
  def strip_metadata(image: Image.Image) -> Image.Image:
133
  """
@@ -158,21 +214,24 @@ def create_gradio_interface():
158
 
159
  with gr.Blocks() as demo:
160
  gr.Markdown("# Handwriting OCR Dataset Creator")
161
- gr.Markdown("## After almost 100 years, handwriting recognition still sucks. Together, we can change that.")
162
 
163
  # Add leaderboard section at the top
 
164
  with gr.Row():
165
- with gr.Column():
166
- gr.Markdown("### 🏆 Top Contributors")
 
167
  leaderboard = gr.Dataframe(
168
- headers=["User", "Contributions"],
169
  value=collector.get_leaderboard(),
170
  elem_id="leaderboard",
171
  visible=True,
172
- interactive=False
 
173
  )
174
- refresh_btn = gr.Button("🔄 Refresh Leaderboard", elem_id="refresh_btn")
175
-
 
176
  gr.Markdown("### Step 1: Log in with your Hugging Face account to use this app.")
177
  # Login section - centered
178
  with gr.Row():
@@ -303,6 +362,9 @@ def create_gradio_interface():
303
 
304
  # Load initial state and update UI visibility
305
  demo.load(update_user_state, inputs=profile_state, outputs=[upload_info, image_input, dataset_options, button_row])
 
 
 
306
 
307
  def handle_submit(
308
  text: str,
@@ -451,16 +513,6 @@ def create_gradio_interface():
451
  outputs=text_box
452
  )
453
 
454
- # Add leaderboard refresh handler
455
- def refresh_leaderboard():
456
- return collector.get_leaderboard()
457
-
458
- refresh_btn.click(
459
- fn=refresh_leaderboard,
460
- inputs=[],
461
- outputs=[leaderboard]
462
- )
463
-
464
  return demo
465
 
466
  if __name__ == "__main__":
 
11
  from PIL import Image # Needed for working with PIL images
12
  import datasets
13
  import numpy as np # Added to help handle numpy array images
14
+ import pandas as pd # Added for pandas DataFrame
15
 
16
  # Load environment variables from .env if available.
17
  from dotenv import load_dotenv
 
85
  class OCRDataCollector:
86
  def __init__(self):
87
  self.collected_pairs = []
88
+ self.last_text_block = None
89
  self.current_text_block = self.get_random_text_block(201) # Default max words
90
  self.hf_api = HfApi()
91
 
92
  def get_random_text_block(self, max_words: int):
93
+ attempts = 0
94
+ max_attempts = 10 # Prevent infinite loop in case of very small sentence list
95
+
96
+ while attempts < max_attempts:
97
+ block_length = random.randint(1, 5)
98
+ start_index = random.randint(0, len(sentences) - block_length)
99
+ block = " ".join(sentences[start_index:start_index + block_length])
100
+
101
+ # Truncate to max_words if necessary
102
+ words = block.split()
103
+ if len(words) > max_words:
104
+ block = " ".join(words[:max_words])
105
+
106
+ # If this block is different from the last one, use it
107
+ if block != self.last_text_block:
108
+ self.last_text_block = block
109
+ return block
110
+
111
+ attempts += 1
112
+
113
+ # If we couldn't find a different block after max attempts,
114
+ # force a different block by using the next available sentences
115
+ current_start = sentences.index(self.last_text_block.split('.')[0] + '.') if self.last_text_block else 0
116
+ next_start = (current_start + 1) % len(sentences)
117
+ block = sentences[next_start]
118
 
119
  # Truncate to max_words if necessary
120
  words = block.split()
121
  if len(words) > max_words:
122
  block = " ".join(words[:max_words])
123
+
124
+ self.last_text_block = block
125
  return block
126
 
127
  def submit_image(self, image, text_block, username: Optional[str] = None):
 
147
  if item['user'] != 'anonymous':
148
  user_counts[item['user']] = user_counts.get(item['user'], 0) + 1
149
 
150
+ # Create a pandas DataFrame for better styling
151
+ df = pd.DataFrame(user_counts.items(), columns=['Username', 'Contributions'])
152
+ df['Rank'] = range(1, len(df) + 1)
153
+ df['Medal'] = df['Rank'].apply(lambda x: "🏆" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else "👏")
154
+
155
+ # Reorder columns
156
+ df = df[['Rank', 'Medal', 'Username', 'Contributions']]
157
+
158
+ # Style the DataFrame
159
+ styled_df = df.style\
160
+ .set_properties(**{
161
+ 'text-align': 'center',
162
+ 'font-size': '16px',
163
+ 'padding': '10px',
164
+ 'border': '1px solid #ddd'
165
+ })\
166
+ .set_table_styles([
167
+ {'selector': 'th', 'props': [
168
+ ('background-color', '#f4f4f4'),
169
+ ('color', '#333'),
170
+ ('font-weight', 'bold'),
171
+ ('text-align', 'center'),
172
+ ('padding', '12px'),
173
+ ('border', '1px solid #ddd')
174
+ ]},
175
+ {'selector': 'tr:nth-of-type(odd)', 'props': [
176
+ ('background-color', '#f9f9f9')
177
+ ]},
178
+ {'selector': 'tr:hover', 'props': [
179
+ ('background-color', '#f5f5f5')
180
+ ]}
181
+ ])
182
+
183
+ return styled_df
184
  except Exception as e:
185
  print(f"Error fetching leaderboard: {e}")
186
+ return pd.DataFrame(columns=['Rank', 'Medal', 'Username', 'Contributions'])
187
 
188
  def strip_metadata(image: Image.Image) -> Image.Image:
189
  """
 
214
 
215
  with gr.Blocks() as demo:
216
  gr.Markdown("# Handwriting OCR Dataset Creator")
217
+ gr.Markdown("## After almost 100 years of research, handwriting recognition still sucks. Together, we can change that.")
218
 
219
  # Add leaderboard section at the top
220
+ gr.Markdown("### 🏆 Top Contributors", show_label=False)
221
  with gr.Row():
222
+ with gr.Column(scale=1):
223
+ pass
224
+ with gr.Column(scale=2, min_width=400):
225
  leaderboard = gr.Dataframe(
 
226
  value=collector.get_leaderboard(),
227
  elem_id="leaderboard",
228
  visible=True,
229
+ interactive=False,
230
+ show_label=False
231
  )
232
+ with gr.Column(scale=1):
233
+ pass
234
+
235
  gr.Markdown("### Step 1: Log in with your Hugging Face account to use this app.")
236
  # Login section - centered
237
  with gr.Row():
 
362
 
363
  # Load initial state and update UI visibility
364
  demo.load(update_user_state, inputs=profile_state, outputs=[upload_info, image_input, dataset_options, button_row])
365
+
366
+ # Also load leaderboard on page load
367
+ demo.load(fn=lambda: collector.get_leaderboard(), outputs=leaderboard)
368
 
369
  def handle_submit(
370
  text: str,
 
513
  outputs=text_box
514
  )
515
 
 
 
 
 
 
 
 
 
 
 
516
  return demo
517
 
518
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -7,4 +7,5 @@ pytest-asyncio>=0.23.0
7
  playwright>=1.40.0
8
  datasets>=2.16.0
9
  pydantic>=2.6.1
10
- python-dotenv>=1.0.0
 
 
7
  playwright>=1.40.0
8
  datasets>=2.16.0
9
  pydantic>=2.6.1
10
+ python-dotenv>=1.0.0
11
+ pandas>=2.0.0