[email protected] committed on
Commit 323a625
1 Parent(s): 23900aa

dataset integration

Files changed (3)
  1. app.py +250 -124
  2. requirements.txt +6 -0
  3. texts.py +68 -0
app.py CHANGED

@@ -1,22 +1,67 @@
 from __future__ import annotations
 
+import json
 import os
 import random
 import time
 import gradio as gr
+import pandas as pd
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
 from PIL import Image
 from io import BytesIO
 import base64
 
+
+
+from datetime import datetime
+from pathlib import Path
+from uuid import uuid4
+
 import trafilatura
 
+from datasets import load_dataset
+from datasets import Features, Value, Sequence
+
+from huggingface_hub import CommitScheduler
+
 from huggingface_hub import whoami
 
 from languages import ISO_CODE_TO_LANGUAGE_NAME
+from texts import ABOUT_TEXT
+
+DISABLE_FETCH_URL = os.environ.get("DISABLE_FETCH_URL", False)
+
+if DISABLE_FETCH_URL:
+    print("Fetch URL is disabled: Only dummy screenshot and text will be returned.")
+
+DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID", "malteos/seed-crawl-urls")
+
+JSON_DATASET_DIR = Path("jsonl_dataset")
+JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
+
+# Each instance of this space will spawn a unique file for each type of result
+# For the life of that space, it will append to that file pushed to a dataset every so often
+# It also is append_only, so no previous data will be overwritten
+JSON_DATASET_PATH = JSON_DATASET_DIR / f"urls-{uuid4()}.jsonl"
 
-OFFLINE = os.environ.get("OFFLINE", False)
+if os.getenv("HF_TOKEN"):
+    scheduler = CommitScheduler(
+        repo_id=DATASET_REPO_ID,
+        repo_type="dataset",
+        folder_path=JSON_DATASET_DIR,
+        path_in_repo="data",
+    )
+else:
+    scheduler = None
+    print("No HF_TOKEN found, results will not be uploaded to the hub.")
+
+def save_to_jsonl(obj: dict) -> None:
+    if scheduler:
+        with scheduler.lock:
+            with JSON_DATASET_PATH.open("a") as f:
+                json.dump(obj, f)
+                f.write("\n")
 
 def get_candidate_urls():
     return [
@@ -46,7 +91,7 @@ def fetch_screenshot_and_text_from_url(url):
     height = 350
     text = ""
 
-    if OFFLINE:
+    if DISABLE_FETCH_URL:
         screenshot = Image.new('RGB', (350, height))
         text = f"Some dummy text for {url} (offline mode enabled)"
 
@@ -103,143 +148,224 @@ with gr.Blocks(fill_height=True) as demo:
     # Seed Crawl Annotator
     """)
 
-    profile_state = gr.State([])
-    gr.LoginButton()
-
-    with gr.Column(visible=False) as wrapper_col:
-        login_status = gr.Markdown("no")
-
-        def handle_login(profile: gr.OAuthProfile | None) -> dict:
-            if profile:
-                gr.Info(f"Logged in as {profile.username}")
-                return {
-                    profile_state: f"{profile.username}",
-                    wrapper_col: gr.update(visible=True),
-                    login_status: "yes",
-                }
-            else:
-                gr.Warning(f"You need to login to use this app.")
-                return {
-                    profile_state: [],
-                    wrapper_col: gr.update(visible=False),
-                    login_status: "no",
-                }
-
-        demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col, login_status])
-
-        url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
-
-        with gr.Row():
-            set_random_btn = gr.Button("Set Random URL", variant="secondary", interactive=True)
-
-            load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
-
-        with gr.Row():
-            extracted_text = gr.Textbox(
-                label="Extracted text",
-                max_lines=15,
-                lines=15,
-                visible=True,
-                placeholder="Click on `Load URL` to fetch Web page's text content."
-            )
-
-        screenshot_scrollable = gr.HTML("", visible=False)
-
-        with gr.Column(visible=False) as output_col:
-            with gr.Row():
-                language_codes = gr.Dropdown(
-                    [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
-                    label="Language codes",
-                    multiselect=True,
-                    # allow_custom_value=True,
-                )
-            categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
-
-            with gr.Row():
-                do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
-                dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
-                # random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
-
-
-        def set_random_url():
-            candidate_urls = get_candidate_urls()
-            selected_url = random.choice(candidate_urls)
-            return selected_url
-
-        set_random_btn.click(fn=set_random_url, outputs=url_field)
-
-
-        def load_url(url):
-            screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)
-
-            if not screenshot_html_str or not text:
-                gr.Error("Could not fetch data for url")
-            else:
-
-                return {
-                    screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
-                    extracted_text: gr.update(value=text, visible=True),
-                    output_col: gr.update(visible=True),
-                    language_codes: "unknown",  # Reset by set to invalid value # gr.update(None, label=url),
-                    categories: gr.update(value=None),
-                }
-
-        load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
-
-        def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
-
-            if profile_state:
-                # html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
-                gr.Info("✅ Thanks for your feedback")
-            else:
-                gr.Error("❌ Feedback could not be saved")
-                # html_str = f"<b>Feedback could not be saved.</b> You are not authenticated."
-
-            return {
-                url_field: "",
-                output_col: gr.update(visible=False),
-                extracted_text: gr.update(value=None, visible=True),
-                screenshot_scrollable: gr.update(value="", visible=False),
-            }
-
-        # def do_crawl(profile_state, url, language_codes, categories):
-        #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
-
-        # def dont_crawl(profile_state, url, language_codes, categories):
-        #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
-
-
-        do_crawl_btn.click(
-            fn=do_crawl,
-            inputs=[profile_state, url_field, language_codes, categories],
-            outputs=[
-                url_field,
-                output_col,
-                extracted_text,
-                screenshot_scrollable
-            ],
-            api_name="do_crawl",
-        )
-        dont_crawl_btn.click(
-            fn=do_crawl,
-            inputs=[profile_state, url_field, language_codes, categories],
-            outputs=[
-                url_field,
-                output_col,
-                extracted_text,
-                screenshot_scrollable
-            ],
-            api_name="do_crawl",
-        )
-
-        # dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
-
-        # def random_subpage(url):
-        #     new_url = "http://example.com"
-
-        #     return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
-
-        # random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
+    with gr.Tab("Contribute"):
+        gr.Markdown("Welcome! This is a crowd-sourced effort to improve crawling of low-resource languages. Your contributions will be part of a public dataset.")
+        profile_state = gr.State([])
+        gr.LoginButton()
+
+        with gr.Column(visible=False) as wrapper_col:
+            login_status = gr.Markdown("no", visible=False)
+
+            def handle_login(profile: gr.OAuthProfile | None) -> dict:
+                if profile:
+                    gr.Info(f"Logged in as {profile.username}")
+                    return {
+                        profile_state: f"{profile.username}",
+                        wrapper_col: gr.update(visible=True),
+                        login_status: "yes",
+                    }
+                else:
+                    gr.Warning(f"You need to login to use this app.")
+                    return {
+                        profile_state: [],
+                        wrapper_col: gr.update(visible=False),
+                        login_status: "no",
+                    }
+
+            demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col, login_status])
+
+            url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
+
+            with gr.Row():
+                set_random_btn = gr.Button("Pick Random URL", variant="secondary", interactive=True)
+
+                load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
+
+            with gr.Row():
+                extracted_text = gr.Textbox(
+                    label="Extracted text",
+                    max_lines=15,
+                    lines=15,
+                    visible=True,
+                    placeholder="Click on `Load URL` to fetch Web page's text content."
+                )
+
+            screenshot_scrollable = gr.HTML("", visible=False)
+
+            with gr.Column(visible=False) as output_col:
+                with gr.Row():
+                    language_codes = gr.Dropdown(
+                        [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
+                        label="Language codes",
+                        multiselect=True,
+                        # allow_custom_value=True,
+                    )
+                categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
+
+                with gr.Row():
+                    do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
+                    dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
+                    # random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
+
+
+            def set_random_url():
+                candidate_urls = get_candidate_urls()
+                selected_url = random.choice(candidate_urls)
+                return selected_url
+
+            set_random_btn.click(fn=set_random_url, outputs=url_field)
+
+
+            def load_url(url):
+                screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)
+
+                if not screenshot_html_str or not text:
+                    gr.Error("Could not fetch data for url")
+                else:
+
+                    return {
+                        screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
+                        extracted_text: gr.update(value=text, visible=True),
+                        output_col: gr.update(visible=True),
+                        language_codes: "unknown",  # Reset by set to invalid value # gr.update(None, label=url),
+                        categories: gr.update(value=None),
+                    }
+
+            load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
+
+            def do_crawl_error_handler(msg):
+                # error response
+                print("error -> no changes")
+                gr.Warning(f"❌ Error: {msg}")
+
+                return {
+                    url_field: gr.update(),
+                    output_col: gr.update(),
+                    extracted_text: gr.update(),
+                    screenshot_scrollable: gr.update(),
+                }
+
+            def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
+                print(f"{url=}")
+                print(f"{language_codes=}")
+                print(f"{categories=}")
+                print(f"{do_crawl=}")
+
+
+
+                if not profile_state:
+                    return do_crawl_error_handler("You are not authenticated.")
+
+                elif len(url) <= 0:
+                    return do_crawl_error_handler("URL is empty.")
+
+                elif len(categories) <= 0:
+                    return do_crawl_error_handler("You must select at least one category.")
+
+                elif len(language_codes) <= 0:
+                    return do_crawl_error_handler("You must select at least one language.")
+                else:
+                    #
+                    save_to_jsonl({
+                        "url": url,
+                        "language_codes": language_codes,
+                        "categories": categories,
+                        "do_crawl": int(do_crawl),
+                        "username": profile_state,
+                        "submission_datetime": datetime.now().isoformat(),
+                    })
+                    # html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
+                    gr.Info("✅ Thanks for your feedback. Let's continue!")
+
+                    return {
+                        url_field: "",  # TODO fetch new url
+                        output_col: gr.update(visible=False),
+                        extracted_text: gr.update(value=None, visible=True),
+                        screenshot_scrollable: gr.update(value="", visible=False),
+                    }
+
+
+
+            # def do_crawl(profile_state, url, language_codes, categories):
+            #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
+
+            # def dont_crawl(profile_state, url, language_codes, categories):
+            #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
+
+
+            do_crawl_btn.click(
+                fn=do_crawl,
+                inputs=[profile_state, url_field, language_codes, categories],
+                outputs=[
+                    url_field,
+                    output_col,
+                    extracted_text,
+                    screenshot_scrollable
+                ],
+                api_name="do_crawl",
+            )
+            dont_crawl_btn.click(
+                fn=do_crawl,
+                inputs=[profile_state, url_field, language_codes, categories],
+                outputs=[
+                    url_field,
+                    output_col,
+                    extracted_text,
+                    screenshot_scrollable
+                ],
+                api_name="do_crawl",
+            )
+
+            # dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
+
+            # def random_subpage(url):
+            #     new_url = "http://example.com"
+
+            #     return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
+
+            # random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
+
+    with gr.Tab("Browse Contributions"):
+        gr.Markdown("This page lists all the data we have collected so far. Please note that the list might be out-of-sync.")
+
+        """
+        dataset_info:
+        - config_name: base
+          features:
+          - name: url
+            dtype: string
+          - name: language_codes
+            list: string
+          - name: categories
+            list: string
+          - name: do_crawl
+            dtype: int32
+          - name: username
+            dtype: string
+          - name: submission_datetime
+            dtype: string
+        """
+
+        features = Features({
+            "url": Value("string"),
+            "language_codes": Sequence(Value(dtype="string")),
+            "categories": Sequence(Value(dtype="string")),
+            "do_crawl": Value("int32"),
+            "username": Value("string"),
+            "submission_datetime": Value("string"),
+        })
+        try:
+            ds = load_dataset(DATASET_REPO_ID, data_files={"train": "data/*.jsonl"}, features=features)
+            df = ds["train"].to_pandas()
+            gr.Dataframe(df)
+        except ValueError as e:
+            print(e)
+
+            gr.Markdown("> Error: Dataset cannot be loaded.")
+
+
+    with gr.Tab("About"):
+        gr.Markdown(ABOUT_TEXT)
 
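For readers skimming the diff: each ✅/❌ click appends one JSON object to a local `urls-<uuid>.jsonl` file, and the `CommitScheduler` periodically pushes that folder to the `malteos/seed-crawl-urls` dataset repo under `data/`. Below is a minimal sketch (not part of this commit) of how the collected records could be read back downstream; the example record values and the filtering step are illustrative only.

```python
from datasets import Features, Sequence, Value, load_dataset

# Schema used by the Space (mirrors the `features` block in app.py above).
features = Features({
    "url": Value("string"),
    "language_codes": Sequence(Value("string")),
    "categories": Sequence(Value("string")),
    "do_crawl": Value("int32"),
    "username": Value("string"),
    "submission_datetime": Value("string"),
})

# One appended record looks roughly like this (values are made up):
# {"url": "https://example.com/catalan/", "language_codes": ["cat"],
#  "categories": ["News"], "do_crawl": 1, "username": "some-user",
#  "submission_datetime": "2024-01-01T12:00:00"}

# Load everything the CommitScheduler has pushed so far and keep only
# the URLs that annotators marked as worth crawling.
ds = load_dataset("malteos/seed-crawl-urls", data_files={"train": "data/*.jsonl"}, features=features)
seed_urls = ds["train"].filter(lambda row: row["do_crawl"] == 1)["url"]
print(seed_urls[:10])
```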
requirements.txt CHANGED

@@ -11,3 +11,9 @@ trafilatura==2.0.0
 # ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
 lxml_html_clean
 
+datasets
+huggingface-hub>=0.19
+hf-transfer>=0.1.4
+# protobuf<4
+# click<8.1
+# pydantic~=1.0
texts.py ADDED

@@ -0,0 +1,68 @@
+ABOUT_TEXT = """
+
+## Web Languages Project
+
+Welcome! This is a crowd-sourced effort to improve crawling
+of low-resource languages. This dataset is public.
+
+[Common Crawl](https://commoncrawl.github.io/cc-crawl-statistics/plots/languages)
+recognizes a lot of languages, and we can see that we don't have
+enough of languages like Hindi (500 million speakers!), smaller
+country languages like Hungarian, and regional languages like Catalan.
+We are interested in languages from all over the world. If you choose
+to help, you'll be helping create lists of websites related to
+languages that you read or speak.
+
+### How can I contribute?
+
+If you look below you'll see a huge list of living languages. If you
+see one that looks interesting, click on it. You'll see a
+language-specific document, probably mostly blank, that you can fill
+out.
+
+There are 2 ways to add to this document. If you aren't very familiar
+with Github, you can copy the entire document into an email, fill it
+out, and send it to web-languages ZAT commoncrawl ZOT org. We'll do the rest.
+
+If you are familiar with Github, and are logged in, click on the pen
+icon in the upper right corner to start editing the document.
+Github will request that you fork the repo. Do that, edit the
+document, and finally create a pull request.
+
+To see a partially completed example, look at the
+[Welsh](living/welsh.md) entry.
+
+Sometimes asking a Large Language Model can be helpful: "What are some
+top websites written in the Welsh language?"
+
+### What kind of websites are you looking for?
+
+If you look at the template, we have requested urls in a few
+categories: News, Culture/History, Government, Political Parties, and
+Other. Remember that we're looking for websites in this particular
+language. If the language is only a part of the website, and that's
+visible in the URL as https://example.com/catalan/, then that's the
+URL you should add.
+
+For a language like Hindi, with 500 million speakers, there are a lot
+of websites to choose from. Please suggest websites that are important
+and influential, and please think about diversity. Are all geographic
+regions represented?
+
+For a country-wide language like Hungarian, there are still probably a
+wide variety of websites to choose from. If a website is all English,
+however, that's not what we're looking for.
+
+For a regional language like Catalan, things are trickier. Catalan has
+multiple names -- it's called Valencian in some parts of Spain -- and
+use of the Catalan language is a part of a vigorous debate in Spanish
+national and regional politics. You might not be able to find
+Catalan-language content for every political party, and government
+websites might offer Catalan content one day and remove it after
+the next election. In that case, please do the best you can.
+
+If your favorite language has its own Wikipedia -- [check the list here](https://en.wikipedia.org/wiki/List_of_Wikipedias) --
+please include this link under "Other".
+
+
+"""