Ezi Ozoani committed on
Commit
08d7436
·
1 Parent(s): 833f9be

run it again

Browse files
__pycache__/main.cpython-310.pyc ADDED
Binary file (4.76 kB). View file
 
__pycache__/server.cpython-310.pyc ADDED
Binary file (1.2 kB). View file
 
main.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from difflib import SequenceMatcher
4
+ from typing import Any, Dict, Optional, Tuple
5
+
6
+ from fastapi import FastAPI, Request, Response
7
+ from huggingface_hub import (DatasetCard, HfApi, ModelCard, comment_discussion,
8
+ create_discussion, get_discussion_details,
9
+ get_repo_discussions, login)
10
+ from huggingface_hub.utils import EntryNotFoundError
11
+ from tabulate import tabulate
12
+
13
# Webhook shared secret read from the environment (None when the variable is unset).
KEY = os.getenv("WEBHOOK_SECRET")

# NOTE(review): explicit Hub authentication and the FastAPI app object are
# disabled in this module. Presumably the Hub client is expected to pick up
# credentials some other way -- confirm before deleting or re-enabling.
# HF_TOKEN = os.environ.get("HF_ACCESS_TOKEN")
# api = HfApi(token=HF_TOKEN)
# login(HF_TOKEN)
# app = FastAPI()
20
+
21
+
22
+
23
+
24
def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    The value is a float between 0.0 (nothing in common) and 1.0 (identical).
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()
27
+
28
+
29
def create_metadata_key_dict(card_data, repo_type: str):
    """Pick the metadata fields of interest out of a repo card's metadata.

    Args:
        card_data: Mapping-like object exposing ``.get(key)``.
        repo_type: ``"model"`` or ``"dataset"``; selects which extra fields
            are looked up in addition to the shared ``tags`` and ``license``.

    Returns:
        A dict mapping each field of interest to its value in ``card_data``
        (``None`` when the field is absent). For any other ``repo_type`` the
        function returns ``None``.
    """
    if repo_type == "model":
        extra_fields = ("library_name", "datasets", "metrics", "co2", "pipeline_tag")
    elif repo_type == "dataset":
        extra_fields = (
            "pretty_name",
            "size_categories",
            "task_categories",
            "task_ids",
            "source_datasets",
        )
    else:
        return None

    wanted_fields = ("tags", "license", *extra_fields)
    return {field: card_data.get(field) for field in wanted_fields}
47
+
48
+
49
def create_metadata_breakdown_table(desired_metadata_dictionary):
    """Render the metadata fields as a GitHub-flavoured markdown table.

    Each row pairs a field name with its value; falsy values (``None``, empty
    string, empty list, ...) are shown as ``"Field Missing"``.
    """
    rows = [
        (field, value if value else "Field Missing")
        for field, value in desired_metadata_dictionary.items()
    ]
    column_titles = ("Metadata Field", "Provided Value")
    return tabulate(rows, headers=column_titles, tablefmt="github")
57
+
58
+
59
def calculate_grade(desired_metadata_dictionary):
    """Return the fraction of metadata fields that have a truthy value.

    Args:
        desired_metadata_dictionary: Mapping of field name to provided value.

    Returns:
        A float between 0.0 and 1.0, rounded to two decimal places. An empty
        mapping scores 0.0 rather than raising ``ZeroDivisionError``.
    """
    total_fields = len(desired_metadata_dictionary)
    if not total_fields:
        # Nothing to grade: treat as zero coverage instead of dividing by zero.
        return 0.0
    filled_fields = sum(1 for value in desired_metadata_dictionary.values() if value)
    return round(filled_fields / total_fields, 2)
63
+
64
+
65
def create_markdown_report(
    desired_metadata_dictionary, repo_name, repo_type, score, update: bool = False
):
    """Build the markdown body of the metadata report card.

    Args:
        desired_metadata_dictionary: Mapping of metadata field to value; it is
            rendered via ``create_metadata_breakdown_table``.
        repo_name: Repository name interpolated into the report text.
        repo_type: Repository type string (e.g. ``"model"``), used in the
            heading and the closing remark.
        score: Coverage grade as a fraction (0.0-1.0); rendered as a
            percentage.
        update: When true, ``(updated)`` is appended to the heading.

    Returns:
        The report as a markdown string.
    """
    heading_suffix = "(updated)" if update else ""
    breakdown_table = create_metadata_breakdown_table(desired_metadata_dictionary)
    if score <= 0.5:
        verdict = (
            f"We're not angry we're just disappointed! {repo_type.title()} "
            "metadata is super important. Please try harder..."
        )
    else:
        verdict = f"Not too shabby! Make sure you also fill in a {repo_type} card too!"

    # The score is a 0-1 fraction, so it is formatted with the ``%`` presentation
    # type (0.5 -> "50%"); printing the raw value followed by "%" read as "0.5%".
    report = f"""# {repo_type.title()} metadata report card {heading_suffix}
\n
This is an automatically produced metadata quality report card for {repo_name}. This report is meant as a POC!
\n
## Breakdown of metadata fields for your {repo_type}
\n
{breakdown_table}
\n
You scored a metadata coverage grade of: **{score:.0%}** \n {verdict}
"""
    return report
80
+
81
+
82
def parse_webhook_post(data: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Extract ``(repo_type, repo_name)`` from a webhook payload.

    Args:
        data: Decoded JSON body of the webhook request.

    Returns:
        ``(repo_type, repo_name)`` when the event scope is exactly ``"repo"``
        and the payload carries a repo name and type. ``None`` when the scope
        is anything else or the payload is missing the expected structure.

    Raises:
        ValueError: If the repo type is present but is neither ``"model"``
            nor ``"dataset"``.
    """
    # The payload comes from an HTTP request, so its shape is not trusted:
    # anything structurally unexpected is reported as "unparseable" (None)
    # rather than surfacing as a KeyError/AttributeError.
    if not isinstance(data, dict):
        return None
    event = data.get("event")
    if not isinstance(event, dict) or event.get("scope") != "repo":
        return None
    repo = data.get("repo")
    if not isinstance(repo, dict):
        return None
    repo_name = repo.get("name")
    repo_type = repo.get("type")
    if repo_name is None or repo_type is None:
        return None
    # Tuple membership (not a set) so an unhashable value cannot raise TypeError.
    if repo_type not in ("model", "dataset"):
        raise ValueError("Unknown hub type")
    return repo_type, repo_name
92
+
93
+
94
def load_repo_card_metadata(repo_type, repo_name):
    """Load a repo card's metadata as a plain dict.

    Uses ``DatasetCard`` for ``"dataset"`` repos and ``ModelCard`` for
    ``"model"`` repos. Returns an empty dict when loading raises
    ``EntryNotFoundError``, and ``None`` for any other ``repo_type``.
    """
    if repo_type == "dataset":
        card_class = DatasetCard
    elif repo_type == "model":
        card_class = ModelCard
    else:
        return None

    try:
        return card_class.load(repo_name).data.to_dict()
    except EntryNotFoundError:
        return {}
105
+
106
+
107
def create_or_update_report(data):
    """Create, or refresh, the "Metadata Report Card" discussion for a repo.

    The webhook payload is parsed, the repo card metadata is loaded and
    graded, and a markdown report is produced. If the repo already has an
    open discussion titled "Metadata Report Card", the report is posted there
    as a comment unless the most recent comment already matches it; otherwise
    a new discussion is opened with the report as its description.

    Args:
        data: Decoded JSON body of the webhook request.

    Returns:
        A 400 ``Response`` when the payload cannot be parsed, otherwise
        ``True``.
    """
    parsed_post = parse_webhook_post(data)
    if not parsed_post:
        return Response("Unable to parse webhook data", status_code=400)
    repo_type, repo_name = parsed_post

    card_data = load_repo_card_metadata(repo_type, repo_name)
    desired_metadata_dictionary = create_metadata_key_dict(card_data, repo_type)
    score = calculate_grade(desired_metadata_dictionary)
    report = create_markdown_report(
        desired_metadata_dictionary, repo_name, repo_type, score, update=False
    )
    updated_report = create_markdown_report(
        desired_metadata_dictionary, repo_name, repo_type, score, update=True
    )

    for discussion in get_repo_discussions(repo_name, repo_type=repo_type):
        if discussion.title != "Metadata Report Card" or discussion.status != "open":
            continue
        # An existing open report card thread.
        discussion_details = get_discussion_details(
            repo_name, discussion.num, repo_type=repo_type
        )
        # Not every discussion event is a comment (e.g. status or title
        # changes), so look for the most recent event that carries content
        # instead of assuming the last event has a ``content`` attribute.
        last_comment = next(
            (
                event.content
                for event in reversed(discussion_details.events)
                if getattr(event, "content", None)
            ),
            "",
        )
        # The last comment may be either the initial report or a previous
        # "(updated)" one; comparing against only the initial form would
        # re-post an identical update on every webhook.
        already_posted = any(
            similar(candidate, last_comment) > 0.999
            for candidate in (report, updated_report)
        )
        if not already_posted:
            comment_discussion(
                repo_name,
                discussion.num,
                comment=updated_report,
                repo_type=repo_type,
            )
        return True

    create_discussion(
        repo_name,
        "Metadata Report Card",
        description=report,
        repo_type=repo_type,
    )
    return True
152
+
server.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from fastapi import FastAPI, Request, Response
3
+ from main import create_or_update_report
4
+ from tabulate import tabulate
5
+
6
# Shared secret compared against the X-Webhook-Secret request header
# (None when the environment variable is unset).
KEY = os.getenv("WEBHOOK_SECRET")

# Application object the route decorators in this module register against.
app = FastAPI()
9
+
10
@app.get("/")
def read_root():
    """Serve a small static HTML page describing the bot."""
    landing_page = """
<h2 style="text-align:center">Metadata Review Bot</h2>
<p style="text-align:center">This is a demo app showing how to use webhooks to automate metadata review for models and datasets shared on the Hugging Face Hub.</p>
"""
    return Response(landing_page, media_type="text/html")
17
+
18
@app.post("/webhook")
async def webhook(request: Request):
    """Authenticate a webhook call and hand its payload to the report builder.

    Args:
        request: Incoming request; must carry the shared secret in the
            ``X-Webhook-Secret`` header and a JSON body.

    Returns:
        A 401 ``Response`` when the secret is missing or wrong (or no secret
        is configured), a 400 ``Response`` when the body is not valid JSON,
        the ``Response`` produced by ``create_or_update_report`` when it
        reports a failure, and otherwise the string ``"Webhook received!"``.
    """
    import hmac  # function-scope import keeps this block self-contained

    provided_secret = request.headers.get("X-Webhook-Secret")
    # Fail closed: with no configured secret, a request without the header
    # would otherwise compare None to None and be accepted.
    if (
        not KEY
        or provided_secret is None
        or not hmac.compare_digest(
            provided_secret.encode("utf-8"), KEY.encode("utf-8")
        )
    ):
        return Response("Invalid secret", status_code=401)

    try:
        data = await request.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return Response("Invalid JSON payload", status_code=400)

    result = create_or_update_report(data)
    # An error Response object is truthy, so it must be returned explicitly;
    # otherwise it would be replaced by the success message below.
    if isinstance(result, Response):
        return result
    return "Webhook received!" if result else result
26
+
27
+