nielsr HF staff committed on
Commit
3170ddb
1 Parent(s): 4dd059d

Update requirements

Browse files
Files changed (3) hide show
  1. app.py +37 -15
  2. load_dataframe.py +43 -0
  3. requirements.txt +5 -5
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import matplotlib.pyplot as plt
6
 
7
  # from load_dataframe import get_data
 
8
 
9
 
10
  def aggregated_data(df, aggregation_level="week"):
@@ -22,6 +23,13 @@ def aggregated_data(df, aggregation_level="week"):
22
  # Calculate the percentage of papers with artifacts
23
  percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
24
 
 
 
 
 
 
 
 
25
  # Create the plot
26
  plt.figure(figsize=(12, 6))
27
  plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with at least 1 Artifact')
@@ -40,38 +48,49 @@ def aggregated_data(df, aggregation_level="week"):
40
 
41
 
42
  def display_data(df):
43
- num_artifacts = df[(df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)].shape[0]
 
 
 
 
 
 
44
 
45
  st.markdown(f"""
46
- ## Number of papers: {df.shape[0]}
47
- #### Number of papers with a Github link: {df['github'].notnull().sum()}
48
- #### Number of papers with at least one HF artifact: {num_artifacts}
 
 
49
  """)
50
 
51
  st.write("Papers with at least one artifact")
52
- df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
53
- st.dataframe(df[df['has_artifact']],
54
  hide_index=True,
55
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
56
  column_config={"github": st.column_config.LinkColumn(),
57
- "paper_page": st.column_config.LinkColumn()},
58
- width=2000)
 
 
59
 
60
  st.write("Papers without artifacts")
61
- st.dataframe(df[~df['has_artifact']],
62
  hide_index=True,
63
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
64
  column_config={"github": st.column_config.LinkColumn(),
65
  "paper_page": st.column_config.LinkColumn()},
66
- width=2000)
 
67
 
68
  st.write("Papers with a HF mention in README but no artifacts")
69
- st.dataframe(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
70
  hide_index=True,
71
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
72
  column_config={"github": st.column_config.LinkColumn(),
73
  "paper_page": st.column_config.LinkColumn()},
74
- width=2000)
 
75
 
76
 
77
  def main():
@@ -90,6 +109,9 @@ def main():
90
  df.index = pd.to_datetime(df.index)
91
  df = df.sort_index()
92
 
 
 
 
93
  if selection == "Daily/weekly/monthly data":
94
  # Button to select day, month or week
95
  # Add streamlit selectbox.
 
5
  import matplotlib.pyplot as plt
6
 
7
  # from load_dataframe import get_data
8
+ from urllib.parse import quote
9
 
10
 
11
  def aggregated_data(df, aggregation_level="week"):
 
23
  # Calculate the percentage of papers with artifacts
24
  percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
25
 
26
+ # Calculate the growth rate
27
+ growth_rate = percentage_papers_with_artifacts.pct_change() * 100
28
+
29
+ # Display the latest growth rate as a big number
30
+ latest_growth_rate = growth_rate.iloc[-1] if not growth_rate.empty else 0
31
+ st.metric(label=f"{aggregation_level.capitalize()}ly Growth Rate", value=f"{latest_growth_rate:.2f}%")
32
+
33
  # Create the plot
34
  plt.figure(figsize=(12, 6))
35
  plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with at least 1 Artifact')
 
48
 
49
 
50
  def display_data(df):
51
+ df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
52
+ num_artifacts = df['has_artifact'].sum()
53
+ percentage_of_at_least_one_artifact = num_artifacts / df.shape[0] if df.shape[0] > 0 else 0
54
+ percentage_of_at_least_one_artifact = round(percentage_of_at_least_one_artifact * 100, 2)
55
+
56
+ # add reached out column
57
+ df['reached_out'] = [False for _ in range(df.shape[0])]
58
 
59
  st.markdown(f"""
60
+ ## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact
61
+
62
+ * Number of papers: {df.shape[0]}
63
+ * Number of papers with a Github link: {df['github'].notnull().sum()}
64
+ * Number of papers with at least one HF artifact: {num_artifacts}
65
  """)
66
 
67
  st.write("Papers with at least one artifact")
68
+ st.data_editor(df[df['has_artifact']],
 
69
  hide_index=True,
70
+ column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
71
  column_config={"github": st.column_config.LinkColumn(),
72
+ "paper_page": st.column_config.LinkColumn(),
73
+ "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)')},
74
+ width=2000,
75
+ key="papers_with_artifacts")
76
 
77
  st.write("Papers without artifacts")
78
+ st.data_editor(df[~df['has_artifact']],
79
  hide_index=True,
80
+ column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
81
  column_config={"github": st.column_config.LinkColumn(),
82
  "paper_page": st.column_config.LinkColumn()},
83
+ width=2000,
84
+ key="papers_without_artifacts")
85
 
86
  st.write("Papers with a HF mention in README but no artifacts")
87
+ st.data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
88
  hide_index=True,
89
+ column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
90
  column_config={"github": st.column_config.LinkColumn(),
91
  "paper_page": st.column_config.LinkColumn()},
92
+ width=2000,
93
+ key="papers_with_hf_mention_no_artifacts")
94
 
95
 
96
  def main():
 
109
  df.index = pd.to_datetime(df.index)
110
  df = df.sort_index()
111
 
112
+ # hack: include title in URL column
113
+ df['updated_url'] = df.apply(lambda row: f'{row["paper_page"]}/title/{quote(row["title"])}', axis=1)
114
+
115
  if selection == "Daily/weekly/monthly data":
116
  # Button to select day, month or week
117
  # Add streamlit selectbox.
load_dataframe.py CHANGED
@@ -21,6 +21,10 @@ class PaperInfo:
21
 
22
 
23
  def get_df() -> pd.DataFrame:
 
 
 
 
24
  df = pd.merge(
25
  left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
26
  right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
@@ -112,6 +116,42 @@ def add_hf_assets(batch):
112
  return batch
113
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  @st.cache_data
116
  def get_data() -> pd.DataFrame:
117
  """
@@ -132,6 +172,9 @@ def get_data() -> pd.DataFrame:
132
  # step 3. enrich using Hugging Face API
133
  dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
134
 
 
 
 
135
  # return as Pandas dataframe
136
  dataframe = dataset.to_pandas()
137
 
 
21
 
22
 
23
  def get_df() -> pd.DataFrame:
24
+ """
25
+ Load the initial dataset as a Pandas dataframe.
26
+ """
27
+
28
  df = pd.merge(
29
  left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
30
  right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
 
116
  return batch
117
 
118
 
119
+ def check_hf_mention(batch):
120
+ """
121
+ Check if a paper mentions Hugging Face in the README.
122
+ """
123
+
124
+ hf_mentions = []
125
+ for github_url in batch["github"]:
126
+ hf_mention = 0
127
+ if github_url != "":
128
+ # get README text using Github API
129
+ owner = github_url.split("/")[-2]
130
+ repo = github_url.split("/")[-1]
131
+ branch = "main"
132
+ url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
133
+ response = requests.get(url)
134
+
135
+ if response.status_code != 200:
136
+ # try master branch as second attempt
137
+ branch = "master"
138
+ url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
139
+ response = requests.get(url)
140
+
141
+ if response.status_code == 200:
142
+ # get text
143
+ text = response.text
144
+ if "huggingface" in text.lower() or "hugging face" in text.lower():
145
+ hf_mention = 1
146
+
147
+ hf_mentions.append(hf_mention)
148
+
149
+ # overwrite the Github links
150
+ batch["hf_mention"] = hf_mentions
151
+
152
+ return batch
153
+
154
+
155
  @st.cache_data
156
  def get_data() -> pd.DataFrame:
157
  """
 
172
  # step 3. enrich using Hugging Face API
173
  dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
174
 
175
+ # step 4. check if Hugging Face is mentioned in the README
176
+ dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())
177
+
178
  # return as Pandas dataframe
179
  dataframe = dataset.to_pandas()
180
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- streamlit
2
- plotly
3
- tqdm
4
- datasets
5
- paperswithcode
 
1
+ streamlit==1.36.0
2
+ matplotlib==3.7.0
3
+ tqdm==4.66.4
4
+ datasets==2.20.0
5
+ paperswithcode-client==0.3.1