nielsr HF staff committed on
Commit
3170ddb
1 Parent(s): 4dd059d

Update requirements

Browse files
Files changed (3) hide show
  1. app.py +37 -15
  2. load_dataframe.py +43 -0
  3. requirements.txt +5 -5
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import matplotlib.pyplot as plt
6
 
7
  # from load_dataframe import get_data
 
8
 
9
 
10
  def aggregated_data(df, aggregation_level="week"):
@@ -22,6 +23,13 @@ def aggregated_data(df, aggregation_level="week"):
22
  # Calculate the percentage of papers with artifacts
23
  percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
24
 
 
 
 
 
 
 
 
25
  # Create the plot
26
  plt.figure(figsize=(12, 6))
27
  plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with at least 1 Artifact')
@@ -40,38 +48,49 @@ def aggregated_data(df, aggregation_level="week"):
40
 
41
 
42
  def display_data(df):
43
- num_artifacts = df[(df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)].shape[0]
 
 
 
 
 
 
44
 
45
  st.markdown(f"""
46
- ## Number of papers: {df.shape[0]}
47
- #### Number of papers with a Github link: {df['github'].notnull().sum()}
48
- #### Number of papers with at least one HF artifact: {num_artifacts}
 
 
49
  """)
50
 
51
  st.write("Papers with at least one artifact")
52
- df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
53
- st.dataframe(df[df['has_artifact']],
54
  hide_index=True,
55
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
56
  column_config={"github": st.column_config.LinkColumn(),
57
- "paper_page": st.column_config.LinkColumn()},
58
- width=2000)
 
 
59
 
60
  st.write("Papers without artifacts")
61
- st.dataframe(df[~df['has_artifact']],
62
  hide_index=True,
63
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
64
  column_config={"github": st.column_config.LinkColumn(),
65
  "paper_page": st.column_config.LinkColumn()},
66
- width=2000)
 
67
 
68
  st.write("Papers with a HF mention in README but no artifacts")
69
- st.dataframe(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
70
  hide_index=True,
71
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
72
  column_config={"github": st.column_config.LinkColumn(),
73
  "paper_page": st.column_config.LinkColumn()},
74
- width=2000)
 
75
 
76
 
77
  def main():
@@ -90,6 +109,9 @@ def main():
90
  df.index = pd.to_datetime(df.index)
91
  df = df.sort_index()
92
 
 
 
 
93
  if selection == "Daily/weekly/monthly data":
94
  # Button to select day, month or week
95
  # Add streamlit selectbox.
 
5
  import matplotlib.pyplot as plt
6
 
7
  # from load_dataframe import get_data
8
+ from urllib.parse import quote
9
 
10
 
11
  def aggregated_data(df, aggregation_level="week"):
 
23
  # Calculate the percentage of papers with artifacts
24
  percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
25
 
26
+ # Calculate the growth rate
27
+ growth_rate = percentage_papers_with_artifacts.pct_change() * 100
28
+
29
+ # Display the latest growth rate as a big number
30
+ latest_growth_rate = growth_rate.iloc[-1] if not growth_rate.empty else 0
31
+ st.metric(label=f"{aggregation_level.capitalize()}ly Growth Rate", value=f"{latest_growth_rate:.2f}%")
32
+
33
  # Create the plot
34
  plt.figure(figsize=(12, 6))
35
  plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with at least 1 Artifact')
 
48
 
49
 
50
  def display_data(df):
51
+ df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
52
+ num_artifacts = df['has_artifact'].sum()
53
+ percentage_of_at_least_one_artifact = num_artifacts / df.shape[0] if df.shape[0] > 0 else 0
54
+ percentage_of_at_least_one_artifact = round(percentage_of_at_least_one_artifact * 100, 2)
55
+
56
+ # add reached out column
57
+ df['reached_out'] = [False for _ in range(df.shape[0])]
58
 
59
  st.markdown(f"""
60
+ ## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact
61
+
62
+ * Number of papers: {df.shape[0]}
63
+ * Number of papers with a Github link: {df['github'].notnull().sum()}
64
+ * Number of papers with at least one HF artifact: {num_artifacts}
65
  """)
66
 
67
  st.write("Papers with at least one artifact")
68
+ st.data_editor(df[df['has_artifact']],
 
69
  hide_index=True,
70
+ column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
71
  column_config={"github": st.column_config.LinkColumn(),
72
+ "paper_page": st.column_config.LinkColumn(),
73
+ "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)')},
74
+ width=2000,
75
+ key="papers_with_artifacts")
76
 
77
  st.write("Papers without artifacts")
78
+ st.data_editor(df[~df['has_artifact']],
79
  hide_index=True,
80
+ column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
81
  column_config={"github": st.column_config.LinkColumn(),
82
  "paper_page": st.column_config.LinkColumn()},
83
+ width=2000,
84
+ key="papers_without_artifacts")
85
 
86
  st.write("Papers with a HF mention in README but no artifacts")
87
+ st.data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
88
  hide_index=True,
89
+ column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
90
  column_config={"github": st.column_config.LinkColumn(),
91
  "paper_page": st.column_config.LinkColumn()},
92
+ width=2000,
93
+ key="papers_with_hf_mention_no_artifacts")
94
 
95
 
96
  def main():
 
109
  df.index = pd.to_datetime(df.index)
110
  df = df.sort_index()
111
 
112
+ # hack: include title in URL column
113
+ df['updated_url'] = df.apply(lambda row: f'{row["paper_page"]}/title/{quote(row["title"])}', axis=1)
114
+
115
  if selection == "Daily/weekly/monthly data":
116
  # Button to select day, month or week
117
  # Add streamlit selectbox.
load_dataframe.py CHANGED
@@ -21,6 +21,10 @@ class PaperInfo:
21
 
22
 
23
  def get_df() -> pd.DataFrame:
 
 
 
 
24
  df = pd.merge(
25
  left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
26
  right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
@@ -112,6 +116,42 @@ def add_hf_assets(batch):
112
  return batch
113
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  @st.cache_data
116
  def get_data() -> pd.DataFrame:
117
  """
@@ -132,6 +172,9 @@ def get_data() -> pd.DataFrame:
132
  # step 3. enrich using Hugging Face API
133
  dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
134
 
 
 
 
135
  # return as Pandas dataframe
136
  dataframe = dataset.to_pandas()
137
 
 
21
 
22
 
23
  def get_df() -> pd.DataFrame:
24
+ """
25
+ Load the initial dataset as a Pandas dataframe.
26
+ """
27
+
28
  df = pd.merge(
29
  left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
30
  right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
 
116
  return batch
117
 
118
 
119
+ def check_hf_mention(batch):
120
+ """
121
+ Check if a paper mentions Hugging Face in the README.
122
+ """
123
+
124
+ hf_mentions = []
125
+ for github_url in batch["github"]:
126
+ hf_mention = 0
127
+ if github_url != "":
128
+ # get README text using Github API
129
+ owner = github_url.split("/")[-2]
130
+ repo = github_url.split("/")[-1]
131
+ branch = "main"
132
+ url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
133
+ response = requests.get(url)
134
+
135
+ if response.status_code != 200:
136
+ # try master branch as second attempt
137
+ branch = "master"
138
+ url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
139
+ response = requests.get(url)
140
+
141
+ if response.status_code == 200:
142
+ # get text
143
+ text = response.text
144
+ if "huggingface" in text.lower() or "hugging face" in text.lower():
145
+ hf_mention = 1
146
+
147
+ hf_mentions.append(hf_mention)
148
+
149
+ # overwrite the Github links
150
+ batch["hf_mention"] = hf_mentions
151
+
152
+ return batch
153
+
154
+
155
  @st.cache_data
156
  def get_data() -> pd.DataFrame:
157
  """
 
172
  # step 3. enrich using Hugging Face API
173
  dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
174
 
175
+ # step 4. check if Hugging Face is mentioned in the README
176
+ dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())
177
+
178
  # return as Pandas dataframe
179
  dataframe = dataset.to_pandas()
180
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- streamlit
2
- plotly
3
- tqdm
4
- datasets
5
- paperswithcode
 
1
+ streamlit==1.36.0
2
+ matplotlib==3.7.0
3
+ tqdm==4.66.4
4
+ datasets==2.20.0
5
+ paperswithcode-client==0.3.1