Upload folder using huggingface_hub
- .gitattributes +1 -0
- README.md +3 -9
- candidates.py +213 -0
- data.csv +0 -0
- metrics.py +188 -0
- output.csv +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+output.csv filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: green
-colorTo: yellow
+title: Candidates_viewer_NPR_challenge
+app_file: candidates.py
 sdk: gradio
-sdk_version: 5.
-app_file: app.py
-pinned: false
+sdk_version: 5.15.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
candidates.py
ADDED
@@ -0,0 +1,213 @@
+"""
+Script for joining .csv candidate data with .duckdb results.
+Launches a Gradio app to review candidates.
+"""
+import argparse
+from pathlib import Path
+import pandas as pd
+from metrics import load_results
+import numpy as np
+import json
+import ast
+import gradio as gr
+from typing import List
+from hashlib import sha256
+import re
+
+def _query_format_models(models: List[str]) -> str:
+    """
+    Format model names for the SQL query `WHERE <this_model> IN <models>`.
+    """
+    return "('" + "','".join(["completions-" + m for m in models]) + "')"
+
+def _hash(text: str) -> str:
+    return sha256(bytes(text, "utf-8")).hexdigest()
+
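
For reference, a quick illustration of what the two helpers above return (editor's sketch, not part of the commit):

    >>> _query_format_models(["r1", "gemini2"])
    "('completions-r1','completions-gemini2')"
    >>> _hash("hello")
    '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'
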
+SQL_QUERY = """
+WITH AllResults AS (
+    SELECT
+        results.parent_dir AS model,
+        *
+    FROM
+        results.completions results
+    JOIN
+        challenges challenges
+    ON
+        results.prompt_id = challenges.ID
+)
+SELECT prompt_id, model, completion, answer as solution, prompt
+FROM AllResults
+WHERE
+    AllResults.model IN {models}
+""".format(models=_query_format_models(['r1_distill_qwen32b','r1','gemini2']))
+
+
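
For reference, after `.format(...)` the final clause of `SQL_QUERY` expands to:

    WHERE
        AllResults.model IN ('completions-r1_distill_qwen32b','completions-r1','completions-gemini2')
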
+def print_info(db_connection):
+    tables = db_connection.execute("SHOW TABLES").fetchall()
+    # Iterate over each table and print its name and columns
+    for table in tables:
+        table_name = table[0]
+        print(f"Table: {table_name}")
+
+        # Get the columns for this table
+        columns = db_connection.execute(f"DESCRIBE {table_name}").fetchall()
+
+        # Print the column details
+        for column in columns:
+            print(f" - {column[0]} ({column[1]})")  # column[0] is the column name, column[1] is the data type
+
+        print()  # Add a blank line between tables for readability
+
+def _parse(x):
+    if isinstance(x, str):
+        if len(x.strip()) == 0 or x.strip() in ["]", "["]:
+            return []  # bad gen
+        else:
+            try:
+                return ast.literal_eval(x)
+            except:
+                raise ValueError(f"Bad gen: {x}")
+    elif np.isnan(x):
+        return []
+    else:
+        raise ValueError(f"Found unexpected type {type(x)}: {x}")
+
+def _concat(series: pd.Series) -> np.array:
+    items = list(filter(lambda x: len(x) > 0, map(_parse, series)))
+    if len(items) > 0:
+        return np.unique(np.concatenate(items))
+    else:
+        return items
+
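
A small illustration (editor's sketch; the column values are hypothetical) of how `_parse` and `_concat` merge stringified candidate lists coming out of the CSV:

    >>> _concat(pd.Series(['["CAT", "DOG"]', '["DOG", "EMU"]', '']))
    array(['CAT', 'DOG', 'EMU'], dtype='<U3')
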
+def check_candidates(candidates: pd.DataFrame, merged_df: pd.DataFrame):
+    """
+    Perform a variety of sanity checks, e.g.:
+    - all chunks are present
+    - all attempted answers are in the completion
+    """
+    MANUALLY_CHECKED_SPECIAL_CASES = [
+        "4fd9a9adf162fe558cd94ab7ebcf8f42882873dca133aa1a4620572caa364c0c",  # extracted as a str list, e.g. `FIED, GOA`
+        "7dd4a475af16d67ed896275674d6a9b51911a3ee22aaca84411fb0a946245fa1"
+    ]
+    for _, row in merged_df.iterrows():
+        candidates = json.loads(row["candidates"])
+        comp = row["completion"].lower()
+        for c in candidates:
+            assert c.lower() in comp or \
+                   c.lower() in re.sub(r'[^a-z0-9]', '', comp) or \
+                   row["_original_completion_hash"] in MANUALLY_CHECKED_SPECIAL_CASES, \
+                   json.dumps({"candidate": c, "completion": row["completion"], "hash": row["_original_completion_hash"]}, indent=4)
+
+    # grouped = candidates.groupby(["model","prompt_id"]).agg({"chunk_id": "unique", "num_chunks":"first"})
+    # for _,row in grouped.iterrows():
+    #     assert list(row["chunk_id"]) == range(row["num_chunks"]+1), (row["chunk_id"], row["num_chunks"])
+
+
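
The second disjunct of the assert covers candidates that only match once punctuation and spacing are stripped, which is what the `FIED, GOA` special case alludes to. Concretely (editor's illustration):

    >>> comp = "the answer is FI-ED".lower()
    >>> "fied" in re.sub(r'[^a-z0-9]', '', comp)
    True
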
+def launch_app(df: pd.DataFrame, share_demo: bool = False):
+
+    # Define function to display table and toggle completion
+    def show_table(show_completion, example_idx):
+        # Extract the row based on the slider index
+        example = df.iloc[example_idx]
+
+        # Function to highlight words from the candidates list
+        def highlight_words(text, candidates):
+            for word in candidates:
+                # Use word boundaries to ensure we only match whole words
+                text = re.sub(rf'\b({re.escape(word)})\b', r'<mark>\1</mark>', text, flags=re.IGNORECASE)
+            return text
+
+        # Highlight words in the 'completion' column
+        candidates = json.loads(example['candidates'])
+        highlighted_completion = highlight_words(example['completion'], candidates)
+
+        # Create a table with the core columns
+        table_html = f"""
+        <table>
+            <tr><td><b>Completion hash</b></td><td>{example['_original_completion_hash']}</td></tr>
+            <tr><td><b>Model</b></td><td>{example['model']}</td></tr>
+            <tr><td><b>Prompt ID</b></td><td>{example['prompt_id']}</td></tr>
+            <tr><td><b>Solution</b></td><td>{example['solution']}</td></tr>
+            <tr><td><b>Prompt</b></td><td>{example['prompt']}</td></tr>
+            <tr><td><b>Candidates</b></td><td>{candidates}</td></tr>
+        </table>
+        """
+
+        # If the toggle is checked, show the 'completion' column with highlighted words
+        if show_completion:
+            table_html += f"""
+            <br><b>Completion:</b><br>
+            <p>{highlighted_completion}</p>
+            """
+
+        return table_html
+
+    # Create the Gradio interface
+    with gr.Blocks() as demo:
+        # Slider to navigate through examples
+        example_slider = gr.Slider(minimum=0, maximum=len(df)-1, step=1, label="Example", value=0)
+
+        # Toggle button for showing/hiding completion
+        toggle_button = gr.Checkbox(label="Show Completion", value=False)
+
+        with gr.Row():
+            gr.HTML('<h1>Candidates Table</h1>')
+
+        # Table display
+        table_output = gr.HTML()
+
+        # Set interaction behavior: update the table when slider or checkbox changes
+        example_slider.change(show_table, inputs=[toggle_button, example_slider], outputs=[table_output])
+        toggle_button.change(show_table, inputs=[toggle_button, example_slider], outputs=[table_output])
+
+    # Launch the app
+    demo.launch(share=share_demo)
+
+
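
The highlighting relies on a word-boundary regex, so only whole-word, case-insensitive matches are wrapped in `<mark>` tags. Concretely (editor's illustration of the `re.sub` call above):

    >>> re.sub(rf'\b({re.escape("goa")})\b', r'<mark>\1</mark>', "Goa, not goat", flags=re.IGNORECASE)
    '<mark>Goa</mark>, not goat'
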
+def main(candidates: Path, output_csv: Path, launch_gradio: bool, share_demo: bool):
+    if not output_csv.exists():
+        candidates = pd.read_csv(candidates.as_posix())
+        conn = load_results()
+        completions = conn.sql(SQL_QUERY).df()
+
+        candidates = candidates.groupby(["model", "prompt_id", "solution", "prompt", "_original_completion_hash"]).agg({
+            "candidates": "unique"
+        }).reset_index()
+        candidates["candidates"] = candidates["candidates"].apply(lambda x: json.dumps(list(_concat(x))))
+        completions["_original_completion_hash"] = completions["completion"].apply(_hash)
+
+        df = candidates.merge(completions, on=["model", "prompt_id", "prompt", "solution", "_original_completion_hash"])
+        print(df, candidates, completions, sep="\n")
+        # print_info(conn)
+        # check_candidates(candidates, df)
+        df.to_csv(output_csv)
+
+        # tables = conn.execute("SHOW TABLES").fetchall()
+        # if not ("candidates", ) in tables:
+        #     # Create a table in DuckDB and insert the candidate data
+        #     conn.execute("CREATE TABLE candidates (model VARCHAR, prompt_id INTEGER, \
+        #         prompt VARCHAR, completion VARCHAR, solution VARCHAR, candidates VARCHAR)")
+
+        #     # Insert the list of rows into the table
+        #     for _, row in df.iterrows():
+        #         drow = [row["model"], row["prompt_id"], row["prompt"], row["completion"], row["solution"], row["candidates"]]
+        #         conn.execute("INSERT INTO candidates VALUES (?, ?, ?, ?, ?, ?)", drow)
+
+        #     conn.commit()
+        #     print_info(conn)
+        #     conn.close()
+    else:
+        df = pd.read_csv(output_csv.as_posix())
+
+    print(df)
+    if launch_gradio:
+        launch_app(df, share_demo)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("candidates", type=Path, help="path to .csv data containing extracted candidates")
+    parser.add_argument("output_csv", type=Path, help="path to .csv output file; will reload from here if path exists")
+    parser.add_argument("-gr", "--launch_gradio", action="store_true")
+    parser.add_argument("-s", "--share_demo", action="store_true")
+    args = parser.parse_args()
+    main(**vars(args))
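
A usage sketch of the entry point, equivalent to `python candidates.py data.csv output.csv --launch_gradio` (assuming `results.duckdb` and `puzzles_cleaned.csv` are present for the first run):

    from pathlib import Path
    from candidates import main

    main(Path("data.csv"), Path("output.csv"), launch_gradio=True, share_demo=False)
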
data.csv
ADDED
The diff for this file is too large to render. See raw diff.
metrics.py
ADDED
@@ -0,0 +1,188 @@
+import re
+import duckdb
+import textwrap
+from typing import List, Tuple
+import argparse
+
+def _parse_answer(text: str) -> List[List[str]]:
+    """
+    Converts text to lowercase. Then interprets ";" as a separator between
+    alternatives. Within each alternative, interprets "," and "-->" as separators
+    for elements of a set. Within each set, drops all non-alphanumeric characters
+    and returns that set.
+
+    Another way to describe this is that we interpret adjacent words as
+    phrases that must be present literally. However, comma and arrow separate
+    distinct phrases that may be present in any order. All other characters
+    are dropped.
+    """
+    text = text.lower()
+    alternatives = re.split(r';', text)
+    result = []
+    for alternative in alternatives:
+        groups = re.split(r'–?-?-?>|,', alternative)
+        result.append([" ".join(re.findall(r'\b\w+\b', group)) for group in groups])
+    return result
+
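
For instance, one alternative given as a literal phrase and another given as order-free parts (editor's illustration):

    >>> _parse_answer("LOW EARTH ORBIT; LOW --> EARTH, ORBIT")
    [['low earth orbit'], ['low', 'earth', 'orbit']]
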
+def _answer_without_thoughts(completion: str) -> str:
+    if "<think>" not in completion[:200]:
+        return completion
+
+    chunks = completion.split("</think>")
+    if len(chunks) <= 1:
+        return ""
+
+    return chunks[-1].strip()
+
+def _check_answer(completion: str, answer: str) -> bool:
+    """
+    Check that all the phrases that must appear in the answer appear in the
+    completion. We ignore "thoughts", capitalization, and punctuation.
+    """
+    completion = _answer_without_thoughts(completion).lower()
+    completion = re.sub(r'[^\w\s]', ' ', completion)  # replace punctuation with spaces, aligning with _parse_answer's " ".join
+    completion = re.sub(r'\s+', ' ', completion)  # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
+    alternative_answers = _parse_answer(answer)
+    for answer_phrases in alternative_answers:
+        # if all(phrase in completion for phrase in answer_phrases):
+        if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
+            return True
+    return False
+
+
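
Putting the three helpers together (editor's sketch of the expected behaviour):

    >>> _check_answer("<think>hmm</think> The answer is EARTH, then ORBIT!", "earth, orbit; moon")
    True
    >>> _check_answer("I'd guess MOON.", "earth, orbit; moon")
    True
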
+def _clip_text(text: str, width: int) -> str:
+    return text if len(text) <= width else text[:width] + "..."
+
+def _wrap_text(text: str, width: int) -> str:
+    return textwrap.fill(text, width=width)
+
+def load_results():
+    conn = duckdb.connect(":memory:")
+    conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
+    # conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
+    conn.execute("""
+        CREATE TABLE challenges AS
+        SELECT * FROM 'puzzles_cleaned.csv'
+        WHERE Warnings IS NULL OR Warnings NOT LIKE '%(E)%'
+    """)
+    conn.create_function("check_answer", _check_answer)
+    conn.create_function("clip_text", _clip_text)
+    conn.create_function("wrap_text", _wrap_text)
+    return conn
+
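
`load_results` registers the Python helpers as DuckDB scalar UDFs, so later queries can call them from SQL. A minimal sketch, assuming `results.duckdb` and `puzzles_cleaned.csv` exist in the working directory:

    conn = load_results()
    # check_answer is now callable inside SQL
    ok = conn.sql("SELECT check_answer('the answer is moon', 'moon') AS ok").df()
    assert bool(ok["ok"][0])
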
+def r1_accuracy_by_completion_length(conn, model_name):
+    """
+    For the responses from the completions-r1 model:
+    1. We calculate completion length and correctness for each problem.
+    2. We sort by length.
+    3. We compute the cumulative number of correct responses.
+    """
+    r1_completions = conn.sql(f"""
+        WITH LengthsAndCorrectness AS (
+            SELECT
+                LENGTH(results.completion) AS length,
+                CAST(check_answer(results.completion, challenges.answer) AS INT32) AS correct
+            FROM results.completions results JOIN challenges
+            ON results.prompt_id = challenges.ID
+            WHERE results.parent_dir = '{model_name}'
+        ),
+        TotalItems AS (
+            SELECT COUNT(*) as total_count
+            FROM LengthsAndCorrectness
+        ),
+        CumulativeCorrect AS (
+            SELECT
+                length,
+                SUM(correct) OVER (ORDER BY length) as cumulative_correct,
+            FROM LengthsAndCorrectness
+        )
+        SELECT
+            length,
+            cumulative_correct,
+            CAST(cumulative_correct AS FLOAT) / total_count AS cumulative_accuracy
+        FROM CumulativeCorrect, TotalItems
+        ORDER BY length
+    """)
+    return r1_completions
+
+
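
A usage sketch: pull the cumulative-accuracy curve into pandas for plotting (the `completions-r1` directory name is an assumption, inferred from the model list in candidates.py):

    conn = load_results()
    curve = r1_accuracy_by_completion_length(conn, "completions-r1").df()
    print(curve.head())  # columns: length, cumulative_correct, cumulative_accuracy
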
+def accuracy_by_model_and_time(conn):
+    model_accuracies = conn.sql("""
+        WITH ChallengesWithDates AS (
+            SELECT
+                ID,
+                answer,
+                EXTRACT(YEAR FROM CAST(date AS DATE)) AS year
+            FROM
+                challenges
+        ),
+        DateAnswerCheck AS (
+            SELECT
+                results.parent_dir AS model,
+                dates.year,
+                COUNT(*) AS total,
+                SUM(CAST(check_answer(results.completion, dates.answer) AS INTEGER)) AS correct
+            FROM
+                results.completions results
+            JOIN
+                ChallengesWithDates dates
+            ON
+                results.prompt_id = dates.ID
+            GROUP BY
+                results.parent_dir,
+                dates.year
+        )
+        SELECT
+            model,
+            year,
+            total,
+            correct,
+            ROUND(correct / total, 2) AS accuracy
+        FROM
+            DateAnswerCheck
+        ORDER BY
+            model,
+            year
+    """)
+
+    return model_accuracies
+
+def accuracy_by_model(conn):
+    return conn.sql("""
+        WITH AnswerCheck AS (
+            SELECT
+                results.parent_dir AS model,
+                SUM(results.count) AS total,
+                SUM(results.count * CAST(check_answer(results.completion, challenges.answer) AS INTEGER)) AS correct
+            FROM
+                results.completions results
+            JOIN
+                challenges challenges
+            ON
+                results.prompt_id = challenges.ID
+            GROUP BY
+                results.parent_dir
+        )
+        SELECT
+            model,
+            total,
+            correct,
+            ROUND(correct / total, 2) AS accuracy
+        FROM
+            AnswerCheck
+    """)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--by-model-and-time", action="store_true")
+    args = parser.parse_args()
+    conn = load_results()
+    if args.by_model_and_time:
+        print(accuracy_by_model_and_time(conn))
+    else:
+        print(accuracy_by_model(conn))
+
+if __name__ == "__main__":
+    main()
output.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ad970dce3fb60473dcbfa707515ab67dd78b3cbcc2856feeff6fcb33c918e69
+size 18655953