Spaces:
Running
Running
Commit
·
1e310b7
1
Parent(s):
8b6f89e
docs minus assets
Browse files- .gitignore +4 -1
- Dockerfile +1 -1
- README.md +12 -1
- index.html +1 -1
- parallel_eval/README.md +59 -0
- parallel_eval/game.py +310 -0
- parallel_eval/proctor.py +233 -0
- parallel_eval/requirements.txt +5 -0
- parallel_eval/supernodes.json +19 -0
- src/components/viewer-tab.tsx +69 -5
.gitignore
CHANGED
@@ -28,4 +28,7 @@ tmp
|
|
28 |
|
29 |
qwen3-final-results.json
|
30 |
|
31 |
-
__pycache__
|
|
|
|
|
|
|
|
28 |
|
29 |
qwen3-final-results.json
|
30 |
|
31 |
+
__pycache__
|
32 |
+
.venv
|
33 |
+
proctor_tmp
|
34 |
+
wikihop.db
|
Dockerfile
CHANGED
@@ -53,7 +53,7 @@ RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
|
|
53 |
curl https://huggingface.co/api/whoami-v2 -H "Authorization: Bearer $(cat /run/secrets/HF_TOKEN)"
|
54 |
|
55 |
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
|
56 |
-
curl -L https://huggingface.co/HuggingFaceTB/simplewiki-pruned-text-350k/
|
57 |
|
58 |
ENV WIKISPEEDIA_DB_PATH=/home/user/app/wikihop.db
|
59 |
|
|
|
53 |
curl https://huggingface.co/api/whoami-v2 -H "Authorization: Bearer $(cat /run/secrets/HF_TOKEN)"
|
54 |
|
55 |
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
|
56 |
+
curl -L https://huggingface.co/datasets/HuggingFaceTB/simplewiki-pruned-text-350k/resolve/main/wikihop.db -H "Authorization: Bearer $(cat /run/secrets/HF_TOKEN)" -o wikihop.db
|
57 |
|
58 |
ENV WIKISPEEDIA_DB_PATH=/home/user/app/wikihop.db
|
59 |
|
README.md
CHANGED
@@ -9,4 +9,15 @@ hf_oauth: true
|
|
9 |
hf_oauth_scopes:
|
10 |
- inference-api
|
11 |
- email
|
12 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
hf_oauth_scopes:
|
10 |
- inference-api
|
11 |
- email
|
12 |
+
---
|
13 |
+
|
14 |
+
# Can you wikirace faster than an LLM? 🏁
|
15 |
+
|
16 |
+
Go head-to-head with Qwen, Gemma, and DeepSeek on the [Huggingface Space](https://huggingface.co/spaces/HuggingFaceTB/Wikispeedia)
|
17 |
+
|
18 |
+
<!-- add gifs -->
|
19 |
+

|
20 |
+
|
21 |
+
Or run 100s of agents on any model in parallel for efficient evaluations [see README](parallel_eval)
|
22 |
+
|
23 |
+

|
index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
<meta charset="UTF-8" />
|
5 |
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
7 |
-
<title>
|
8 |
</head>
|
9 |
<body>
|
10 |
<div id="root"></div>
|
|
|
4 |
<meta charset="UTF-8" />
|
5 |
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
7 |
+
<title>WikiRacing LLMs</title>
|
8 |
</head>
|
9 |
<body>
|
10 |
<div id="root"></div>
|
parallel_eval/README.md
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Setup env
|
2 |
+
|
3 |
+
```bash
|
4 |
+
uv venv
|
5 |
+
source .venv/bin/activate
|
6 |
+
uv pip install -r requirements.txt
|
7 |
+
|
8 |
+
# pull wikihop db
|
9 |
+
wget https://huggingface.co/datasets/HuggingFaceTB/simplewiki-pruned-text-350k/resolve/main/wikihop.db -O wikihop.db
|
10 |
+
```
|
11 |
+
|
12 |
+
## Which models does it support?
|
13 |
+
Under the hood it uses [LiteLLM](https://github.com/BerriAI/litellm) so you can use any major model (don't forget to export the appropriate API key), or host any model on huggingface via [vLLM](https://github.com/vllm-project/vllm).
|
14 |
+
|
15 |
+
|
16 |
+
## Play the game
|
17 |
+
```
|
18 |
+
# play the game with cli
|
19 |
+
python game.py --human --start 'Saint Lucia' --end 'Italy' --db wikihop.db
|
20 |
+
|
21 |
+
# have the agent play the game (gpt-4o)
|
22 |
+
export OPENAI_API_KEY=sk_xxxxx
|
23 |
+
python game.py --agent --start 'Saint Lucia' --end 'Italy' --db wikihop.db --model gpt-4o --max-steps 20
|
24 |
+
|
25 |
+
# run an evaluation suite with qwen3 hosted on vLLM, 200 workers
|
26 |
+
python proctor.py --model "hosted_vllm/Qwen/Qwen3-30B-A3B" --api-base "http://localhost:8000/v1" --workers 200
|
27 |
+
|
28 |
+
# this will produce a `proctor_tmp/proctor_1-final-results.json` that can be visualized in the space, as well as the individual reasoning traces for each run. This is resumable if it is stopped and is idempotent.
|
29 |
+
```
|
30 |
+
|
31 |
+
## JQ command to strip out reasoning traces
|
32 |
+
This output file will be very large because it contains all the reasoning traces. You can shrink it down and still be able to visualize it with
|
33 |
+
|
34 |
+
```bash
|
35 |
+
jq '{
|
36 |
+
article_list: .article_list,
|
37 |
+
num_trials: .num_trials,
|
38 |
+
num_workers: .num_workers,
|
39 |
+
max_steps: .max_steps,
|
40 |
+
agent_settings: .agent_settings,
|
41 |
+
runs: [.runs[] | {
|
42 |
+
model: .model,
|
43 |
+
api_base: .api_base,
|
44 |
+
max_links: .max_links,
|
45 |
+
max_tries: .max_tries, result: .result,
|
46 |
+
start_article: .start_article,
|
47 |
+
destination_article: .destination_article,
|
48 |
+
steps: [.steps[] | {
|
49 |
+
type: .type,
|
50 |
+
article: .article,
|
51 |
+
metadata: (if .metadata.conversation then
|
52 |
+
.metadata | del(.conversation)
|
53 |
+
else
|
54 |
+
.metadata
|
55 |
+
end)
|
56 |
+
}]
|
57 |
+
}]
|
58 |
+
}' proctor_tmp/proctor_1-final-results.json > cleaned_data.json
|
59 |
+
```
|
parallel_eval/game.py
ADDED
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Dict, Optional
|
2 |
+
import sqlite3
|
3 |
+
import json
|
4 |
+
import litellm
|
5 |
+
import re
|
6 |
+
import asyncio
|
7 |
+
import argparse
|
8 |
+
from functools import lru_cache
|
9 |
+
class SQLiteDB:
    """Read-only accessor for the wikihop SQLite database.

    Article lookups are memoized per instance so that the many concurrent
    games run by the proctor do not repeatedly query the same pages.
    """

    def __init__(self, db_path: str):
        """Open the database at ``db_path`` and count the available articles."""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row
        self.cursor = self.conn.cursor()
        # BUG FIX: decorating the method with @lru_cache at class level keys
        # cache entries on `self`, sharing one global cache across instances
        # and keeping every instance alive for the cache's lifetime. Wrap the
        # bound method here instead so each instance owns its own cache.
        self.get_article_with_links = lru_cache(maxsize=8192)(self._get_article_with_links)
        self._article_count = self._get_article_count()
        print(f"Connected to SQLite database with {self._article_count} articles")

    def _get_article_count(self) -> int:
        """Return the total number of rows in core_articles."""
        self.cursor.execute("SELECT COUNT(*) FROM core_articles")
        return self.cursor.fetchone()[0]

    def _get_article_with_links(self, article_title: str) -> Tuple[Optional[str], List[str]]:
        """Look up an article by exact title.

        Returns (title, links). When the title is not found, returns
        (None, []) rather than raising.
        """
        self.cursor.execute(
            "SELECT title, links_json FROM core_articles WHERE title = ?",
            (article_title,),
        )
        article = self.cursor.fetchone()
        if not article:
            return None, []

        links = json.loads(article["links_json"])
        return article["title"], links
|
35 |
+
|
36 |
+
|
37 |
+
class Player:
    """Interactive human player: link choices are read from stdin."""

    def __init__(self, name: str):
        self.name = name

    async def get_move(self, game_state: List[Dict]) -> Tuple[str, Dict]:
        """Print the current article's links and ask the user to pick one.

        Returns (chosen_link, metadata) where the chosen link is selected
        by 0-based index.
        """
        print("Link choices:")
        for i, link in enumerate(game_state[-1]["links"]):
            print(f"{i}: {link}")

        idx = int(input("Enter the index of the link you want to select: "))
        # BUG FIX: the message previously interpolated the loop variable `i`
        # (always the last index printed) instead of the selected index `idx`.
        return game_state[-1]["links"][idx], {
            "message": f"{self.name} selected link #{idx}"
        }


class AgentPlayer(Player):
    """LLM-backed player that chooses links via litellm chat completions.

    Parameters:
        model: litellm model identifier (e.g. "gpt-4o").
        api_base: base URL of the completion endpoint (None for provider default).
        verbose: kept for interface compatibility.
        max_links: when set, only the first `max_links` links of the current
            article are offered to and selectable by the model. BUG FIX: this
            parameter was previously accepted (and documented in the CLI help)
            but never applied.
        max_tries: number of re-prompts allowed after a malformed answer.
        target_article: destination page; normally injected by Game.
        seed: sampling seed forwarded to the completion API.
    """

    def __init__(
        self,
        model: str,
        api_base: str,
        verbose: bool = True,
        max_links=None,
        max_tries=10,
        target_article=None,
        seed=None,
    ):
        super().__init__(model)
        self.model = model
        self.api_base = api_base
        self.verbose = verbose
        self.max_links = max_links
        self.max_tries = max_tries
        self.target_article = target_article
        self.seed = seed

    def _candidate_links(self, game_state: List[Dict]) -> List[str]:
        """Links offered to the model, truncated to max_links when set."""
        links = game_state[-1]["links"]
        return links[: self.max_links] if self.max_links else links

    async def get_move(self, game_state: List[Dict]) -> Tuple[str, Dict]:
        """Ask the model for a link choice, retrying on malformed answers.

        Returns (chosen_link, metadata). After max_tries failed attempts
        returns (-1, metadata) so the Game can record a loss.
        """
        links = self._candidate_links(game_state)
        prompt = self.construct_prompt(game_state)

        conversation = [
            {"role": "user", "content": prompt}
        ]

        for try_number in range(self.max_tries):
            response = await litellm.acompletion(
                model=self.model,
                api_base=self.api_base,
                messages=conversation,
                seed=self.seed
            )
            response = response.choices[0].message.content

            conversation.append({"role": "assistant", "content": response})

            answer, message = self._attempt_to_extract_answer(response, maximum_answer=len(links))

            # there was a problem with the answer so give the model another chance
            if answer == -1:
                conversation.append({"role": "user", "content": message})
                continue

            assert 1 <= answer <= len(links), f"Answer {answer} is out of range"

            # we found an answer so we can return it (answers are 1-based)
            return links[answer - 1], {"tries": try_number, "conversation": conversation}

        # we tried the max number of times and still didn't find an answer
        return -1, {"tries": self.max_tries, "conversation": conversation}

    def construct_prompt(self, game_state: List[Dict]) -> str:
        """Build the navigation prompt for the current game state."""
        current = game_state[-1]["article"]
        target = self.target_article
        available_links = self._candidate_links(game_state)
        formatted_links = "\n".join(f"{i+1}. {link}" for i, link in enumerate(available_links))
        path_so_far = [step["article"] for step in game_state]

        try:
            formatted_path = ' -> '.join(path_so_far)
        except Exception as e:
            print(f"Error formatting path: {e}")
            print(game_state)
            print("Path so far: ", path_so_far)
            raise e

        return f"""You are playing WikiRun, trying to navigate from one Wikipedia article to another using only links.

IMPORTANT: You MUST put your final answer in <answer>NUMBER</answer> tags, where NUMBER is the link number.
For example, if you want to choose link 3, output <answer>3</answer>.

Current article: {current}
Target article: {target}
Available links (numbered):
{formatted_links}

Your path so far: {formatted_path}

Think about which link is most likely to lead you toward the target article.
First, analyze each link briefly and how it connects to your goal, then select the most promising one.

Remember to format your final answer by explicitly writing out the xml number tags like this: <answer>NUMBER</answer>
"""

    def _attempt_to_extract_answer(self, response: str, maximum_answer: Optional[int] = None) -> Tuple[int, str]:
        """Parse a 1-based link number out of <answer>N</answer> tags.

        Returns (answer, "") on success, or (-1, feedback_message) when the
        response is missing, ambiguous, non-numeric, or out of range.
        """
        # Extract choice using format <answer>N</answer>
        choice_match = re.search(r"<answer>(\d+)</answer>", response)

        if choice_match is None:
            return -1, f"No answer found in response. Please respond with a number between 1 and {maximum_answer} in <answer>NUMBER</answer> tags."

        # check if there are multiple answers
        multiple_answers = re.findall(r"<answer>(\d+)</answer>", response)
        if len(multiple_answers) > 1:
            return -1, "Multiple answers found in response. Please respond with just one."

        answer = choice_match.group(1)

        # try to convert to int
        try:
            answer = int(answer)
        except ValueError:
            return -1, f"You answered with {answer} but it could not be converted to an integer. Please respond with a number between 1 and {maximum_answer}."

        # check if the answer is too high or too low
        if answer > maximum_answer or answer < 1:
            return -1, f"You answered with {answer} but you have to select a number between 1 and {maximum_answer}."

        return answer, ""  # we found an answer so we don't need to return a message
|
165 |
+
|
166 |
+
class Game:
    """Drives one WikiRun: repeatedly asks `player` for a move until the
    target article is reached, the player fails, or the step budget is
    spent. `run()` returns the full list of recorded step dicts."""

    def __init__(
        self,
        start_article: str,
        target_article: str,
        db: SQLiteDB,
        max_allowed_steps: int,
        player: Player,
        verbose: bool = True,
    ):
        self.start_article = start_article
        self.target_article = target_article
        self.db = db
        self.max_allowed_steps = max_allowed_steps
        self.player = player
        self.verbose = verbose
        self.steps = []
        self.steps_taken = 0
        # Ensure the player knows the target article
        if isinstance(self.player, AgentPlayer):
            self.player.target_article = self.target_article

    async def run(self):
        """Play the game to completion and return the recorded steps."""
        if self.verbose:
            print(f"Starting game from {self.start_article} to {self.target_article}")

        # Record the starting position and its outgoing links.
        _, start_links = self.db.get_article_with_links(self.start_article)
        self.steps.append(
            {
                "type": "start",
                "article": self.start_article,
                "links": start_links,
                "metadata": {"message": "Game started"},
            }
        )

        # Keep moving until we win, lose, or exhaust the step budget.
        while self.steps_taken < self.max_allowed_steps:
            self.steps_taken += 1

            # Await the async player move
            chosen, move_metadata = await self.player.get_move(self.steps)

            # The player could not produce a valid link choice.
            if chosen == -1:
                self.steps.append(
                    {"type": "lose", "article": chosen, "metadata": move_metadata}
                )
                break

            if self.verbose:
                print(f" -> Step {self.steps_taken}: {chosen}")

            # Reaching the target ends the game immediately.
            if chosen == self.target_article:
                self.steps.append(
                    {"type": "win", "article": chosen, "metadata": move_metadata}
                )
                break

            # Otherwise fetch the next article's links.
            _, next_links = self.db.get_article_with_links(chosen)

            # A dead-end page (no outgoing links) is a loss.
            if not next_links:
                self.steps.append(
                    {"type": "lose", "article": chosen, "metadata": move_metadata}
                )
                break

            self.steps.append(
                {
                    "type": "move",
                    "article": chosen,
                    "links": next_links,
                    "metadata": move_metadata,
                }
            )

        return self.steps
|
249 |
+
|
250 |
+
|
251 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Play the WikiRun game")

    # Exactly one of --human / --agent must be supplied.
    player_group = parser.add_mutually_exclusive_group(required=True)
    player_group.add_argument("--human", action="store_true", help="Play as a human")
    player_group.add_argument("--agent", action="store_true", help="Use an AI agent to play")

    # Game parameters
    parser.add_argument("--start", type=str, default="British Library", help="Starting article title")
    parser.add_argument("--end", type=str, default="Saint Lucia", help="Target article title")
    parser.add_argument("--db", type=str, required=True, help="Path to SQLite database")
    parser.add_argument("--max-steps", type=int, default=10, help="Maximum number of steps allowed (default: 10)")

    # Agent parameters (only used with --agent)
    parser.add_argument("--model", type=str, default="gpt-4o", help="Model to use for the agent (default: gpt-4o)")
    parser.add_argument("--api-base", type=str, default="https://api.openai.com/v1",
                        help="API base URL (default: https://api.openai.com/v1)")
    parser.add_argument("--max-links", type=int, default=200, help="Maximum number of links to consider (default: 200)")
    parser.add_argument("--max-tries", type=int, default=3, help="Maximum number of tries for the agent (default: 3)")
    parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility")

    args = parser.parse_args()

    # Open the article database.
    database = SQLiteDB(args.db)

    # Build the requested player implementation.
    if args.human:
        chosen_player = Player("Human")
    else:  # args.agent is True
        chosen_player = AgentPlayer(
            model=args.model,
            api_base=args.api_base,
            verbose=True,
            max_links=args.max_links,
            max_tries=args.max_tries,
            target_article=args.end,
            seed=args.seed,
        )

    # Assemble and play a single game.
    wikirun = Game(
        start_article=args.start,
        target_article=args.end,
        db=database,
        max_allowed_steps=args.max_steps,
        player=chosen_player,
        verbose=True,
    )

    game_steps = asyncio.run(wikirun.run())

    # Dump a human-readable transcript of the game.
    print(f"Game over in {len(game_steps)} steps")
    for step_number, step in enumerate(game_steps):
        print(f"Step {step_number}: {step['type']}")
        print(f" Article: {step['article']}")
        print(f" Links: {step.get('links', [])}")
        print(f" Metadata: {step.get('metadata', {})}")
        print("\n\n")
|
parallel_eval/proctor.py
ADDED
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from game import AgentPlayer, SQLiteDB, Game
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import asyncio
|
5 |
+
import argparse
|
6 |
+
|
7 |
+
|
8 |
+
class Proctor:
    """Schedules and evaluates many WikiRun games in parallel.

    Builds one Run per ordered (start, destination) article pair per trial,
    executes them with a bounded number of concurrent workers, then merges
    the per-run JSON files into a single final-results file.
    """

    def __init__(
        self,
        article_list: list[tuple[str, str]],
        num_trials: int,
        num_workers: int,
        max_steps: int,
        agent_settings: dict,
        db_path: str,
        verbose: bool = True,
        output_dir: str = "./proctor_tmp",
        proctor_id: str = "proctor_1",
        starting_seed: int = 42,
    ):
        self.article_list = article_list
        self.num_trials = num_trials
        self.num_workers = num_workers
        self.max_steps = max_steps
        self.agent_settings = agent_settings
        self.db_path = db_path
        self.verbose = verbose
        self.output_dir = output_dir
        self.proctor_id = proctor_id
        self.db = SQLiteDB(self.db_path)
        self.starting_seed = starting_seed

        os.makedirs(self.output_dir, exist_ok=True)

        self.runs = []

        self.setup_runs()

    def setup_runs(self):
        """Create one Run per ordered pair of distinct articles per trial."""
        for start in self.article_list:
            for destination in self.article_list:
                if start == destination:
                    continue
                for n in range(self.num_trials):
                    run_id = f"{self.proctor_id}_{start}_{destination}_{n}"
                    self.runs.append(
                        Run(
                            start,
                            destination,
                            self.max_steps,
                            self.agent_settings,
                            self.db,
                            self.output_dir,
                            self.verbose,
                            run_id,
                            # vary the seed per trial so repeat trials differ
                            self.starting_seed + n,
                        )
                    )
                    print(f"Setup run {run_id}")

    async def run(self):
        """Execute all runs with at most num_workers in flight, then analyze."""
        semaphore = asyncio.Semaphore(self.num_workers)
        tasks = []

        async def run_with_semaphore(run_instance):
            # the semaphore bounds concurrency at num_workers
            async with semaphore:
                if self.verbose:
                    print(f"Starting run {run_instance.id}")
                await run_instance.run()
                if self.verbose:
                    print(f"Finished run {run_instance.id}")

        for run_instance in self.runs:
            tasks.append(asyncio.create_task(run_with_semaphore(run_instance)))

        await asyncio.gather(*tasks)

        self.analyze_runs()

    def analyze_runs(self):
        """Aggregate all per-run JSON files into a single final-results .json."""
        final_results = {
            "article_list": self.article_list,
            "num_trials": self.num_trials,
            "num_workers": self.num_workers,
            "max_steps": self.max_steps,
            "agent_settings": self.agent_settings,
            "runs": [],
        }

        win_count = 0
        lose_count = 0
        hops_distribution = []

        for run in self.runs:
            with open(run.output_file, "r") as f:
                result = json.load(f)
                final_results["runs"].append(result)
                if result["result"] == "win":
                    win_count += 1
                    # hops = number of moves, i.e. steps minus the start record
                    hops_distribution.append(len(result["steps"]) - 1)
                else:
                    lose_count += 1

        total_runs = len(self.runs)
        final_results["hops_distribution"] = hops_distribution
        # BUG FIX: guard the divisions -- a session with zero wins (or an
        # empty run list) previously raised ZeroDivisionError here and lost
        # the whole evaluation's aggregate output.
        final_results["average_hops"] = (
            sum(hops_distribution) / len(hops_distribution) if hops_distribution else 0
        )
        final_results["win_rate"] = win_count / total_runs if total_runs else 0
        final_results["lose_rate"] = lose_count / total_runs if total_runs else 0

        with open(f"{self.output_dir}/{self.proctor_id}-final-results.json", "w") as f:
            json.dump(final_results, f, indent=4)
|
113 |
+
|
114 |
+
|
115 |
+
class Run:
    """One start→destination evaluation game whose result is persisted
    to a JSON file. A run is idempotent: if its output file already
    exists it is skipped, making a proctor session resumable."""

    def __init__(
        self,
        start_article: str,
        destination_article: str,
        max_steps: int,
        agent_settings: dict,
        db: SQLiteDB,
        output_dir: str,
        verbose: bool,
        id: str,
        seed: int,
    ):
        self.start_article = start_article
        self.destination_article = destination_article
        self.max_steps = max_steps
        self.agent_settings = agent_settings
        self.db = db
        self.output_dir = output_dir
        self.verbose = verbose
        self.id = id
        self.seed = seed
        # Existence of this file marks the run as completed.
        self.output_file = f"{self.output_dir}/run_{self.id}.json"

    async def run(self):
        """Play one game and dump its result JSON; no-op if already done."""
        if os.path.exists(self.output_file):
            return

        agent = AgentPlayer(
            model=self.agent_settings["model"],
            api_base=self.agent_settings["api_base"],
            max_links=self.agent_settings["max_links"],
            max_tries=self.agent_settings["max_tries"],
            verbose=False,
            seed=self.seed,
        )

        match = Game(
            self.start_article,
            self.destination_article,
            self.db,
            self.max_steps,
            agent,
            verbose=False,
        )

        steps = await match.run()

        # Persist the settings alongside the transcript so each file is
        # self-describing; the final step's type is the game outcome.
        record = {
            "model": self.agent_settings["model"],
            "api_base": self.agent_settings["api_base"],
            "max_links": self.agent_settings["max_links"],
            "max_tries": self.agent_settings["max_tries"],
            "start_article": self.start_article,
            "destination_article": self.destination_article,
            "steps": steps,
            "seed": self.seed,
            "result": steps[-1]["type"],
        }

        with open(self.output_file, "w") as f:
            json.dump(record, f, indent=4)

        print(f"Run {self.id} completed in {len(steps)} steps")
|
180 |
+
|
181 |
+
|
182 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run parallel Wikispeedia evaluations")
    parser.add_argument("--model", type=str, default="gpt-4o", help="Model to use for agent")
    parser.add_argument("--api-base", type=str, default=None, help="API base URL for hosted models")
    parser.add_argument("--workers", type=int, default=20, help="Number of parallel workers")
    parser.add_argument("--trials", type=int, default=1, help="Number of trials per start-destination pair")
    parser.add_argument("--max-steps", type=int, default=20, help="Maximum steps per game")
    parser.add_argument("--max-links", type=int, default=200, help="Maximum links per page for agent")
    parser.add_argument("--max-tries", type=int, default=3, help="Maximum retries for agent")
    parser.add_argument("--db-path", type=str, default="wikihop.db", help="Path to the wikihop database")
    parser.add_argument("--output-dir", type=str, default="./proctor_tmp", help="Directory for output files")
    parser.add_argument("--proctor-id", type=str, default="proctor_1", help="Unique identifier for this proctor run")
    parser.add_argument("--seed", type=int, default=42, help="Starting random seed")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    parser.add_argument("--article-list", type=str, default="supernodes.json",
                        help="Path to JSON file with list of articles to test")

    args = parser.parse_args()

    # Fail fast on missing inputs before spinning anything up.
    if not os.path.exists(args.db_path):
        raise FileNotFoundError(f"Database file not found at {args.db_path}")

    if not os.path.exists(args.article_list):
        raise FileNotFoundError(f"Article list file not found at {args.article_list}")

    # Load the set of articles used as both starts and destinations.
    with open(args.article_list, "r") as f:
        article_list = json.load(f)

    # Settings forwarded verbatim to every AgentPlayer.
    agent_settings = {
        "model": args.model,
        "api_base": args.api_base,
        "max_links": args.max_links,
        "max_tries": args.max_tries,
    }

    evaluation = Proctor(
        article_list=article_list,
        num_trials=args.trials,
        num_workers=args.workers,
        max_steps=args.max_steps,
        agent_settings=agent_settings,
        db_path=args.db_path,
        verbose=args.verbose,
        output_dir=args.output_dir,
        proctor_id=args.proctor_id,
        starting_seed=args.seed,
    )

    asyncio.run(evaluation.run())
|
parallel_eval/requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
litellm>=1.10.0
|
2 |
+
# NOTE: asyncio is part of the Python standard library; installing the PyPI
# "asyncio" backport can shadow the stdlib module on Python 3 -- do not add it.
|
3 |
+
tqdm
|
4 |
+
# NOTE: sqlite3 ships with the Python standard library; no extra package needed.
|
5 |
+
aiohttp
|
parallel_eval/supernodes.json
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
"Soviet Union",
|
3 |
+
"Frank Lloyd Wright",
|
4 |
+
"Major League Baseball",
|
5 |
+
"R (programming language)",
|
6 |
+
"Hinduism",
|
7 |
+
"Singapore General Hospital",
|
8 |
+
"Nepenthes",
|
9 |
+
"Google AI",
|
10 |
+
"Freedom, Pennsylvania",
|
11 |
+
"Iron Man 3",
|
12 |
+
"Central Bank of Nigeria",
|
13 |
+
"Pok\u00e9mon",
|
14 |
+
"Nintendo",
|
15 |
+
"Bachelor of Arts",
|
16 |
+
"Polynesian languages",
|
17 |
+
"France",
|
18 |
+
"Jennifer Aniston"
|
19 |
+
]
|
src/components/viewer-tab.tsx
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
import q3Results from "../../results/qwen3.json"
|
4 |
import q3_30B_A3B_Results from "../../results/qwen3-30B-A3-results.json"
|
5 |
// import mockResults from "../../qwen3-final-results.json"
|
6 |
-
import { useMemo, useState, useEffect } from "react";
|
7 |
import { Card } from "@/components/ui/card";
|
8 |
import ForceDirectedGraph from "@/components/force-directed-graph";
|
9 |
import RunsList from "@/components/runs-list";
|
@@ -16,8 +16,10 @@ import {
|
|
16 |
} from "@/components/ui/select";
|
17 |
import { Run as ForceGraphRun } from "@/components/reasoning-trace";
|
18 |
import { Badge } from "@/components/ui/badge";
|
|
|
|
|
19 |
|
20 |
-
const
|
21 |
"Qwen3-14B": q3Results,
|
22 |
"Qwen3-30B-A3B": q3_30B_A3B_Results,
|
23 |
}
|
@@ -51,10 +53,12 @@ export default function ViewerTab({
|
|
51 |
const [runs, setRuns] = useState<Run[]>([]);
|
52 |
const [selectedModel, setSelectedModel] = useState<string>("Qwen3-14B");
|
53 |
const [modelStats, setModelStats] = useState<ModelStats | null>(null);
|
|
|
|
|
54 |
|
55 |
useEffect(() => {
|
56 |
// Convert the model data to the format expected by RunsList
|
57 |
-
const convertedRuns = models[selectedModel]
|
58 |
start_article: string;
|
59 |
destination_article: string;
|
60 |
steps: { type: string; article: string }[];
|
@@ -64,7 +68,7 @@ export default function ViewerTab({
|
|
64 |
destination_article: run.destination_article,
|
65 |
steps: run.steps.map((step: { article: string }) => step.article),
|
66 |
result: run.result
|
67 |
-
}));
|
68 |
setRuns(convertedRuns);
|
69 |
|
70 |
// Calculate model statistics
|
@@ -105,7 +109,7 @@ export default function ViewerTab({
|
|
105 |
minSteps,
|
106 |
maxSteps
|
107 |
});
|
108 |
-
}, [selectedModel]);
|
109 |
|
110 |
const handleRunSelect = (runId: number) => {
|
111 |
setSelectedRun(runId);
|
@@ -124,6 +128,49 @@ export default function ViewerTab({
|
|
124 |
}));
|
125 |
}, [filterRuns]);
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
return (
|
128 |
<div className="grid grid-cols-1 md:grid-cols-12 gap-4 h-[calc(100vh-200px)] max-h-[calc(100vh-200px)] overflow-hidden p-2">
|
129 |
<Card className="p-3 col-span-12 row-start-1">
|
@@ -143,6 +190,23 @@ export default function ViewerTab({
|
|
143 |
</Select>
|
144 |
</div>
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
{modelStats && (
|
147 |
<div className="flex flex-wrap gap-1.5 items-center">
|
148 |
<Badge variant="outline" className="px-2 py-0.5 flex gap-1 items-center">
|
|
|
3 |
import q3Results from "../../results/qwen3.json"
|
4 |
import q3_30B_A3B_Results from "../../results/qwen3-30B-A3-results.json"
|
5 |
// import mockResults from "../../qwen3-final-results.json"
|
6 |
+
import { useMemo, useState, useEffect, useRef } from "react";
|
7 |
import { Card } from "@/components/ui/card";
|
8 |
import ForceDirectedGraph from "@/components/force-directed-graph";
|
9 |
import RunsList from "@/components/runs-list";
|
|
|
16 |
} from "@/components/ui/select";
|
17 |
import { Run as ForceGraphRun } from "@/components/reasoning-trace";
|
18 |
import { Badge } from "@/components/ui/badge";
|
19 |
+
import { Button } from "@/components/ui/button";
|
20 |
+
import { UploadIcon } from "lucide-react";
|
21 |
|
22 |
+
const defaultModels = {
|
23 |
"Qwen3-14B": q3Results,
|
24 |
"Qwen3-30B-A3B": q3_30B_A3B_Results,
|
25 |
}
|
|
|
53 |
const [runs, setRuns] = useState<Run[]>([]);
|
54 |
const [selectedModel, setSelectedModel] = useState<string>("Qwen3-14B");
|
55 |
const [modelStats, setModelStats] = useState<ModelStats | null>(null);
|
56 |
+
const [models, setModels] = useState(defaultModels);
|
57 |
+
const fileInputRef = useRef<HTMLInputElement>(null);
|
58 |
|
59 |
useEffect(() => {
|
60 |
// Convert the model data to the format expected by RunsList
|
61 |
+
const convertedRuns = models[selectedModel]?.runs?.map((run: {
|
62 |
start_article: string;
|
63 |
destination_article: string;
|
64 |
steps: { type: string; article: string }[];
|
|
|
68 |
destination_article: run.destination_article,
|
69 |
steps: run.steps.map((step: { article: string }) => step.article),
|
70 |
result: run.result
|
71 |
+
})) || [];
|
72 |
setRuns(convertedRuns);
|
73 |
|
74 |
// Calculate model statistics
|
|
|
109 |
minSteps,
|
110 |
maxSteps
|
111 |
});
|
112 |
+
}, [selectedModel, models]);
|
113 |
|
114 |
const handleRunSelect = (runId: number) => {
|
115 |
setSelectedRun(runId);
|
|
|
128 |
}));
|
129 |
}, [filterRuns]);
|
130 |
|
131 |
+
const handleFileUpload = (event: React.ChangeEvent<HTMLInputElement>) => {
|
132 |
+
const file = event.target.files?.[0];
|
133 |
+
if (!file) return;
|
134 |
+
|
135 |
+
const reader = new FileReader();
|
136 |
+
reader.onload = (e) => {
|
137 |
+
try {
|
138 |
+
const jsonData = JSON.parse(e.target?.result as string);
|
139 |
+
|
140 |
+
// Validate the JSON structure has the required fields
|
141 |
+
if (!jsonData.runs || !Array.isArray(jsonData.runs)) {
|
142 |
+
alert("Invalid JSON format. File must contain a 'runs' array.");
|
143 |
+
return;
|
144 |
+
}
|
145 |
+
|
146 |
+
// Create a filename-based model name, removing extension and path
|
147 |
+
const fileName = file.name.replace(/\.[^/.]+$/, "");
|
148 |
+
const modelName = `Custom: ${fileName}`;
|
149 |
+
|
150 |
+
// Add the new model to the models object
|
151 |
+
setModels(prev => ({
|
152 |
+
...prev,
|
153 |
+
[modelName]: jsonData
|
154 |
+
}));
|
155 |
+
|
156 |
+
// Select the newly added model
|
157 |
+
setSelectedModel(modelName);
|
158 |
+
} catch (error) {
|
159 |
+
alert(`Error parsing JSON file: ${error.message}`);
|
160 |
+
}
|
161 |
+
};
|
162 |
+
reader.readAsText(file);
|
163 |
+
|
164 |
+
// Reset the file input
|
165 |
+
if (fileInputRef.current) {
|
166 |
+
fileInputRef.current.value = '';
|
167 |
+
}
|
168 |
+
};
|
169 |
+
|
170 |
+
const handleUploadClick = () => {
|
171 |
+
fileInputRef.current?.click();
|
172 |
+
};
|
173 |
+
|
174 |
return (
|
175 |
<div className="grid grid-cols-1 md:grid-cols-12 gap-4 h-[calc(100vh-200px)] max-h-[calc(100vh-200px)] overflow-hidden p-2">
|
176 |
<Card className="p-3 col-span-12 row-start-1">
|
|
|
190 |
</Select>
|
191 |
</div>
|
192 |
|
193 |
+
<Button
|
194 |
+
variant="outline"
|
195 |
+
size="sm"
|
196 |
+
className="flex items-center gap-1"
|
197 |
+
onClick={handleUploadClick}
|
198 |
+
>
|
199 |
+
<UploadIcon size={14} />
|
200 |
+
<span>Upload JSON</span>
|
201 |
+
<input
|
202 |
+
type="file"
|
203 |
+
ref={fileInputRef}
|
204 |
+
accept=".json"
|
205 |
+
className="hidden"
|
206 |
+
onChange={handleFileUpload}
|
207 |
+
/>
|
208 |
+
</Button>
|
209 |
+
|
210 |
{modelStats && (
|
211 |
<div className="flex flex-wrap gap-1.5 items-center">
|
212 |
<Badge variant="outline" className="px-2 py-0.5 flex gap-1 items-center">
|