MJannik committed · Commit 9cf2cd8 · verified · 1 Parent(s): 6add786

fix: langfuse sdk v3
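
This migrates the notebook from the Langfuse v2 setup (manual OpenTelemetry exporter wiring plus `langfuse.trace(...)`) to the v3 SDK, which resolves a client from the `LANGFUSE_*` environment variables and records trace attributes through the span API. A minimal sketch of the v3 pattern the notebook now follows (assuming `langfuse>=3` with credentials already exported; it condenses several cells from the diff below):

```python
from langfuse import get_client  # v3: reads LANGFUSE_PUBLIC_KEY / _SECRET_KEY / _HOST
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

langfuse = get_client()
assert langfuse.auth_check(), "Check your Langfuse credentials and host"

# No hand-built TracerProvider needed anymore.
SmolagentsInstrumentor().instrument()

# Trace-level attributes are set via update_trace() on an active span.
with langfuse.start_as_current_span(name="Smolagent-Trace") as span:
    span.update_trace(user_id="smolagent-user-123", tags=["testing-agents"])
    trace_id = langfuse.get_current_trace_id()

# v2's langfuse.score() becomes create_score().
langfuse.create_score(name="user-feedback", value=1, trace_id=trace_id)
langfuse.flush()  # flush queued events in short-lived processes
```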

bonus-unit2/monitoring-and-evaluating-agents.ipynb CHANGED
@@ -4,7 +4,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
- "# Bonus Unit 2: Observability and Evaluation of Agents\n",
+ "# Bonus Unit 1: Observability and Evaluation of Agents\n",
  "\n",
  "In this tutorial, we will learn how to **monitor the internal steps (traces) of our AI agent** and **evaluate its performance** using open-source observability tools.\n",
  "\n",
@@ -44,9 +44,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "%pip install 'smolagents[telemetry]'\n",
- "%pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents\n",
- "%pip install langfuse datasets 'smolagents[gradio]' gradio"
+ "%pip install langfuse 'smolagents[telemetry]' openinference-instrumentation-smolagents datasets 'smolagents[gradio]' gradio --upgrade"
  ]
 },
 {
@@ -62,61 +60,71 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
  "metadata": {},
  "outputs": [],
  "source": [
  "import os\n",
- "import base64\n",
  "\n",
- "# Get your own keys from https://cloud.langfuse.com\n",
+ "# Get keys for your project from the project settings page: https://cloud.langfuse.com\n",
  "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\" \n",
  "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\" \n",
- "os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region example\n",
- "# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region example\n",
- "\n",
- "LANGFUSE_AUTH = base64.b64encode(\n",
- " f\"{os.environ.get('LANGFUSE_PUBLIC_KEY')}:{os.environ.get('LANGFUSE_SECRET_KEY')}\".encode()\n",
- ").decode()\n",
+ "os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region\n",
+ "# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region\n",
  "\n",
- "os.environ[\"OTEL_EXPORTER_OTLP_ENDPOINT\"] = os.environ.get(\"LANGFUSE_HOST\") + \"/api/public/otel\"\n",
- "os.environ[\"OTEL_EXPORTER_OTLP_HEADERS\"] = f\"Authorization=Basic {LANGFUSE_AUTH}\""
+ "# Set your Hugging Face and other tokens/secrets as environment variable\n",
+ "os.environ[\"HF_TOKEN\"] = \"hf_...\" "
  ]
 },
 {
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
  "metadata": {},
- "outputs": [],
  "source": [
- "# Set your Hugging Face and other tokens/secrets as environment variable\n",
- "os.environ[\"HF_TOKEN\"] = \"hf_...\" "
+ "With the environment variables set, we can now initialize the Langfuse client. get_client() initializes the Langfuse client using the credentials provided in the environment variables."
  ]
 },
 {
  "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Langfuse client is authenticated and ready!\n"
+   ]
+  }
+ ],
+ "source": [
+  "from langfuse import get_client\n",
+  " \n",
+  "langfuse = get_client()\n",
+  " \n",
+  "# Verify connection\n",
+  "if langfuse.auth_check():\n",
+  " print(\"Langfuse client is authenticated and ready!\")\n",
+  "else:\n",
+  " print(\"Authentication failed. Please check your credentials and host.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+  {
+   "name": "stderr",
+   "output_type": "stream",
+   "text": [
+    "Attempting to instrument while already instrumented\n"
+   ]
+  }
+ ],
  "source": [
- "from opentelemetry.sdk.trace import TracerProvider\n",
  "from openinference.instrumentation.smolagents import SmolagentsInstrumentor\n",
- "from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n",
- "from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n",
  " \n",
- "# Create a TracerProvider for OpenTelemetry\n",
- "trace_provider = TracerProvider()\n",
- "\n",
- "# Add a SimpleSpanProcessor with the OTLPSpanExporter to send traces\n",
- "trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n",
- "\n",
- "# Set the global default tracer provider\n",
- "from opentelemetry import trace\n",
- "trace.set_tracer_provider(trace_provider)\n",
- "tracer = trace.get_tracer(__name__)\n",
- "\n",
- "# Instrument smolagents with the configured provider\n",
- "SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)\n"
+ "SmolagentsInstrumentor().instrument()"
  ]
 },
 {
@@ -248,7 +256,7 @@
  "source": [
  "#### 3. Additional Attributes\n",
  "\n",
- "You may also pass additional attributes—such as user IDs, session IDs, or tags—by setting them on the spans. For example, smolagents instrumentation uses OpenTelemetry to attach attributes like `langfuse.user.id` or custom tags."
+ "You may also pass additional attributes to your spans. These can include `user_id`, `tags`, `session_id`, and custom metadata. Enriching traces with these details is important for analysis, debugging, and monitoring of your application’s behavior across different users or sessions."
  ]
 },
 {
@@ -258,7 +266,6 @@
  "outputs": [],
  "source": [
  "from smolagents import (CodeAgent, DuckDuckGoSearchTool, InferenceClientModel)\n",
- "from opentelemetry import trace\n",
  "\n",
  "search_tool = DuckDuckGoSearchTool()\n",
  "agent = CodeAgent(\n",
@@ -266,12 +273,25 @@
  " model=InferenceClientModel()\n",
  ")\n",
  "\n",
- "with tracer.start_as_current_span(\"Smolagent-Trace\") as span:\n",
- " span.set_attribute(\"langfuse.user.id\", \"smolagent-user-123\")\n",
- " span.set_attribute(\"langfuse.session.id\", \"smolagent-session-123456789\")\n",
- " span.set_attribute(\"langfuse.tags\", [\"city-question\", \"testing-agents\"])\n",
- "\n",
- " agent.run(\"What is the capital of Germany?\")"
+ "with langfuse.start_as_current_span(\n",
+ " name=\"Smolagent-Trace\",\n",
+ " ) as span:\n",
+ " \n",
+ " # Run your application here\n",
+ " response = agent.run(\"What is the capital of Germany?\")\n",
+ " \n",
+ " # Pass additional attributes to the span\n",
+ " span.update_trace(\n",
+ " input=\"What is the capital of Germany?\",\n",
+ " output=response,\n",
+ " user_id=\"smolagent-user-123\",\n",
+ " session_id=\"smolagent-session-123456789\",\n",
+ " tags=[\"city-question\", \"testing-agents\"],\n",
+ " metadata={\"email\": \"[email protected]\"},\n",
+ " )\n",
+ " \n",
+ "# Flush events in short-lived applications\n",
+ "langfuse.flush()"
  ]
 },
 {
@@ -289,7 +309,7 @@
  "\n",
  "If your agent is embedded into a user interface, you can record direct user feedback (like a thumbs-up/down in a chat UI). Below is an example using [Gradio](https://gradio.app/) to embed a chat with a simple feedback mechanism.\n",
  "\n",
- "In the code snippet below, when a user sends a chat message, we capture the OpenTelemetry trace ID. If the user likes/dislikes the last answer, we attach a score to the trace."
+ "In the code snippet below, when a user sends a chat message, we capture the trace in Langfuse. If the user likes/dislikes the last answer, we attach a score to the trace."
  ]
 },
 {
@@ -299,26 +319,25 @@
  "outputs": [],
  "source": [
  "import gradio as gr\n",
- "from opentelemetry.trace import format_trace_id\n",
  "from smolagents import (CodeAgent, InferenceClientModel)\n",
- "from langfuse import Langfuse\n",
+ "from langfuse import get_client\n",
+ "\n",
+ "langfuse = get_client()\n",
  "\n",
- "langfuse = Langfuse()\n",
  "model = InferenceClientModel()\n",
  "agent = CodeAgent(tools=[], model=model, add_base_tools=True)\n",
  "\n",
- "formatted_trace_id = None # We'll store the current trace_id globally for demonstration\n",
+ "trace_id = None\n",
  "\n",
  "def respond(prompt, history):\n",
- " with trace.get_tracer(__name__).start_as_current_span(\"Smolagent-Trace\") as span:\n",
+ " with langfuse.start_as_current_span(\n",
+ " name=\"Smolagent-Trace\"):\n",
+ " \n",
+ " # Run your application here\n",
  " output = agent.run(prompt)\n",
  "\n",
- " current_span = trace.get_current_span()\n",
- " span_context = current_span.get_span_context()\n",
- " trace_id = span_context.trace_id\n",
- " global formatted_trace_id\n",
- " formatted_trace_id = str(format_trace_id(trace_id))\n",
- " langfuse.trace(id=formatted_trace_id, input=prompt, output=output)\n",
+ " global trace_id\n",
+ " trace_id = langfuse.get_current_trace_id()\n",
  "\n",
  " history.append({\"role\": \"assistant\", \"content\": str(output)})\n",
  " return history\n",
@@ -326,16 +345,16 @@
  "def handle_like(data: gr.LikeData):\n",
  " # For demonstration, we map user feedback to a 1 (like) or 0 (dislike)\n",
  " if data.liked:\n",
- " langfuse.score(\n",
+ " langfuse.create_score(\n",
  " value=1,\n",
  " name=\"user-feedback\",\n",
- " trace_id=formatted_trace_id\n",
+ " trace_id=trace_id\n",
  " )\n",
  " else:\n",
- " langfuse.score(\n",
+ " langfuse.create_score(\n",
  " value=0,\n",
  " name=\"user-feedback\",\n",
- " trace_id=formatted_trace_id\n",
+ " trace_id=trace_id\n",
  " )\n",
  "\n",
  "with gr.Blocks() as demo:\n",
@@ -470,8 +489,8 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "from langfuse import Langfuse\n",
- "langfuse = Langfuse()\n",
+ "from langfuse import get_client\n",
+ "langfuse = get_client()\n",
  "\n",
  "langfuse_dataset_name = \"gsm8k_dataset_huggingface\"\n",
  "\n",
@@ -517,7 +536,7 @@
  "#### Running the Agent on the Dataset\n",
  "\n",
  "We define a helper function `run_smolagent()` that:\n",
- "1. Starts an OpenTelemetry span\n",
+ "1. Starts a Langfuse span\n",
  "2. Runs our agent on the prompt\n",
  "3. Records the trace ID in Langfuse\n",
  "\n",
@@ -532,6 +551,10 @@
  "source": [
  "from opentelemetry.trace import format_trace_id\n",
  "from smolagents import (CodeAgent, InferenceClientModel, LiteLLMModel)\n",
+ "from langfuse import get_client\n",
+ " \n",
+ "langfuse = get_client()\n",
+ "\n",
  "\n",
  "# Example: using InferenceClientModel or LiteLLMModel to access openai, anthropic, gemini, etc. models:\n",
  "model = InferenceClientModel()\n",
@@ -542,52 +565,39 @@
  " add_base_tools=True\n",
  ")\n",
  "\n",
+ "dataset_name = \"gsm8k_dataset_huggingface\"\n",
+ "current_run_name = \"smolagent-notebook-run-01\" # Identifies this specific evaluation run\n",
+ " \n",
+ "# Assume 'run_smolagent' is your instrumented application function\n",
  "def run_smolagent(question):\n",
- " with tracer.start_as_current_span(\"Smolagent-Trace\") as span:\n",
- " span.set_attribute(\"langfuse.tag\", \"dataset-run\")\n",
- " output = agent.run(question)\n",
- "\n",
- " current_span = trace.get_current_span()\n",
- " span_context = current_span.get_span_context()\n",
- " trace_id = span_context.trace_id\n",
- " formatted_trace_id = format_trace_id(trace_id)\n",
- "\n",
- " langfuse_trace = langfuse.trace(\n",
- " id=formatted_trace_id, \n",
- " input=question, \n",
- " output=output\n",
+ " with langfuse.start_as_current_generation(name=\"qna-llm-call\") as generation:\n",
+ " # Simulate LLM call\n",
+ " result = agent.run(question)\n",
+ " \n",
+ " # Update the trace with the input and output\n",
+ " generation.update_trace(\n",
+ " input= question,\n",
+ " output=result,\n",
  " )\n",
- " return langfuse_trace, output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset = langfuse.get_dataset(langfuse_dataset_name)\n",
- "\n",
- "# Run our agent against each dataset item (limited to first 10 above)\n",
+ " \n",
+ " return result\n",
+ " \n",
+ "dataset = langfuse.get_dataset(name=dataset_name) # Fetch your pre-populated dataset\n",
+ " \n",
  "for item in dataset.items:\n",
- " langfuse_trace, output = run_smolagent(item.input[\"text\"])\n",
- "\n",
- " # Link the trace to the dataset item for analysis\n",
- " item.link(\n",
- " langfuse_trace,\n",
- " run_name=\"smolagent-notebook-run-01\",\n",
- " run_metadata={ \"model\": model.model_id }\n",
- " )\n",
- "\n",
- " # Optionally, store a quick evaluation score for demonstration\n",
- " langfuse_trace.score(\n",
- " name=\"<example_eval>\",\n",
- " value=1,\n",
- " comment=\"This is a comment\"\n",
- " )\n",
- "\n",
- "# Flush data to ensure all telemetry is sent\n",
- "langfuse.flush()"
+ " \n",
+ " # Use the item.run() context manager\n",
+ " with item.run(\n",
+ " run_name=current_run_name,\n",
+ " run_metadata={\"model_provider\": \"Hugging Face\", \"temperature_setting\": 0.7},\n",
+ " run_description=\"Evaluation run for GSM8K dataset\"\n",
+ " ) as root_span: # root_span is the root span of the new trace for this item and run.\n",
+ " # All subsequent langfuse operations within this block are part of this trace.\n",
+ " \n",
+ " # Call your application logic\n",
+ " generated_answer = run_smolagent(question=item.input[\"text\"])\n",
+ " \n",
+ " print(item.input)"
  ]
 },
 {
@@ -625,7 +635,7 @@
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "Python 3",
+ "display_name": ".venv",
  "language": "python",
  "name": "python3"
  },
 
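
For the dataset experiment, v3 replaces the manual `item.link(...)` calls with the `item.run()` context manager, which opens a fresh trace per dataset item and links it to a named run. A condensed sketch of that loop (assuming the `gsm8k_dataset_huggingface` dataset from the diff already exists in Langfuse; `answer_question` is a hypothetical stand-in for the instrumented agent call):

```python
from langfuse import get_client

langfuse = get_client()
dataset = langfuse.get_dataset(name="gsm8k_dataset_huggingface")

def answer_question(text: str) -> str:
    # Stand-in for run_smolagent() from the notebook.
    return "42"

for item in dataset.items:
    # item.run() starts a new trace for this item and attaches it to the run.
    with item.run(run_name="smolagent-notebook-run-01") as root_span:
        answer = answer_question(item.input["text"])
        root_span.update_trace(input=item.input["text"], output=answer)

langfuse.flush()  # ensure all queued events are sent
```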