MJannik committed · Commit 9cf2cd8 · verified · 1 Parent(s): 6add786

fix: langfuse sdk v3
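
This migrates the notebook from the Langfuse v2 setup (manual OpenTelemetry exporter wiring plus `langfuse.trace(...)`) to the v3 SDK, which resolves a client from the `LANGFUSE_*` environment variables and records trace attributes through the span API. A minimal sketch of the v3 pattern the notebook now follows (assuming `langfuse>=3` with credentials already exported; it condenses several cells from the diff below):

```python
from langfuse import get_client  # v3: reads LANGFUSE_PUBLIC_KEY / _SECRET_KEY / _HOST
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

langfuse = get_client()
assert langfuse.auth_check(), "Check your Langfuse credentials and host"

# No hand-built TracerProvider needed anymore.
SmolagentsInstrumentor().instrument()

# Trace-level attributes are set via update_trace() on an active span.
with langfuse.start_as_current_span(name="Smolagent-Trace") as span:
    span.update_trace(user_id="smolagent-user-123", tags=["testing-agents"])
    trace_id = langfuse.get_current_trace_id()

# v2's langfuse.score() becomes create_score().
langfuse.create_score(name="user-feedback", value=1, trace_id=trace_id)
langfuse.flush()  # flush queued events in short-lived processes
```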

bonus-unit2/monitoring-and-evaluating-agents.ipynb CHANGED
@@ -4,7 +4,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
- "# Bonus Unit 2: Observability and Evaluation of Agents\n",
+ "# Bonus Unit 1: Observability and Evaluation of Agents\n",
  "\n",
  "In this tutorial, we will learn how to **monitor the internal steps (traces) of our AI agent** and **evaluate its performance** using open-source observability tools.\n",
  "\n",
@@ -44,9 +44,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "%pip install 'smolagents[telemetry]'\n",
- "%pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents\n",
- "%pip install langfuse datasets 'smolagents[gradio]' gradio"
+ "%pip install langfuse 'smolagents[telemetry]' openinference-instrumentation-smolagents datasets 'smolagents[gradio]' gradio --upgrade"
  ]
 },
 {
@@ -62,61 +60,71 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
  "metadata": {},
  "outputs": [],
  "source": [
  "import os\n",
- "import base64\n",
  "\n",
- "# Get your own keys from https://cloud.langfuse.com\n",
+ "# Get keys for your project from the project settings page: https://cloud.langfuse.com\n",
  "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\" \n",
  "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\" \n",
- "os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region example\n",
- "# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region example\n",
- "\n",
- "LANGFUSE_AUTH = base64.b64encode(\n",
- " f\"{os.environ.get('LANGFUSE_PUBLIC_KEY')}:{os.environ.get('LANGFUSE_SECRET_KEY')}\".encode()\n",
- ").decode()\n",
+ "os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region\n",
+ "# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region\n",
  "\n",
- "os.environ[\"OTEL_EXPORTER_OTLP_ENDPOINT\"] = os.environ.get(\"LANGFUSE_HOST\") + \"/api/public/otel\"\n",
- "os.environ[\"OTEL_EXPORTER_OTLP_HEADERS\"] = f\"Authorization=Basic {LANGFUSE_AUTH}\""
+ "# Set your Hugging Face and other tokens/secrets as environment variable\n",
+ "os.environ[\"HF_TOKEN\"] = \"hf_...\" "
  ]
 },
 {
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
  "metadata": {},
- "outputs": [],
  "source": [
- "# Set your Hugging Face and other tokens/secrets as environment variable\n",
- "os.environ[\"HF_TOKEN\"] = \"hf_...\" "
+ "With the environment variables set, we can now initialize the Langfuse client. get_client() initializes the Langfuse client using the credentials provided in the environment variables."
  ]
 },
 {
  "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Langfuse client is authenticated and ready!\n"
+   ]
+  }
+ ],
+ "source": [
+  "from langfuse import get_client\n",
+  " \n",
+  "langfuse = get_client()\n",
+  " \n",
+  "# Verify connection\n",
+  "if langfuse.auth_check():\n",
+  " print(\"Langfuse client is authenticated and ready!\")\n",
+  "else:\n",
+  " print(\"Authentication failed. Please check your credentials and host.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+  {
+   "name": "stderr",
+   "output_type": "stream",
+   "text": [
+    "Attempting to instrument while already instrumented\n"
+   ]
+  }
+ ],
  "source": [
- "from opentelemetry.sdk.trace import TracerProvider\n",
  "from openinference.instrumentation.smolagents import SmolagentsInstrumentor\n",
- "from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n",
- "from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n",
  " \n",
- "# Create a TracerProvider for OpenTelemetry\n",
- "trace_provider = TracerProvider()\n",
- "\n",
- "# Add a SimpleSpanProcessor with the OTLPSpanExporter to send traces\n",
- "trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n",
- "\n",
- "# Set the global default tracer provider\n",
- "from opentelemetry import trace\n",
- "trace.set_tracer_provider(trace_provider)\n",
- "tracer = trace.get_tracer(__name__)\n",
- "\n",
- "# Instrument smolagents with the configured provider\n",
- "SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)\n"
+ "SmolagentsInstrumentor().instrument()"
  ]
 },
 {
@@ -248,7 +256,7 @@
  "source": [
  "#### 3. Additional Attributes\n",
  "\n",
- "You may also pass additional attributes—such as user IDs, session IDs, or tags—by setting them on the spans. For example, smolagents instrumentation uses OpenTelemetry to attach attributes like `langfuse.user.id` or custom tags."
+ "You may also pass additional attributes to your spans. These can include `user_id`, `tags`, `session_id`, and custom metadata. Enriching traces with these details is important for analysis, debugging, and monitoring of your application’s behavior across different users or sessions."
  ]
 },
 {
@@ -258,7 +266,6 @@
  "outputs": [],
  "source": [
  "from smolagents import (CodeAgent, DuckDuckGoSearchTool, InferenceClientModel)\n",
- "from opentelemetry import trace\n",
  "\n",
  "search_tool = DuckDuckGoSearchTool()\n",
  "agent = CodeAgent(\n",
@@ -266,12 +273,25 @@
  " model=InferenceClientModel()\n",
  ")\n",
  "\n",
- "with tracer.start_as_current_span(\"Smolagent-Trace\") as span:\n",
- " span.set_attribute(\"langfuse.user.id\", \"smolagent-user-123\")\n",
- " span.set_attribute(\"langfuse.session.id\", \"smolagent-session-123456789\")\n",
- " span.set_attribute(\"langfuse.tags\", [\"city-question\", \"testing-agents\"])\n",
- "\n",
- " agent.run(\"What is the capital of Germany?\")"
+ "with langfuse.start_as_current_span(\n",
+ " name=\"Smolagent-Trace\",\n",
+ " ) as span:\n",
+ " \n",
+ " # Run your application here\n",
+ " response = agent.run(\"What is the capital of Germany?\")\n",
+ " \n",
+ " # Pass additional attributes to the span\n",
+ " span.update_trace(\n",
+ " input=\"What is the capital of Germany?\",\n",
+ " output=response,\n",
+ " user_id=\"smolagent-user-123\",\n",
+ " session_id=\"smolagent-session-123456789\",\n",
+ " tags=[\"city-question\", \"testing-agents\"],\n",
+ " metadata={\"email\": \"[email protected]\"},\n",
+ " )\n",
+ " \n",
+ "# Flush events in short-lived applications\n",
+ "langfuse.flush()"
  ]
 },
 {
@@ -289,7 +309,7 @@
  "\n",
  "If your agent is embedded into a user interface, you can record direct user feedback (like a thumbs-up/down in a chat UI). Below is an example using [Gradio](https://gradio.app/) to embed a chat with a simple feedback mechanism.\n",
  "\n",
- "In the code snippet below, when a user sends a chat message, we capture the OpenTelemetry trace ID. If the user likes/dislikes the last answer, we attach a score to the trace."
+ "In the code snippet below, when a user sends a chat message, we capture the trace in Langfuse. If the user likes/dislikes the last answer, we attach a score to the trace."
  ]
 },
 {
@@ -299,26 +319,25 @@
  "outputs": [],
  "source": [
  "import gradio as gr\n",
- "from opentelemetry.trace import format_trace_id\n",
  "from smolagents import (CodeAgent, InferenceClientModel)\n",
- "from langfuse import Langfuse\n",
+ "from langfuse import get_client\n",
+ "\n",
+ "langfuse = get_client()\n",
  "\n",
- "langfuse = Langfuse()\n",
  "model = InferenceClientModel()\n",
  "agent = CodeAgent(tools=[], model=model, add_base_tools=True)\n",
  "\n",
- "formatted_trace_id = None # We'll store the current trace_id globally for demonstration\n",
+ "trace_id = None\n",
  "\n",
  "def respond(prompt, history):\n",
- " with trace.get_tracer(__name__).start_as_current_span(\"Smolagent-Trace\") as span:\n",
+ " with langfuse.start_as_current_span(\n",
+ " name=\"Smolagent-Trace\"):\n",
+ " \n",
+ " # Run your application here\n",
  " output = agent.run(prompt)\n",
  "\n",
- " current_span = trace.get_current_span()\n",
- " span_context = current_span.get_span_context()\n",
- " trace_id = span_context.trace_id\n",
- " global formatted_trace_id\n",
- " formatted_trace_id = str(format_trace_id(trace_id))\n",
- " langfuse.trace(id=formatted_trace_id, input=prompt, output=output)\n",
+ " global trace_id\n",
+ " trace_id = langfuse.get_current_trace_id()\n",
  "\n",
  " history.append({\"role\": \"assistant\", \"content\": str(output)})\n",
  " return history\n",
@@ -326,16 +345,16 @@
  "def handle_like(data: gr.LikeData):\n",
  " # For demonstration, we map user feedback to a 1 (like) or 0 (dislike)\n",
  " if data.liked:\n",
- " langfuse.score(\n",
+ " langfuse.create_score(\n",
  " value=1,\n",
  " name=\"user-feedback\",\n",
- " trace_id=formatted_trace_id\n",
+ " trace_id=trace_id\n",
  " )\n",
  " else:\n",
- " langfuse.score(\n",
+ " langfuse.create_score(\n",
  " value=0,\n",
  " name=\"user-feedback\",\n",
- " trace_id=formatted_trace_id\n",
+ " trace_id=trace_id\n",
  " )\n",
  "\n",
  "with gr.Blocks() as demo:\n",
@@ -470,8 +489,8 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "from langfuse import Langfuse\n",
- "langfuse = Langfuse()\n",
+ "from langfuse import get_client\n",
+ "langfuse = get_client()\n",
  "\n",
  "langfuse_dataset_name = \"gsm8k_dataset_huggingface\"\n",
  "\n",
@@ -517,7 +536,7 @@
  "#### Running the Agent on the Dataset\n",
  "\n",
  "We define a helper function `run_smolagent()` that:\n",
- "1. Starts an OpenTelemetry span\n",
+ "1. Starts a Langfuse span\n",
  "2. Runs our agent on the prompt\n",
  "3. Records the trace ID in Langfuse\n",
  "\n",
@@ -532,6 +551,10 @@
  "source": [
  "from opentelemetry.trace import format_trace_id\n",
  "from smolagents import (CodeAgent, InferenceClientModel, LiteLLMModel)\n",
+ "from langfuse import get_client\n",
+ " \n",
+ "langfuse = get_client()\n",
+ "\n",
  "\n",
  "# Example: using InferenceClientModel or LiteLLMModel to access openai, anthropic, gemini, etc. models:\n",
  "model = InferenceClientModel()\n",
@@ -542,52 +565,39 @@
  " add_base_tools=True\n",
  ")\n",
  "\n",
+ "dataset_name = \"gsm8k_dataset_huggingface\"\n",
+ "current_run_name = \"smolagent-notebook-run-01\" # Identifies this specific evaluation run\n",
+ " \n",
+ "# Assume 'run_smolagent' is your instrumented application function\n",
  "def run_smolagent(question):\n",
- " with tracer.start_as_current_span(\"Smolagent-Trace\") as span:\n",
- " span.set_attribute(\"langfuse.tag\", \"dataset-run\")\n",
- " output = agent.run(question)\n",
- "\n",
- " current_span = trace.get_current_span()\n",
- " span_context = current_span.get_span_context()\n",
- " trace_id = span_context.trace_id\n",
- " formatted_trace_id = format_trace_id(trace_id)\n",
- "\n",
- " langfuse_trace = langfuse.trace(\n",
- " id=formatted_trace_id, \n",
- " input=question, \n",
- " output=output\n",
+ " with langfuse.start_as_current_generation(name=\"qna-llm-call\") as generation:\n",
+ " # Simulate LLM call\n",
+ " result = agent.run(question)\n",
+ " \n",
+ " # Update the trace with the input and output\n",
+ " generation.update_trace(\n",
+ " input= question,\n",
+ " output=result,\n",
  " )\n",
- " return langfuse_trace, output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset = langfuse.get_dataset(langfuse_dataset_name)\n",
- "\n",
- "# Run our agent against each dataset item (limited to first 10 above)\n",
+ " \n",
+ " return result\n",
+ " \n",
+ "dataset = langfuse.get_dataset(name=dataset_name) # Fetch your pre-populated dataset\n",
+ " \n",
  "for item in dataset.items:\n",
- " langfuse_trace, output = run_smolagent(item.input[\"text\"])\n",
- "\n",
- " # Link the trace to the dataset item for analysis\n",
- " item.link(\n",
- " langfuse_trace,\n",
- " run_name=\"smolagent-notebook-run-01\",\n",
- " run_metadata={ \"model\": model.model_id }\n",
- " )\n",
- "\n",
- " # Optionally, store a quick evaluation score for demonstration\n",
- " langfuse_trace.score(\n",
- " name=\"<example_eval>\",\n",
- " value=1,\n",
- " comment=\"This is a comment\"\n",
- " )\n",
- "\n",
- "# Flush data to ensure all telemetry is sent\n",
- "langfuse.flush()"
+ " \n",
+ " # Use the item.run() context manager\n",
+ " with item.run(\n",
+ " run_name=current_run_name,\n",
+ " run_metadata={\"model_provider\": \"Hugging Face\", \"temperature_setting\": 0.7},\n",
+ " run_description=\"Evaluation run for GSM8K dataset\"\n",
+ " ) as root_span: # root_span is the root span of the new trace for this item and run.\n",
+ " # All subsequent langfuse operations within this block are part of this trace.\n",
+ " \n",
+ " # Call your application logic\n",
+ " generated_answer = run_smolagent(question=item.input[\"text\"])\n",
+ " \n",
+ " print(item.input)"
  ]
 },
 {
@@ -625,7 +635,7 @@
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "Python 3",
+ "display_name": ".venv",
  "language": "python",
  "name": "python3"
  },
 
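
For the dataset experiment, v3 replaces the manual `item.link(...)` calls with the `item.run()` context manager, which opens a fresh trace per dataset item and links it to a named run. A condensed sketch of that loop (assuming the `gsm8k_dataset_huggingface` dataset from the diff already exists in Langfuse; `answer_question` is a hypothetical stand-in for the instrumented agent call):

```python
from langfuse import get_client

langfuse = get_client()
dataset = langfuse.get_dataset(name="gsm8k_dataset_huggingface")

def answer_question(text: str) -> str:
    # Stand-in for run_smolagent() from the notebook.
    return "42"

for item in dataset.items:
    # item.run() starts a new trace for this item and attaches it to the run.
    with item.run(run_name="smolagent-notebook-run-01") as root_span:
        answer = answer_question(item.input["text"])
        root_span.update_trace(input=item.input["text"], output=answer)

langfuse.flush()  # ensure all queued events are sent
```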