fix: langfuse sdk v3
#101
opened by MJannik
bonus-unit2/monitoring-and-evaluating-agents.ipynb
CHANGED
@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Bonus Unit …
+"# Bonus Unit 1: Observability and Evaluation of Agents\n",
 "\n",
 "In this tutorial, we will learn how to **monitor the internal steps (traces) of our AI agent** and **evaluate its performance** using open-source observability tools.\n",
 "\n",
@@ -44,9 +44,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%pip install 'smolagents[telemetry]' …
-"%pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents\n",
-"%pip install langfuse datasets 'smolagents[gradio]' gradio"
+"%pip install langfuse 'smolagents[telemetry]' openinference-instrumentation-smolagents datasets 'smolagents[gradio]' gradio --upgrade"
 ]
 },
 {
@@ -62,61 +60,71 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {},
 "outputs": [],
 "source": [
 "import os\n",
-"import base64\n",
 "\n",
-"# Get your …
+"# Get keys for your project from the project settings page: https://cloud.langfuse.com\n",
 "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\" \n",
 "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\" \n",
-"os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" …
-"# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" …
-"\n",
-"LANGFUSE_AUTH = base64.b64encode(\n",
-"    f\"{os.environ.get('LANGFUSE_PUBLIC_KEY')}:{os.environ.get('LANGFUSE_SECRET_KEY')}\".encode()\n",
-").decode()\n",
+"os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region\n",
+"# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region\n",
 "\n",
-"…
-"os.environ[\"…
+"# Set your Hugging Face and other tokens/secrets as environment variable\n",
+"os.environ[\"HF_TOKEN\"] = \"hf_...\" "
 ]
 },
 {
-"cell_type": "code",
-"execution_count": null,
+"cell_type": "markdown",
 "metadata": {},
-"outputs": [],
 "source": [
-"…
-"os.environ[\"HF_TOKEN\"] = \"hf_...\" "
+"With the environment variables set, we can now initialize the Langfuse client. get_client() initializes the Langfuse client using the credentials provided in the environment variables."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
+"execution_count": 12,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Langfuse client is authenticated and ready!\n"
+]
+}
+],
+"source": [
+"from langfuse import get_client\n",
+" \n",
+"langfuse = get_client()\n",
+" \n",
+"# Verify connection\n",
+"if langfuse.auth_check():\n",
+"    print(\"Langfuse client is authenticated and ready!\")\n",
+"else:\n",
+"    print(\"Authentication failed. Please check your credentials and host.\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 13,
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Attempting to instrument while already instrumented\n"
+]
+}
+],
 "source": [
-"from opentelemetry.sdk.trace import TracerProvider\n",
 "from openinference.instrumentation.smolagents import SmolagentsInstrumentor\n",
-"from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n",
-"from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n",
 " \n",
-"…
-"trace_provider = TracerProvider()\n",
-"\n",
-"# Add a SimpleSpanProcessor with the OTLPSpanExporter to send traces\n",
-"trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n",
-"\n",
-"# Set the global default tracer provider\n",
-"from opentelemetry import trace\n",
-"trace.set_tracer_provider(trace_provider)\n",
-"tracer = trace.get_tracer(__name__)\n",
-"\n",
-"# Instrument smolagents with the configured provider\n",
-"SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)\n"
+"SmolagentsInstrumentor().instrument()"
 ]
 },
 {
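
Taken together, this hunk swaps the manual OpenTelemetry wiring (TracerProvider, OTLPSpanExporter, base64 auth headers) for the v3 client, which owns the tracing setup itself. A minimal sketch of the resulting flow, assuming `langfuse>=3` and placeholder keys:

```python
import os

# Placeholder credentials -- substitute your own project keys
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-..."
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-..."
os.environ["LANGFUSE_HOST"] = "https://cloud.langfuse.com"  # EU region

from langfuse import get_client
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

# get_client() builds the client from the LANGFUSE_* variables set above
langfuse = get_client()
assert langfuse.auth_check(), "check your credentials and host"

# No explicit TracerProvider/OTLPSpanExporter in v3: the client configures
# OpenTelemetry internally, and the instrumentor hooks into that setup
SmolagentsInstrumentor().instrument()
```
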
@@ -248,7 +256,7 @@
 "source": [
 "#### 3. Additional Attributes\n",
 "\n",
-"You may also pass additional attributes …
+"You may also pass additional attributes to your spans. These can include `user_id`, `tags`, `session_id`, and custom metadata. Enriching traces with these details is important for analysis, debugging, and monitoring of your application’s behavior across different users or sessions."
 ]
 },
 {
@@ -258,7 +266,6 @@
 "outputs": [],
 "source": [
 "from smolagents import (CodeAgent, DuckDuckGoSearchTool, InferenceClientModel)\n",
-"from opentelemetry import trace\n",
 "\n",
 "search_tool = DuckDuckGoSearchTool()\n",
 "agent = CodeAgent(\n",
@@ -266,12 +273,25 @@
 "    model=InferenceClientModel()\n",
 ")\n",
 "\n",
-"with …
-"…
-"    span…
-"…
-"\n",
-"    agent.run(\"What is the capital of Germany?\")"
+"with langfuse.start_as_current_span(\n",
+"    name=\"Smolagent-Trace\",\n",
+"    ) as span:\n",
+"    \n",
+"    # Run your application here\n",
+"    response = agent.run(\"What is the capital of Germany?\")\n",
+"    \n",
+"    # Pass additional attributes to the span\n",
+"    span.update_trace(\n",
+"        input=\"What is the capital of Germany?\",\n",
+"        output=response,\n",
+"        user_id=\"smolagent-user-123\",\n",
+"        session_id=\"smolagent-session-123456789\",\n",
+"        tags=[\"city-question\", \"testing-agents\"],\n",
+"        metadata={\"email\": \"[email protected]\"},\n",
+"    )\n",
+"    \n",
+"# Flush events in short-lived applications\n",
+"langfuse.flush()"
 ]
 },
 {
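
The pattern in this hunk generalizes beyond smolagents: any v3 span opened with `start_as_current_span` can promote trace-level attributes via `update_trace`. A stripped-down sketch, where `answer_question` is a hypothetical stand-in for your application call and the identifiers are illustrative:

```python
from langfuse import get_client

langfuse = get_client()  # credentials come from the LANGFUSE_* env vars

def answer_question(q: str) -> str:
    # Hypothetical application logic; replace with your agent/LLM call
    return "Berlin"

with langfuse.start_as_current_span(name="my-trace") as span:
    question = "What is the capital of Germany?"
    result = answer_question(question)

    # update_trace() sets trace-level fields from inside the span
    span.update_trace(
        input=question,
        output=result,
        user_id="user-123",        # illustrative identifiers
        session_id="session-456",
        tags=["example"],
    )

langfuse.flush()  # ensure delivery in short-lived scripts
```
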
@@ -289,7 +309,7 @@
 "\n",
 "If your agent is embedded into a user interface, you can record direct user feedback (like a thumbs-up/down in a chat UI). Below is an example using [Gradio](https://gradio.app/) to embed a chat with a simple feedback mechanism.\n",
 "\n",
-"In the code snippet below, when a user sends a chat message, we capture the …
+"In the code snippet below, when a user sends a chat message, we capture the trace in Langfuse. If the user likes/dislikes the last answer, we attach a score to the trace."
 ]
 },
 {
@@ -299,26 +319,25 @@
 "outputs": [],
 "source": [
 "import gradio as gr\n",
-"from opentelemetry.trace import format_trace_id\n",
 "from smolagents import (CodeAgent, InferenceClientModel)\n",
-"from langfuse import Langfuse\n",
+"from langfuse import get_client\n",
+"\n",
+"langfuse = get_client()\n",
 "\n",
-"langfuse = Langfuse()\n",
 "model = InferenceClientModel()\n",
 "agent = CodeAgent(tools=[], model=model, add_base_tools=True)\n",
 "\n",
-"…
+"trace_id = None\n",
 "\n",
 "def respond(prompt, history):\n",
-"    with …
+"    with langfuse.start_as_current_span(\n",
+"        name=\"Smolagent-Trace\"):\n",
+"        \n",
+"        # Run your application here\n",
 "        output = agent.run(prompt)\n",
 "\n",
-"…
-"…
-"        trace_id = span_context.trace_id\n",
-"        global formatted_trace_id\n",
-"        formatted_trace_id = str(format_trace_id(trace_id))\n",
-"        langfuse.trace(id=formatted_trace_id, input=prompt, output=output)\n",
+"        global trace_id\n",
+"        trace_id = langfuse.get_current_trace_id()\n",
 "\n",
 "    history.append({\"role\": \"assistant\", \"content\": str(output)})\n",
 "    return history\n",
@@ -326,16 +345,16 @@
 "def handle_like(data: gr.LikeData):\n",
 "    # For demonstration, we map user feedback to a 1 (like) or 0 (dislike)\n",
 "    if data.liked:\n",
-"        langfuse.score(\n",
+"        langfuse.create_score(\n",
 "            value=1,\n",
 "            name=\"user-feedback\",\n",
-"            trace_id=formatted_trace_id\n",
+"            trace_id=trace_id\n",
 "        )\n",
 "    else:\n",
-"        langfuse.score(\n",
+"        langfuse.create_score(\n",
 "            value=0,\n",
 "            name=\"user-feedback\",\n",
-"            trace_id=formatted_trace_id\n",
+"            trace_id=trace_id\n",
 "        )\n",
 "\n",
 "with gr.Blocks() as demo:\n",
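
For reference, the migration here is `langfuse.score(...)` → `langfuse.create_score(...)`, and the trace ID now comes from `langfuse.get_current_trace_id()` instead of hand-formatting an OpenTelemetry span context. Outside the Gradio scaffolding, the feedback flow reduces to this sketch (the score name and 0/1 value scheme are the demo's convention, not an API requirement):

```python
from langfuse import get_client

langfuse = get_client()

# 1) While producing the answer, remember which trace it belongs to
with langfuse.start_as_current_span(name="chat-turn"):
    answer = "Berlin"  # placeholder for agent.run(prompt)
    trace_id = langfuse.get_current_trace_id()

# 2) Later, when the user clicks thumbs-up/down, score that same trace
langfuse.create_score(
    name="user-feedback",
    value=1,            # 1 = like, 0 = dislike in this demo
    trace_id=trace_id,
)
langfuse.flush()
```
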
@@ -470,8 +489,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from langfuse import Langfuse\n",
-"langfuse = Langfuse()\n",
+"from langfuse import get_client\n",
+"langfuse = get_client()\n",
 "\n",
 "langfuse_dataset_name = \"gsm8k_dataset_huggingface\"\n",
 "\n",
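
The cells that actually build this dataset are unchanged by the PR; for orientation, populating it would look roughly like the sketch below. The `create_dataset`/`create_dataset_item` calls are the Langfuse SDK's dataset API as I understand it (not shown in this diff), and the item content is an illustrative GSM8K-style example:

```python
from langfuse import get_client

langfuse = get_client()
langfuse_dataset_name = "gsm8k_dataset_huggingface"

# Create the dataset container in Langfuse
langfuse.create_dataset(
    name=langfuse_dataset_name,
    description="GSM8K benchmark dataset uploaded from Huggingface",
)

# Upload items; 'input' mirrors the {"text": ...} shape the run loop reads later
langfuse.create_dataset_item(
    dataset_name=langfuse_dataset_name,
    input={"text": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"},
    expected_output={"text": "72"},
)
```
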
@@ -517,7 +536,7 @@
 "#### Running the Agent on the Dataset\n",
 "\n",
 "We define a helper function `run_smolagent()` that:\n",
-"1. Starts …
+"1. Starts a Langfuse span\n",
 "2. Runs our agent on the prompt\n",
 "3. Records the trace ID in Langfuse\n",
 "\n",
@@ -532,6 +551,10 @@
 "source": [
 "from opentelemetry.trace import format_trace_id\n",
 "from smolagents import (CodeAgent, InferenceClientModel, LiteLLMModel)\n",
+"from langfuse import get_client\n",
+" \n",
+"langfuse = get_client()\n",
+"\n",
 "\n",
 "# Example: using InferenceClientModel or LiteLLMModel to access openai, anthropic, gemini, etc. models:\n",
 "model = InferenceClientModel()\n",
@@ -542,52 +565,39 @@
 "    add_base_tools=True\n",
 ")\n",
 "\n",
+"dataset_name = \"gsm8k_dataset_huggingface\"\n",
+"current_run_name = \"smolagent-notebook-run-01\" # Identifies this specific evaluation run\n",
+" \n",
+"# Assume 'run_smolagent' is your instrumented application function\n",
 "def run_smolagent(question):\n",
-"    with …
-"…
-"…
-"\n",
-"…
-"…
-"…
-"…
-"\n",
-"    langfuse_trace = langfuse.trace(\n",
-"        id=formatted_trace_id, \n",
-"        input=question, \n",
-"        output=output\n",
+"    with langfuse.start_as_current_generation(name=\"qna-llm-call\") as generation:\n",
+"        # Simulate LLM call\n",
+"        result = agent.run(question)\n",
+"        \n",
+"        # Update the trace with the input and output\n",
+"        generation.update_trace(\n",
+"            input=question,\n",
+"            output=result,\n",
 "        )\n",
-"…
-…
-…
-…
-…
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"dataset = langfuse.get_dataset(langfuse_dataset_name)\n",
-"\n",
-"# Run our agent against each dataset item (limited to first 10 above)\n",
+"        \n",
+"        return result\n",
+"    \n",
+"dataset = langfuse.get_dataset(name=dataset_name) # Fetch your pre-populated dataset\n",
+" \n",
 "for item in dataset.items:\n",
-"…
-"\n",
-"…
-"…
-"…
-"…
-"…
-"…
-"\n",
-"…
-"…
-"…
-"…
-"        comment=\"This is a comment\"\n",
-"    )\n",
-"\n",
-"# Flush data to ensure all telemetry is sent\n",
-"langfuse.flush()"
+"    \n",
+"    # Use the item.run() context manager\n",
+"    with item.run(\n",
+"        run_name=current_run_name,\n",
+"        run_metadata={\"model_provider\": \"Hugging Face\", \"temperature_setting\": 0.7},\n",
+"        run_description=\"Evaluation run for GSM8K dataset\"\n",
+"    ) as root_span: # root_span is the root span of the new trace for this item and run.\n",
+"        # All subsequent langfuse operations within this block are part of this trace.\n",
+"        \n",
+"        # Call your application logic\n",
+"        generated_answer = run_smolagent(question=item.input[\"text\"])\n",
+"        \n",
+"        print(item.input)"
 ]
 },
 {
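
One thing the rewritten loop drops is the per-item scoring the v2 code did (the removed `comment="This is a comment"` call). To restore it under v3, you can score each item's trace from inside the `item.run()` block. A sketch, where `simple_evaluation` is a hypothetical metric and items are assumed to carry an `expected_output["text"]` field as uploaded above:

```python
def simple_evaluation(output, expected) -> int:
    # Hypothetical check: does the expected answer appear in the agent output?
    return int(str(expected).strip() in str(output))

for item in dataset.items:
    with item.run(run_name=current_run_name) as root_span:
        generated_answer = run_smolagent(question=item.input["text"])

        # Attach a score to this item's trace within the run
        langfuse.create_score(
            name="exact_match",
            value=simple_evaluation(generated_answer, item.expected_output["text"]),
            trace_id=langfuse.get_current_trace_id(),
            comment="automated check",
        )

langfuse.flush()
```
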
@@ -625,7 +635,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "…
+"display_name": ".venv",
 "language": "python",
 "name": "python3"
 },