Step-by-Step Integration Walkthrough
This guide helps you think through and decide what to define and log in Deepchecks so your evaluation goals are achievable. We'll follow a realistic example from raw telemetry data to a fully structured Deepchecks integration.
Example Workflow: Customer Support Agent
Let's follow a complex but realistic workflow: a customer support agent that handles order inquiries with multiple LLM calls.
User Question: "What's the status of my recent order and when will it ship? Also, can you give me a summary of all my recent orders?"
Complete Agent Workflow:
- Tool Use → Query database for customer's orders
- Q&A → Answer specific question about order status
- Summarization → Create summary of all recent orders

Raw Telemetry Data (Before Integration)
Here's what your current logging might look like—OpenTelemetry spans from your LLM application:
[
{
"span_id": "tool_call_001",
"parent_span_id": "user_request_001",
"operation_name": "llm.chat.completions",
"start_time": 1715000000.0,
"end_time": 1715000001.2,
"attributes": {
"gen_ai.system": "OpenAI",
"gen_ai.request.model": "gpt-4o-mini",
"gen_ai.prompt.0.role": "system",
"gen_ai.prompt.0.content": "You are a function-calling assistant. Use the provided tools to help answer user questions.\n\nAvailable tools:\n- get_customer_orders: Retrieve all orders for a customer\n Parameters:\n - customer_id (string, required): The customer's unique identifier\n Returns: List of orders with order_id, status, tracking, delivery info, items, and total",
"gen_ai.prompt.1.role": "user",
"gen_ai.prompt.1.content": "What's the status of my recent order and when will it ship? Also, can you give me a summary of all my recent orders? My customer ID is 12345.",
"gen_ai.completion.0.content": "I'll help you check your order status and provide a summary of your recent orders. Let me look up your order information.",
"gen_ai.completion.0.tool_calls": "[{\"function\": {\"name\": \"get_customer_orders\", \"arguments\": \"{\\\"customer_id\\\": \\\"12345\\\"}\"}}]"
}
},
{
"span_id": "db_query_001",
"parent_span_id": "tool_call_001",
"operation_name": "database.query",
"start_time": 1715000001.3,
"end_time": 1715000001.8,
"attributes": {
"db.statement": "SELECT * FROM orders WHERE customer_id = 12345 AND created_at > '2024-01-01' ORDER BY created_at DESC",
"db.operation": "select",
"user_id": "customer_12345"
},
"result": {
"orders": [
{
"order_id": "ORD-789",
"status": "shipped",
"tracking": "1Z999AA1234567890",
"estimated_delivery": "2024-05-15",
"items": "Wireless Headphones, Phone Case",
"total": "$89.99"
},
{
"order_id": "ORD-654",
"status": "delivered",
"delivered_date": "2024-04-20",
"items": "Laptop Stand",
"total": "$45.00"
},
{
"order_id": "ORD-321",
"status": "delivered",
"delivered_date": "2024-03-15",
"items": "USB Cable, Screen Protector",
"total": "$25.99"
}
]
}
},
{
"span_id": "qa_response_001",
"parent_span_id": "user_request_001",
"operation_name": "llm.chat.completions",
"start_time": 1715000002.0,
"end_time": 1715000003.2,
"attributes": {
"gen_ai.system": "OpenAI",
"gen_ai.request.model": "gpt-4o-mini",
"gen_ai.prompt.0.role": "system",
"gen_ai.prompt.0.content": "You are a helpful customer service agent. Answer the customer's specific question about their order status using the provided order information. Be accurate and helpful.\n\nInstructions:\n- Focus on answering the specific question asked\n- Include relevant order details (ID, status, tracking, delivery date)\n- Be friendly and professional\n- If the order is shipped, provide tracking information",
"gen_ai.prompt.1.role": "user",
"gen_ai.prompt.1.content": "Customer question: 'What's the status of my recent order and when will it ship?'\n\nOrder information: Order ORD-789: status=shipped, tracking=1Z999AA1234567890, estimated_delivery=2024-05-15, items=Wireless Headphones, Phone Case, total=$89.99\n\nAnswer the customer's question:",
"gen_ai.completion.0.content": "Great news! Your most recent order ORD-789 has already shipped and is on its way to you. Here are the details:\n\n• Order ID: ORD-789\n• Status: Shipped\n• Items: Wireless Headphones, Phone Case\n• Tracking Number: 1Z999AA1234567890\n• Estimated Delivery: May 15, 2024\n\nYou can track your package using the tracking number above."
}
},
{
"span_id": "summarization_001",
"parent_span_id": "user_request_001",
"operation_name": "llm.chat.completions",
"start_time": 1715000003.5,
"end_time": 1715000005.0,
"attributes": {
"gen_ai.system": "OpenAI",
"gen_ai.request.model": "gpt-4o-mini",
"gen_ai.prompt.0.role": "system",
"gen_ai.prompt.0.content": "You are a helpful assistant that creates concise summaries. Summarize the customer's recent order history in a clear, organized way that highlights key information.\n\nGuidelines:\n- Create a structured summary with clear sections\n- Include order counts and total spending\n- Show current status for each order\n- Highlight any orders in transit\n- Keep the summary concise but informative\n- Use bullet points or numbered lists for clarity",
"gen_ai.prompt.1.role": "user",
"gen_ai.prompt.1.content": "Please summarize this customer's recent order history:\n\nOrder ORD-789: status=shipped, tracking=1Z999AA1234567890, estimated_delivery=2024-05-15, items=Wireless Headphones, Phone Case, total=$89.99\nOrder ORD-654: status=delivered, delivered_date=2024-04-20, items=Laptop Stand, total=$45.00\nOrder ORD-321: status=delivered, delivered_date=2024-03-15, items=USB Cable, Screen Protector, total=$25.99\n\nCreate a summary:",
"gen_ai.completion.0.content": "Here's a summary of your recent orders:\n\n**Recent Order Activity (3 orders, $160.98 total)**\n\n1. **ORD-789** (Current) - $89.99\n - Wireless Headphones, Phone Case\n - Status: Shipped, arriving May 15, 2024\n\n2. **ORD-654** - $45.00\n - Laptop Stand\n - Delivered April 20, 2024\n\n3. **ORD-321** - $25.99\n - USB Cable, Screen Protector\n - Delivered March 15, 2024\n\nYou have one order currently in transit and two successfully delivered orders from the past few months."
}
}
]
Now let's walk through the decision-making process to map this to Deepchecks.
Step 1: Define Your Application & Version
Decision: What logical product does this data belong to?
Your Choice:
- Application:
"CustomerSupportAgent"
- Version:
"v3.0-multi-step"
(indicates this multi-step approach)
Why This Matters: Applications group use cases. Versions let you compare different iterations (single-step vs multi-step, different models, etc.) of the same use case.
Implementation:
# Create the application up front; the same name is passed as app_name when the interactions are logged later in this guide.
dc_client.create_application("CustomerSupportAgent", ApplicationType.QA)
📖 Reference: Application setup
Step 2: Choose Interaction Types
Decision: What fundamental tasks does your pipeline perform?
Looking at our workflow, we have three distinct LLM interactions:
- Tool Use → LLM decides to call database function
- Q&A → LLM answers specific question about order status
- Summarization → LLM creates summary of all recent orders
Your Choice: Map each LLM call to its appropriate interaction type:
- Tool calling →
"tool_use"
- Specific question answering →
"q&a"
- Order history summary →
"summarization"
Why Separate Interactions? Separating them lets you evaluate each aspect of the workflow independently: tool-use accuracy, Q&A relevance, and summarization quality are distinct evaluation goals.
📖 Reference: Supported interaction types
Step 3: Map Evaluation Goals → Data Fields
Decision: What aspects do you want to measure for each interaction?
Let's think through evaluation goals for each interaction type:
Tool-Use Interaction:
Evaluation Goal | Required Deepchecks Fields |
---|---|
Tool Selection Accuracy | input + action (function call) + tool_response (DB results) |
Function Call Correctness | action (function call) + tool_response (DB results) |
Q&A Interaction:
Evaluation Goal | Required Deepchecks Fields |
---|---|
Answer Accuracy | input (question) + information_retrieval (DB results) + output |
Groundedness | information_retrieval + output |
Relevance | input + output |
Summarization Interaction:
Evaluation Goal | Required Deepchecks Fields |
---|---|
Summary Quality | input (DB results) + output (summary) |
Conciseness | output |
Coverage | input + output + expected_output (if available) |
Step 4: Decide on Sessions & Interaction Granularity
Decision: How do you group and structure your data?
Session Strategy: All three interactions belong to one session since they're part of handling a single customer request.
Session Structure:
{
"session_id": "customer_12345_chat_20240507_001",
// Three separate interactions within this session:
// 1. Tool-use interaction
// 2. Q&A interaction
// 3. Summarization interaction
}
Why One Session? These interactions are causally linked—the tool-use enables both Q&A and summarization. Session-level analysis will show the complete workflow performance.
📖 Reference: Sessions
Step 5: Annotation and Reference Answers
Decision: Do we have reference answers or annotations for our data?
Properties are a great way to define evaluation criteria for LLM pipelines, but when labeled reference answers are available we have an additional, more powerful evaluation tool: Deepchecks can measure the agreement between the output the LLM actually generated and the reference expected_output.
Tool-Use Interaction:
{
"expected_output": "get_customer_orders called with customer_id=12345",
// Can validate correct function selection and parameters
}
Q&A Interaction:
{
"expected_output": "Order ORD-789 is shipped, tracking: 1Z999AA1234567890, delivery: May 15",
// For specific order status questions
}
Summarization Interaction:
{
"user_annotation": "Good",
"user_annotation_reason": "Concise summary with key details, well-organized, accurate totals"
}
📖 Reference: Annotation types
Step 6: Select Properties to Calculate
Decision: Which built-in properties match your evaluation goals?
We'll define these properties for each of the individual interaction types. Once the properties are defined, they will be computed for each of the interactions belonging to these types.
Tool-Use Properties:
- Planning Efficiency - Evaluates tool selection quality
- Custom user-value property - Tool call success rate
Q&A Properties:
- Grounded in Context - No hallucinated order details
- Retrieval Relevance - Order data relevance to question
- Information Density - Useful, complete answers
Summarization Properties:
- Conciseness - Summary is appropriately brief
- Coverage - Summary includes key information
📖 Reference: Properties activation
Step 7: End to End Evaluation
Decision: What makes an Interaction & Session a "good" one?
See Configuring Estimated Annotations for the next part of this example.
Complete Integration Example
Final Deepchecks Payload (3 Interactions):
from deepchecks_llm_client.data_types import LogInteraction
session_id = "customer_12345_chat_20240507_001"
# Interaction 1: Tool Use
# The system prompt and the user message are named once so that `input` and
# `full_prompt` are built from the same text.
tool_system_prompt = "You are a function-calling assistant. Use the provided tools to help answer user questions.\n\nAvailable tools:\n- get_customer_orders: Retrieve all orders for a customer\n Parameters:\n - customer_id (string, required): The customer's unique identifier\n Returns: List of orders with order_id, status, tracking, delivery info, items, and total"
tool_user_message = "What's the status of my recent order and when will it ship? Also, can you give me a summary of all my recent orders? My customer ID is 12345."
tool_interaction = LogInteraction(
    interaction_type="tool_use",
    session_id=session_id,
    user_interaction_id="customer_12345_tool_001",
    input=tool_user_message,
    full_prompt=tool_system_prompt + "\n\nUser: " + tool_user_message,
    # The function call the LLM chose, and what the tool returned.
    action="get_customer_orders(customer_id='12345')",
    tool_response="3 orders found: ORD-789 (shipped), ORD-654 (delivered), ORD-321 (delivered)",
    output="I'll help you check your order status and provide a summary of your recent orders. Let me look up your order information.",
    expected_output="get_customer_orders called with customer_id=12345",
    # Covers the LLM call start through the end of the database query.
    started_at=1715000000.0,
    finished_at=1715000001.8,
)
# Interaction 2: Q&A
# The question and the retrieved order record are named once and reused for
# `input`, `information_retrieval`, and the assembled `full_prompt`.
qa_question = "What's the status of my recent order and when will it ship?"
qa_order_record = "Order ORD-789: status=shipped, tracking=1Z999AA1234567890, estimated_delivery=2024-05-15, items=Wireless Headphones, Phone Case, total=$89.99"
qa_system_prompt = "You are a helpful customer service agent. Answer the customer's specific question about their order status using the provided order information. Be accurate and helpful.\n\nInstructions:\n- Focus on answering the specific question asked\n- Include relevant order details (ID, status, tracking, delivery date)\n- Be friendly and professional\n- If the order is shipped, provide tracking information"
qa_interaction = LogInteraction(
    interaction_type="q&a",
    session_id=session_id,
    user_interaction_id="customer_12345_qa_001",
    input=qa_question,
    full_prompt=(
        qa_system_prompt
        + "\n\nCustomer question: '" + qa_question + "'"
        + "\n\nOrder information: " + qa_order_record
        + "\n\nAnswer the customer's question:"
    ),
    information_retrieval=[qa_order_record],
    output="Great news! Your most recent order ORD-789 has already shipped and is on its way to you. Here are the details:\n\n• Order ID: ORD-789\n• Status: Shipped\n• Items: Wireless Headphones, Phone Case\n• Tracking Number: 1Z999AA1234567890\n• Estimated Delivery: May 15, 2024\n\nYou can track your package using the tracking number above.",
    started_at=1715000002.0,
    finished_at=1715000003.2,
)
# Interaction 3: Summarization
# The order lines are the summarizer's input; the full prompt wraps that same
# text between the system prompt and the closing instruction.
summary_order_lines = "\n".join([
    "Order ORD-789: status=shipped, tracking=1Z999AA1234567890, estimated_delivery=2024-05-15, items=Wireless Headphones, Phone Case, total=$89.99",
    "Order ORD-654: status=delivered, delivered_date=2024-04-20, items=Laptop Stand, total=$45.00",
    "Order ORD-321: status=delivered, delivered_date=2024-03-15, items=USB Cable, Screen Protector, total=$25.99",
])
summary_system_prompt = "You are a helpful assistant that creates concise summaries. Summarize the customer's recent order history in a clear, organized way that highlights key information.\n\nGuidelines:\n- Create a structured summary with clear sections\n- Include order counts and total spending\n- Show current status for each order\n- Highlight any orders in transit\n- Keep the summary concise but informative\n- Use bullet points or numbered lists for clarity"
summarization_interaction = LogInteraction(
    interaction_type="summarization",
    session_id=session_id,
    user_interaction_id="customer_12345_summary_001",
    input=summary_order_lines,
    full_prompt=(
        summary_system_prompt
        + "\n\nPlease summarize this customer's recent order history:\n\n"
        + summary_order_lines
        + "\n\nCreate a summary:"
    ),
    output="Here's a summary of your recent orders:\n\n**Recent Order Activity (3 orders, $160.98 total)**\n\n1. **ORD-789** (Current) - $89.99\n - Wireless Headphones, Phone Case\n - Status: Shipped, arriving May 15, 2024\n\n2. **ORD-654** - $45.00\n - Laptop Stand\n - Delivered April 20, 2024\n\n3. **ORD-321** - $25.99\n - USB Cable, Screen Protector\n - Delivered March 15, 2024\n\nYou have one order currently in transit and two successfully delivered orders from the past few months.",
    started_at=1715000003.5,
    finished_at=1715000005.0,
)
# Log all interactions
# Every interaction goes to the same application, version and environment,
# so those arguments are collected once and reused for each call.
log_target = {
    "app_name": "CustomerSupportAgent",
    "version_name": "v3.0-multi-step",
    "env_type": EnvType.EVAL,
}
for interaction in (tool_interaction, qa_interaction, summarization_interaction):
    dc_client.log_interaction(**log_target, **interaction.model_dump())
Key Decision Summary
Step | Your Decision | Impact |
---|---|---|
Application/Version | CustomerSupportAgent v3.0-multi-step | Groups related data, enables version comparison |
Interaction Decomposition | 3 separate interactions (Tool-Use, Q&A, Summarization) | Enables specialized evaluation per interaction type |
Session Strategy | Single session for complete workflow | Enables end-to-end workflow analysis |
Field Mapping | Different fields per interaction type | Enables appropriate properties for each task |
Property Definition & Evaluation | Tool accuracy + Q&A relevance + Summary quality | Comprehensive evaluation across all workflow steps |
Appendix: Transformation Logic
Code Example for Converting OpenTelemetry to Deepchecks:
def transform_telemetry_to_deepchecks(spans):
    """Transform OpenTelemetry spans into multiple Deepchecks interactions.

    Args:
        spans: List of span dicts shaped like the raw telemetry example:
            LLM spans keep ``gen_ai.*`` keys under ``attributes``, and the
            database span has ``attributes["user_id"]`` plus
            ``result["orders"]``.

    Returns:
        A list of three ``LogInteraction`` objects (tool use, Q&A,
        summarization) that share one ``session_id``.

    Raises:
        StopIteration: If one of the expected spans is not present.
    """
    # Locate spans by content instead of list position; span order is not
    # something the exporter guarantees.
    # The tool-call attribute is namespaced ("gen_ai.completion.0.tool_calls"),
    # so match on the key suffix rather than on an exact "tool_calls" key.
    tool_span = next(
        s for s in spans
        if any(key.endswith("tool_calls") for key in s["attributes"])
    )
    db_span = next(s for s in spans if "result" in s)
    qa_span = next(s for s in spans if s["span_id"] == "qa_response_001")
    summary_span = next(s for s in spans if s["span_id"] == "summarization_001")

    # The telemetry user_id already carries a "customer_" prefix
    # (e.g. "customer_12345"); strip it so IDs are not double-prefixed and
    # the tool argument is the bare customer ID.
    user_id = db_span["attributes"]["user_id"]
    prefix = "customer_"
    customer_id = user_id[len(prefix):] if user_id.startswith(prefix) else user_id

    # NOTE: the date/sequence suffix is fixed for this walkthrough example.
    session_id = f"customer_{customer_id}_chat_20240507_001"
    orders = db_span["result"]["orders"]
    interactions = []

    # 1. Tool-Use Interaction
    tool_attrs = tool_span["attributes"]
    tool_interaction = LogInteraction(
        input=tool_attrs["gen_ai.prompt.1.content"],
        full_prompt=f"{tool_attrs['gen_ai.prompt.0.content']}\n\nUser: {tool_attrs['gen_ai.prompt.1.content']}",
        action=f"get_customer_orders(customer_id='{customer_id}')",
        tool_response=format_db_summary(db_span["result"]),
        output=tool_attrs["gen_ai.completion.0.content"],
        expected_output=f"get_customer_orders called with customer_id={customer_id}",
        user_interaction_id=f"customer_{customer_id}_tool_001",
        session_id=session_id,
        started_at=tool_span["start_time"],
        finished_at=db_span["end_time"],  # Include DB query time
        interaction_type="tool_use"
    )
    interactions.append(tool_interaction)

    # 2. Q&A Interaction
    qa_attrs = qa_span["attributes"]
    qa_interaction = LogInteraction(
        # The Q&A step only sees the order-status part of the user request.
        input="What's the status of my recent order and when will it ship?",
        full_prompt=qa_attrs["gen_ai.prompt.0.content"] + "\n\n" + qa_attrs["gen_ai.prompt.1.content"],
        # The query sorts by created_at DESC, so the first order is the most recent.
        information_retrieval=[format_recent_order(orders[0])],
        output=qa_attrs["gen_ai.completion.0.content"],
        user_interaction_id=f"customer_{customer_id}_qa_001",
        session_id=session_id,
        started_at=qa_span["start_time"],
        finished_at=qa_span["end_time"],
        interaction_type="q&a"
    )
    interactions.append(qa_interaction)

    # 3. Summarization Interaction
    summary_attrs = summary_span["attributes"]
    summarization_interaction = LogInteraction(
        input=format_all_orders_for_summary(orders),
        full_prompt=summary_attrs["gen_ai.prompt.0.content"] + "\n\n" + summary_attrs["gen_ai.prompt.1.content"],
        output=summary_attrs["gen_ai.completion.0.content"],
        user_interaction_id=f"customer_{customer_id}_summary_001",
        session_id=session_id,
        started_at=summary_span["start_time"],
        finished_at=summary_span["end_time"],
        interaction_type="summarization"
    )
    interactions.append(summarization_interaction)
    return interactions
def format_db_summary(db_result):
    """Summarize a query result as '<n> orders found: <id> (<status>), ...'.

    Args:
        db_result: Dict with an ``"orders"`` list; every order must have
            ``"order_id"`` and ``"status"`` keys.

    Returns:
        A one-line summary string listing each order ID with its status.
    """
    orders = db_result["orders"]
    # Build the labels outside the outer f-string: reusing the same quote
    # character inside a nested f-string is a SyntaxError before Python 3.12.
    labels = ", ".join(f"{o['order_id']} ({o['status']})" for o in orders)
    return f"{len(orders)} orders found: {labels}"
def format_recent_order(order):
    """Render one order dict as a single 'Order <id>: key=value, ...' line.

    ``tracking`` and ``estimated_delivery`` are optional and fall back to
    ``N/A``; ``order_id``, ``status``, ``items`` and ``total`` are required.
    """
    order_id = order["order_id"]
    pairs = (
        ("status", order["status"]),
        ("tracking", order.get("tracking", "N/A")),
        ("estimated_delivery", order.get("estimated_delivery", "N/A")),
        ("items", order["items"]),
        ("total", order["total"]),
    )
    details = ", ".join(f"{label}={value}" for label, value in pairs)
    return f"Order {order_id}: {details}"
def format_all_orders_for_summary(orders):
    """Render every order as one 'Order <id>: key=value, ...' line.

    Each line lists the fields the order actually has, in their stored
    order, so delivered orders keep ``delivered_date`` and are not padded
    with ``tracking=N/A`` / ``estimated_delivery=N/A`` placeholders. This
    keeps the summarizer's logged input aligned with the dates its output
    refers to.

    Args:
        orders: Iterable of order dicts; each must have an ``"order_id"``.

    Returns:
        The order lines joined with newlines (empty string for no orders).
    """
    lines = []
    for order in orders:
        details = ", ".join(
            f"{field}={value}"
            for field, value in order.items()
            if field != "order_id"  # the ID already leads the line
        )
        lines.append(f"Order {order['order_id']}: {details}")
    return "\n".join(lines)
Updated 17 days ago