Example: the same span scored with both evaluators —
Code eval: score: 1 · label: "valid"
LLM judge: score: 0 · label: "incorrect"
explanation: "The response fails to include..."
label: "incorrect"
explanation: "The response fails to include a budget breakdown,
which is a core requirement. The agent provides destination
info and local recommendations but omits all cost estimates,
making the plan incomplete for a user who asked specifically
about budget travel to Tokyo."
pip install arize-phoenix openinference-instrumentation-crewai
from phoenix.otel import register

# Register Phoenix's OpenTelemetry tracer provider for this project and
# turn on auto-instrumentation of supported libraries (CrewAI here, via
# the openinference-instrumentation-crewai package installed above).
tracer_provider = register(
    project_name="my-project",
    auto_instrument=True,
)
npm install @arizeai/openinference-vercel
// instrumentation.ts
// Next.js/Vercel instrumentation hook — runs once at server startup.
export function register() {
  // Install an OTel span processor that exports spans in the
  // OpenInference format. NOTE(review): `registerOTel` and `exporter`
  // are defined/imported elsewhere in this file — confirm their setup.
  registerOTel({ spanProcessors: [
    new OpenInferenceSimpleSpanProcessor({ exporter })
  ]})
}
// Telemetry is opt-in per call in the Vercel AI SDK: each generateText
// call must enable it explicitly to emit spans.
generateText({ experimental_telemetry: { isEnabled: true } })
from phoenix.evals import create_evaluator
@create_evaluator(
    name="has-answer",
    kind="code",
    direction="maximize"
)
def has_answer(output: str) -> bool:
    """Code evaluator: pass iff the response contains any non-whitespace text."""
    # A stripped string is truthy exactly when its length is > 0.
    return bool(output.strip())
@create_evaluator(name="valid-json", kind="code", direction="maximize")
def valid_json(output: str) -> bool:
    """Code evaluator: pass iff *output* parses as JSON."""
    try:
        json.loads(output)
    except json.JSONDecodeError:
        return False
    return True
from phoenix.evals.llm import LLM
# Configure the judge model used by LLM-based evaluators below.
llm = LLM(
    provider="openai",
    model="gpt-4o",
    # NOTE(review): presumably selects which SDK/client implementation to
    # use for the provider — confirm against the phoenix.evals.llm docs.
    client="openai",
)
from phoenix.evals.metrics import CorrectnessEvaluator
# Built-in correctness evaluator, bound to the judge LLM configured above.
correctness_eval = CorrectnessEvaluator(llm=llm)
# Print the evaluator's self-description (name, prompt, etc. — exact
# fields come from phoenix.evals; verify against its docs).
print(correctness_eval.describe())
from phoenix.client import Client
# Connect to Phoenix (endpoint/credentials presumably come from the
# environment — confirm against phoenix.client docs).
client = Client()
# Pull all recorded spans for the project into a pandas DataFrame.
spans_df = client.spans.get_spans_dataframe(
    project_identifier="my-project"
)
# Keep only AGENT spans; other span kinds (LLM, TOOL, ...) are dropped
# so the evaluator runs once per agent invocation, not per sub-span.
agent_spans = spans_df[spans_df['span_kind'] == 'AGENT']
# Map the evaluator's expected arguments ("input", "output") onto
# span-attribute paths, so it can be applied directly to DataFrame rows.
bound_eval = bind_evaluator(
    evaluator=correctness_eval,
    input_mapping={
        "input": "attributes.input.value",
        "output": "attributes.output.value",
    }
)
# Run the bound evaluator over every agent span.
results = evaluate_dataframe(agent_spans, [bound_eval])
# Convert the results to annotation rows and log them back onto the
# corresponding spans in Phoenix.
Client().spans.log_span_annotations_dataframe(
    to_annotation_dataframe(results)
)
"You are an expert evaluator judging whether a travel planner agent's response is correct. The agent must produce: (1) essential info, (2) a budget breakdown, and (3) local recommendations."
[BEGIN DATA]
************
[User Input]:
{{input}}
************
[Travel Plan]:
{{output}}
************
[END DATA]
"Is the output correct or incorrect?"
# Prompt template for the custom travel-plan correctness judge.
# {{input}}/{{output}} are filled per row via the evaluator's input
# mapping; the bracketed [criteria]/[examples] placeholders stand in for
# slide content elided here.
CUSTOM_CORRECTNESS_TEMPLATE = """
You are an expert evaluator judging whether
a travel planner agent's response is correct...
CORRECT — [criteria]
INCORRECT — [criteria]
[examples]
[BEGIN DATA]
[User Input]: {{input}}
[Travel Plan]: {{output}}
[END DATA]
Is the output correct or incorrect?
"""
# Binary classification evaluator driven by the custom template: the
# judge must answer with one of the labels in `choices`, which is then
# mapped to the numeric score.
custom_eval = ClassificationEvaluator(
    name="travel-plan-correctness",
    llm=llm,
    prompt_template=CUSTOM_CORRECTNESS_TEMPLATE,
    choices={"correct": 1, "incorrect": 0}
)
Follow me on BlueSky:
🦋 @seldo.com
These slides: