"""A chain for evaluating ReAct style agents.
This chain is used to evaluate ReAct style agents by reasoning about
the sequence of actions taken and their outcomes. It uses a language model
chain (LLMChain) to generate the reasoning and scores.
"""
import re
from typing import (
Any,
Dict,
List,
Optional,
Sequence,
Tuple,
TypedDict,
Union,
cast,
)
from langchain_core.agents import AgentAction
from langchain_core.exceptions import OutputParserException
from langchain_core.language_models import BaseLanguageModel
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.pydantic_v1 import Extra, Field
from langchain_core.tools import BaseTool
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.llm import LLMChain
from langchain.evaluation.agents.trajectory_eval_prompt import (
EVAL_CHAT_PROMPT,
TOOL_FREE_EVAL_CHAT_PROMPT,
)
from langchain.evaluation.schema import AgentTrajectoryEvaluator, LLMEvalChain
class TrajectoryEval(TypedDict):
    """A dictionary containing the score and reasoning for a trajectory."""
score: float
"""The score for the trajectory, normalized from 0 to 1."""
reasoning: str
"""The reasoning for the score."""
class TrajectoryOutputParser(BaseOutputParser):
"""Trajectory output parser."""
@property
def _type(self) -> str:
return "agent_trajectory"
    def parse(self, text: str) -> TrajectoryEval:
"""Parse the output text and extract the score and reasoning.
Args:
text (str): The output text to parse.
Returns:
            TrajectoryEval: A dictionary containing the normalized score and reasoning.
Raises:
OutputParserException: If the score is not found in the output text or
if the LLM's score is not a digit in the range 1-5.
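
        Example:
            .. code-block:: python

                # A minimal sketch; the input string is illustrative, not
                # real model output.
                parser = TrajectoryOutputParser()
                result = parser.parse("The agent answered correctly. Score: 5")
                assert result["score"] == 1.0
                assert result["reasoning"] == "The agent answered correctly."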
"""
if "Score:" not in text:
raise OutputParserException(
f"Could not find score in model eval output: {text}"
)
reasoning, score_str = text.split("Score: ", maxsplit=1)
reasoning, score_str = reasoning.strip(), score_str.strip()
# Use regex to extract the score.
# This will get the number in the string, even if it is a float or more than 10.
# E.g. "Score: 1" will return 1, "Score: 3.5" will return 3.5, and
# "Score: 10" will return 10.
# The score should be an integer digit in the range 1-5.
_score = re.search(r"(\d+(\.\d+)?)", score_str)
# If the score is not found or is a float, raise an exception.
if _score is None or "." in _score.group(1):
raise OutputParserException(
f"Score is not an integer digit in the range 1-5: {text}"
)
score = int(_score.group(1))
# If the score is not in the range 1-5, raise an exception.
if not 1 <= score <= 5:
raise OutputParserException(
f"Score is not a digit in the range 1-5: {text}"
)
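        # Map the integer score from the 1-5 scale onto [0, 1]:
        # 1 -> 0.0, 2 -> 0.25, 3 -> 0.5, 4 -> 0.75, 5 -> 1.0.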
normalized_score = (score - 1) / 4
return TrajectoryEval(score=normalized_score, reasoning=reasoning)
class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
"""A chain for evaluating ReAct style agents.
This chain is used to evaluate ReAct style agents by reasoning about
the sequence of actions taken and their outcomes.
Example:
.. code-block:: python
from langchain.agents import AgentType, initialize_agent
from langchain_community.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain
from langchain.tools import tool
@tool
def geography_answers(country: str, question: str) -> str:
\"\"\"Very helpful answers to geography questions.\"\"\"
return f"{country}? IDK - We may never know {question}."
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
agent = initialize_agent(
tools=[geography_answers],
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
return_intermediate_steps=True,
)
question = "How many dwell in the largest minor region in Argentina?"
response = agent(question)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=[geography_answers], return_reasoning=True
)
result = eval_chain.evaluate_agent_trajectory(
input=question,
agent_trajectory=response["intermediate_steps"],
prediction=response["output"],
reference="Paris",
)
print(result["score"]) # noqa: T201
# 0
""" # noqa: E501
agent_tools: Optional[List[BaseTool]] = None
"""A list of tools available to the agent."""
eval_chain: LLMChain
"""The language model chain used for evaluation."""
output_parser: TrajectoryOutputParser = Field(
default_factory=TrajectoryOutputParser
)
"""The output parser used to parse the output."""
return_reasoning: bool = False # :meta private:
"""DEPRECATED. Reasoning always returned."""
class Config:
"""Configuration for the QAEvalChain."""
extra = Extra.ignore
@property
def requires_reference(self) -> bool:
"""Whether this evaluator requires a reference label."""
return False
@property
def _tools_description(self) -> str:
"""Get the description of the agent tools.
Returns:
str: The description of the agent tools.
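
        Example output (illustrative, for two hypothetical tools)::

            Tool 1: search
            Description: Search the web.

            Tool 2: calculator
            Description: Do math.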
"""
if self.agent_tools is None:
return ""
return "\n\n".join(
[
f"""Tool {i}: {tool.name}
Description: {tool.description}"""
for i, tool in enumerate(self.agent_tools, 1)
]
)
    @staticmethod
def get_agent_trajectory(
steps: Union[str, Sequence[Tuple[AgentAction, str]]],
) -> str:
"""Get the agent trajectory as a formatted string.
Args:
            steps (Union[str, Sequence[Tuple[AgentAction, str]]]): The agent trajectory.
Returns:
str: The formatted agent trajectory.
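
        Example:
            .. code-block:: python

                # A minimal sketch with one hypothetical step:
                from langchain_core.agents import AgentAction

                step = (AgentAction(tool="search", tool_input="weather", log=""), "Sunny")
                print(TrajectoryEvalChain.get_agent_trajectory([step]))
                # Step 1:
                # Tool used: search
                # Tool input: weather
                # Tool output: Sunny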
"""
if isinstance(steps, str):
return steps
return "\n\n".join(
[
f"""Step {i}:
Tool used: {action.tool}
Tool input: {action.tool_input}
Tool output: {output}"""
for i, (action, output) in enumerate(steps, 1)
]
)
@staticmethod
def _format_reference(reference: Optional[str]) -> str:
"""Format the reference text.
Args:
            reference (Optional[str]): The reference text.
Returns:
str: The formatted reference text.
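
        Example:
            .. code-block:: python

                # A minimal sketch; "Paris" is an illustrative reference value.
                formatted = TrajectoryEvalChain._format_reference("Paris")
                assert "[GROUND_TRUTH]" in formatted and "Paris" in formatted
                assert TrajectoryEvalChain._format_reference(None) == ""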
"""
if not reference:
return ""
return f"""
The following is the expected answer. Use this to measure correctness:
[GROUND_TRUTH]
{reference}
[END_GROUND_TRUTH]
"""
    @classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
agent_tools: Optional[Sequence[BaseTool]] = None,
output_parser: Optional[TrajectoryOutputParser] = None,
**kwargs: Any,
) -> "TrajectoryEvalChain":
"""Create a TrajectoryEvalChain object from a language model chain.
Args:
llm (BaseChatModel): The language model chain.
agent_tools (Optional[Sequence[BaseTool]]): A list of tools
available to the agent.
output_parser (Optional[TrajectoryOutputParser]): The output parser
used to parse the chain output into a score.
Returns:
TrajectoryEvalChain: The TrajectoryEvalChain object.
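
        Example:
            .. code-block:: python

                # A minimal sketch assuming an OpenAI chat model; any
                # BaseChatModel works. Omitting agent_tools selects the
                # tool-free evaluation prompt.
                from langchain_community.chat_models import ChatOpenAI

                llm = ChatOpenAI(model="gpt-4", temperature=0)
                eval_chain = TrajectoryEvalChain.from_llm(llm=llm)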
"""
if not isinstance(llm, BaseChatModel):
raise NotImplementedError(
"Only chat models supported by the current trajectory eval"
)
if agent_tools:
prompt = EVAL_CHAT_PROMPT
else:
prompt = TOOL_FREE_EVAL_CHAT_PROMPT
eval_chain = LLMChain(llm=llm, prompt=prompt)
return cls(
agent_tools=agent_tools,
eval_chain=eval_chain,
output_parser=output_parser or TrajectoryOutputParser(),
**kwargs,
)
@property
def input_keys(self) -> List[str]:
"""Get the input keys for the chain.
Returns:
List[str]: The input keys.
"""
return ["question", "agent_trajectory", "answer", "reference"]
@property
def output_keys(self) -> List[str]:
"""Get the output keys for the chain.
Returns:
List[str]: The output keys.
"""
return ["score", "reasoning"]
def _call(
self,
inputs: Dict[str, str],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Run the chain and generate the output.
Args:
inputs (Dict[str, str]): The input values for the chain.
run_manager (Optional[CallbackManagerForChainRun]): The callback
manager for the chain run.
Returns:
Dict[str, Any]: The output values of the chain.
"""
chain_input = {**inputs}
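        # The tool-aware prompt expects a tool_descriptions variable; the
        # tool-free prompt does not, so only inject it when tools are set.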
if self.agent_tools:
chain_input["tool_descriptions"] = self._tools_description
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
raw_output = self.eval_chain.run(
chain_input, callbacks=_run_manager.get_child()
)
return cast(dict, self.output_parser.parse(raw_output))
async def _acall(
self,
inputs: Dict[str, str],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Run the chain and generate the output.
Args:
inputs (Dict[str, str]): The input values for the chain.
            run_manager (Optional[AsyncCallbackManagerForChainRun]): The async callback
manager for the chain run.
Returns:
Dict[str, Any]: The output values of the chain.
"""
chain_input = {**inputs}
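        # As in _call, inject tool descriptions only for the tool-aware prompt.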
if self.agent_tools:
chain_input["tool_descriptions"] = self._tools_description
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
raw_output = await self.eval_chain.arun(
chain_input, callbacks=_run_manager.get_child()
)
return cast(dict, self.output_parser.parse(raw_output))
def _evaluate_agent_trajectory(
self,
*,
prediction: str,
input: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate a trajectory.
Args:
prediction (str): The final predicted response.
input (str): The input to the agent.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
reference (Optional[str]): The reference answer.
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
            dict: The evaluation result, which includes the score and the
                reasoning used to reach it.
"""
inputs = {
"question": input,
"agent_trajectory": self.get_agent_trajectory(agent_trajectory),
"answer": prediction,
"reference": reference,
}
return self.__call__(
inputs=inputs,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)
async def _aevaluate_agent_trajectory(
self,
*,
prediction: str,
input: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a trajectory.
Args:
prediction (str): The final predicted response.
input (str): The input to the agent.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
reference (Optional[str]): The reference answer.
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
            dict: The evaluation result, which includes the score and the
                reasoning used to reach it.
"""
inputs = {
"question": input,
"agent_trajectory": self.get_agent_trajectory(agent_trajectory),
"answer": prediction,
"reference": reference,
}
return await self.acall(
inputs=inputs,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)