diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index 36f344b..b71b501 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -28,7 +28,6 @@ import React, {
import { v4 as uuid } from "uuid";
import {
- Affix,
@@ -41,7 +40,7 @@ import {
- ScrollArea,
+ Skeleton,
@@ -51,7 +50,7 @@ import {
} from "@mantine/core";
import { useDisclosure } from "@mantine/hooks";
-import { LLMResponse, PromptVarsDict, RatingDict } from "./backend/typing";
+import { Dict, LLMResponse, PromptVarsDict, RatingDict } from "./backend/typing";
import { EvalCriteria } from "./backend/evalgen/typing";
import {
@@ -65,11 +64,17 @@ import {
} from "@tabler/icons-react";
-import { cleanMetavarsFilterFunc, deepcopy, sampleRandomElements, transformDict } from "./backend/utils";
+import {
+ cleanMetavarsFilterFunc,
+ deepcopy,
+ sampleRandomElements,
+ transformDict,
+} from "./backend/utils";
import useStore from "./store";
import { getRatingKeyForResponse } from "./ResponseRatingToolbar";
import StorageCache from "./backend/cache";
import EvaluationFunctionExecutor from "./backend/evalgen/executor";
+import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
const INIT_CRITERIA: EvalCriteria[] = [
@@ -115,7 +120,7 @@ const ThumbUpDownButtons = ({
if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
@@ -323,18 +328,36 @@ export interface EvalGenModalRef {
const EvalGenModal = forwardRef>(
function EvalGenModal(props, ref) {
const [opened, { open, close }] = useDisclosure(false);
+ const apiKeys = useStore((state) => state.apiKeys);
const [criteria, setCriteria] = useState(INIT_CRITERIA);
const [responses, setResponses] = useState([]);
- const [shownResponse, setShownResponse] = useState(undefined);
- const [pastShownResponses, setPastShownResponses] = useState([]);
+ const [shownResponse, setShownResponse] = useState(
+ undefined,
+ );
+ const [pastShownResponses, setPastShownResponses] = useState(
+ [],
+ );
const [shownResponseIdx, setShownResponseIdx] = useState(0);
- const [annotation, setAnnotation] = useState(null);
- const [promptReasoning, setPromptReasoning] = useState(null);
+ const [annotation, setAnnotation] = useState(undefined);
+ const [holisticGrade, setHolisticGrade] = useState<"good" | "bad" | undefined>(undefined);
+ // Per-criteria grades (indexed by uid of response, then uid of criteria)
+ const [grades, setGrades] = useState>>({});
+ // Record the grade given to one criteria for one response; passing `undefined`
+ // as `newGrade` stores `undefined` for that criteria.
+ // The updater builds fresh outer and inner objects instead of writing into the
+ // previous state: React expects updater functions to be pure, and mutating the
+ // previous `grades` object in place would leak changes into state that React
+ // still considers current.
+ const setPerCriteriaGrade = (responseUID: string, criteriaUID: string, newGrade: boolean | undefined) => {
+   setGrades((prevGrades) => ({
+     ...prevGrades,
+     [responseUID]: {
+       // Spreading `undefined` is a no-op, so a response with no entry yet starts empty.
+       ...prevGrades[responseUID],
+       [criteriaUID]: newGrade,
+     },
+   }));
+ };
// The EvalGen object responsible for generating, implementing, and filtering candidate implementations
- const [executor, setExecutor] = useState(null);
+ const [executor, setExecutor] = useState(
+ null,
+ );
const [execProgress, setExecProgress] = useState(0);
// For updating the global human ratings state
@@ -354,6 +377,19 @@ const EvalGenModal = forwardRef>(
// We pass the responses here manually to ensure they remain the same
// for the duration of one EvalGen operation.
+ setGrades(resps.reduce((acc: Dict>, curr) => {
+ acc[curr.uid] = {};
+ return acc;
+ }, {}));
+ setShownResponseIdx(0);
+ if (resps.length > 0) {
+ const first_resp = sampleRandomElements(resps, 1)[0];
+ setShownResponse(first_resp);
+ setPastShownResponses([first_resp]);
+ } else {
+ setShownResponse(undefined);
+ setPastShownResponses([]);
+ }
useImperativeHandle(ref, () => ({
@@ -388,6 +424,54 @@ const EvalGenModal = forwardRef>(
+ // Synthesize a new criteria according to the feedback given for the shown response
+ const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+ /**
+  * Ask the LLM ("gpt-4-turbo") for evaluation criteria and append the first one
+  * returned to the `criteria` state under a fresh uid.
+  *
+  * `isLoadingCriteria` is incremented before the request starts and decremented
+  * exactly once when it settles, whether it succeeded or failed.
+  *
+  * NOTE(review): `response`, `feedback` and `grade` are not interpolated into the
+  * prompt text visible here. Confirm whether the prompt is meant to include them.
+  */
+ const synthNewCriteriaWithLLM = (response: string, feedback: string, grade: "good" | "bad" | "unknown") => {
+   // Add a loading Skeleton
+   setIsLoadingCriteria((num) => num + 1);
+   // Make async LLM call to expand criteria
+   generateLLMEvaluationCriteria(
+     "",
+     apiKeys,
+     `I've given some feedback on some text output. Use this feedback to decide on a single evaluation criteria with a yes/no answer. I want you to take the criteria and output a JSON object in the format below.
+GRADE (whether text was good or bad):
+Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
+     "gpt-4-turbo", // llm
+   )
+     .then((evalCrits) => {
+       // Take only the first
+       const firstCrit = evalCrits[0];
+       if (!firstCrit) {
+         // An empty result would otherwise append a criteria holding nothing but a uid.
+         console.warn("LLM returned no evaluation criteria; nothing was added.");
+         return;
+       }
+       setCriteria((crit) =>
+         crit.concat([
+           {
+             ...firstCrit,
+             uid: uuid(),
+           },
+         ]),
+       );
+     })
+     .catch((err) => {
+       console.error(err);
+     })
+     .finally(() => {
+       // Remove a loading Skeleton (runs once on success and on failure alike)
+       setIsLoadingCriteria((num) => num - 1);
+     });
+ };
// Goto next response in the queue (skipping grading the current one)
const nextResponse = () => {
if (responses.length === 0) return;
@@ -400,11 +484,12 @@ const EvalGenModal = forwardRef>(
typeof annotation === "string" &&
annotation.trim().length > 0
) {
- // console.log("setting annotation for resp", shownResponse.uid, annotation);
+ console.log("setting annotation for resp", shownResponse.uid, annotation);
updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
- setAnnotation(null);
+ setAnnotation("");
- setPromptReasoning(null);
+ // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
+ setHolisticGrade(null);
if (shownResponseIdx < pastShownResponses.length - 1) {
// If we are not at the end of the history of shown responses, then show the next response:
@@ -417,7 +502,8 @@ const EvalGenModal = forwardRef>(
let next_resp = executor?.getNextExampleToGrade();
while (
num_tries > 0 &&
- (!next_resp || pastShownResponses.some((r) => r.uid === next_resp?.uid))
+ (!next_resp ||
+ pastShownResponses.some((r) => r.uid === next_resp?.uid))
) {
// We're presenting a response that's already been shown. Try again.
// NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
@@ -431,7 +517,7 @@ const EvalGenModal = forwardRef>(
// Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
// However, the internal "grades" dict will help us in remembering what grade the user gave the response.
- setShownResponse(next_resp ? next_resp : undefined);
+ setShownResponse(next_resp ?? undefined);
if (next_resp)
@@ -458,7 +544,11 @@ const EvalGenModal = forwardRef>(
{/* View showing the response the user is currently grading */}
{/* Progress bar */}
@@ -494,25 +584,30 @@ const EvalGenModal = forwardRef>(
onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
onDelete={() => handleDeleteCriteria(e.uid)}
- grade={undefined}
- onChangeGrade={() => {
- console.log("hi");
+ grade={shownResponse ? grades[shownResponse.uid][e.uid] : undefined}
+ onChangeGrade={(newGrade) => {
+ if (shownResponse)
+ setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
+ { isLoadingCriteria > 0 ? Array.from({length: isLoadingCriteria}, () => ) : <>>}