First working version (no backend)

Ian Arawjo 2024-05-26 15:49:13 -04:00
parent 1b093b8560
commit 50407b8c9b
5 changed files with 196 additions and 76 deletions

View File

@@ -28,7 +28,6 @@ import React, {
import { v4 as uuid } from "uuid";
import {
ActionIcon,
Affix,
Box,
Button,
Card,
@@ -41,7 +40,7 @@ import {
Menu,
Modal,
Radio,
ScrollArea,
Skeleton,
Stack,
Text,
TextInput,
@@ -51,7 +50,7 @@ import {
rem,
} from "@mantine/core";
import { useDisclosure } from "@mantine/hooks";
import { LLMResponse, PromptVarsDict, RatingDict } from "./backend/typing";
import { Dict, LLMResponse, PromptVarsDict, RatingDict } from "./backend/typing";
import { EvalCriteria } from "./backend/evalgen/typing";
import {
IconChevronDown,
@@ -65,11 +64,17 @@ import {
IconThumbUp,
IconTrash,
} from "@tabler/icons-react";
import { cleanMetavarsFilterFunc, deepcopy, sampleRandomElements, transformDict } from "./backend/utils";
import {
cleanMetavarsFilterFunc,
deepcopy,
sampleRandomElements,
transformDict,
} from "./backend/utils";
import useStore from "./store";
import { getRatingKeyForResponse } from "./ResponseRatingToolbar";
import StorageCache from "./backend/cache";
import EvaluationFunctionExecutor from "./backend/evalgen/executor";
import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
const INIT_CRITERIA: EvalCriteria[] = [
{
@@ -115,7 +120,7 @@ const ThumbUpDownButtons = ({
if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
}}
>
<IconThumbUp size="14pt" />
<IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
</Button>
<Button
color={grade === false ? "red" : "gray"}
@@ -127,7 +132,7 @@
if (onChangeGrade) onChangeGrade(grade === false ? undefined : false);
}}
>
<IconThumbDown size="14pt" />
<IconThumbDown size="14pt" fill={grade === false ? "pink" : "white"} />
</Button>
</>
);
@@ -323,18 +328,36 @@ export interface EvalGenModalRef {
const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
function EvalGenModal(props, ref) {
const [opened, { open, close }] = useDisclosure(false);
const apiKeys = useStore((state) => state.apiKeys);
const [criteria, setCriteria] = useState<EvalCriteria[]>(INIT_CRITERIA);
const [responses, setResponses] = useState<LLMResponse[]>([]);
const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(undefined);
const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>([]);
const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
undefined,
);
const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
[],
);
const [shownResponseIdx, setShownResponseIdx] = useState(0);
const [annotation, setAnnotation] = useState<string | null>(null);
const [promptReasoning, setPromptReasoning] = useState<true | null>(null);
const [annotation, setAnnotation] = useState<string | undefined>(undefined);
const [holisticGrade, setHolisticGrade] = useState<"good" | "bad" | undefined>(undefined);
// Per-criteria grades (indexed by uid of response, then uid of criteria)
const [grades, setGrades] = useState<Dict<Dict<boolean | undefined>>>({});
const setPerCriteriaGrade = (responseUID: string, criteriaUID: string, newGrade: boolean | undefined) => {
setGrades((grades) => {
if (!grades[responseUID]) grades[responseUID] = {};
grades[responseUID][criteriaUID] = newGrade;
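// Spread both levels so the state update produces fresh object references and triggers a re-render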
grades[responseUID] = {...grades[responseUID]};
return {...grades};
});
};
// The EvalGen object responsible for generating, implementing, and filtering candidate implementations
const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(null);
const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
null,
);
const [execProgress, setExecProgress] = useState(0);
// For updating the global human ratings state
@@ -354,6 +377,19 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
// We pass the responses here manually to ensure they remain the same
// for the duration of one EvalGen operation.
setResponses(resps);
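// Reset per-criteria grades, starting with an empty grade map for each response UID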
setGrades(resps.reduce((acc: Dict<Dict<boolean | undefined>>, curr) => {
acc[curr.uid] = {};
return acc;
}, {}));
setShownResponseIdx(0);
if (resps.length > 0) {
const first_resp = sampleRandomElements(resps, 1)[0];
setShownResponse(first_resp);
setPastShownResponses([first_resp]);
} else {
setShownResponse(undefined);
setPastShownResponses([]);
}
open();
};
useImperativeHandle(ref, () => ({
@@ -388,6 +424,54 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
});
};
// Synthesize a new criteria according to the feedback given for the shown response
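// Number of criteria-generation requests currently in flight; each one renders a loading Skeleton in the criteria list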
const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
const synthNewCriteriaWithLLM = (response: string, feedback: string, grade: "good" | "bad" | "unknown") => {
// Add a loading Skeleton
setIsLoadingCriteria((num) => num + 1);
// Make async LLM call to expand criteria
generateLLMEvaluationCriteria(
"",
apiKeys,
`I've given some feedback on some text output. Use this feedback to decide on a single evaluation criteria with a yes/no answer. I want you to take the criteria and output a JSON object in the format below.
TEXT OUTPUT:
\`\`\`
${response}
\`\`\`
GRADE (whether text was good or bad):
\`\`\`
${grade}
\`\`\`
FEEDBACK:
\`\`\`
${feedback}
\`\`\`
Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's feedback, and the "shortname" should be a very brief title for the criteria. The criteria should test a single unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
"gpt-4-turbo", // llm
)
.then((evalCrits) => {
// Take only the first
setCriteria((crit) =>
crit.concat([
{
...evalCrits[0],
uid: uuid(),
},
]),
);
// Remove a loading Skeleton
setIsLoadingCriteria((num) => num - 1);
})
.catch((err) => {
console.error(err);
setIsLoadingCriteria((num) => num - 1);
});
};
// Goto next response in the queue (skipping grading the current one)
const nextResponse = () => {
if (responses.length === 0) return;
@@ -400,11 +484,12 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
typeof annotation === "string" &&
annotation.trim().length > 0
) {
// console.log("setting annotation for resp", shownResponse.uid, annotation);
console.log("setting annotation for resp", shownResponse.uid, annotation);
updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
setAnnotation(null);
setAnnotation("");
}
setPromptReasoning(null);
// @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
setHolisticGrade(null);
if (shownResponseIdx < pastShownResponses.length - 1) {
// If we are not at the end of the history of shown responses, then show the next response:
@@ -417,7 +502,8 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
let next_resp = executor?.getNextExampleToGrade();
while (
num_tries > 0 &&
(!next_resp || pastShownResponses.some((r) => r.uid === next_resp?.uid))
(!next_resp ||
pastShownResponses.some((r) => r.uid === next_resp?.uid))
) {
// We're presenting a response that's already been shown. Try again.
// NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
@@ -431,7 +517,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
}
// Note that this doesn't guarantee uniqueness here; it is possible to see a response again.
// However, the internal "grades" dict will help us in remembering what grade the user gave the response.
setShownResponse(next_resp ? next_resp : undefined);
setShownResponse(next_resp ?? undefined);
if (next_resp)
setPastShownResponses(pastShownResponses.concat(next_resp));
setShownResponseIdx(pastShownResponses.length);
@@ -458,7 +544,11 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
<Grid.Col span={8}>
<Stack justify="space-between">
{/* View showing the response the user is currently grading */}
<GradingView shownResponse={shownResponse} gotoNextResponse={nextResponse} gotoPrevResponse={prevResponse} />
<GradingView
shownResponse={shownResponse}
gotoNextResponse={nextResponse}
gotoPrevResponse={prevResponse}
/>
{/* Progress bar */}
{/* <Flex justify="left" align="center" gap="md">
@@ -494,25 +584,30 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
key={e.uid}
onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
onDelete={() => handleDeleteCriteria(e.uid)}
grade={undefined}
onChangeGrade={() => {
console.log("hi");
grade={shownResponse ? grades[shownResponse.uid][e.uid] : undefined}
onChangeGrade={(newGrade) => {
if (shownResponse)
setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
}}
initiallyOpen={true}
/>
))}
{isLoadingCriteria > 0 ? Array.from({ length: isLoadingCriteria }, (_, idx) => <Skeleton key={idx} h={80} />) : <></>}
<Center>
<button onClick={() => {
handleAddCriteria({
shortname: "New Criteria",
criteria: "",
eval_method: "code",
priority: 0,
uid: uuid(),
});
}}>+</button>
<button
onClick={() => {
handleAddCriteria({
shortname: "New Criteria",
criteria: "",
eval_method: "code",
priority: 0,
uid: uuid(),
});
}}
>
+
</button>
</Center>
</div>
<Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
@@ -521,12 +616,16 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
Provide Additional Feedback
</Title>
<Textarea
value={annotation}
onChange={(e) => setAnnotation(e.target.value)}
description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
mb="sm"
/>
<Radio.Group
name="favoriteFramework"
label="Rate the response holistically:"
value={holisticGrade}
onChange={(v) => setHolisticGrade(v as ("good" | "bad"))}
withAsterisk
mb="md"
>
@@ -539,14 +638,10 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
<Button
color="green"
variant="filled"
disabled={!holisticGrade || (annotation === undefined || annotation.length === 0)}
onClick={() => {
handleAddCriteria({
shortname: "Criteria",
criteria: "",
eval_method: "code",
priority: 0,
uid: uuid(),
});
synthNewCriteriaWithLLM(shownResponse?.responses[0].toString() ?? "", annotation ?? "", holisticGrade ?? "unknown")
nextResponse();
}}
>
+ Submit Feedback

View File

@@ -176,7 +176,7 @@ export default class EvaluationFunctionExecutor {
this.evalCriteria.length * 5 * this.examples.length;
let criteriaProcessed = 0; // Track the number of criteria processed
let resolveAllFunctionsGenerated: any = undefined; // To be called when all functions are generated and executed
let resolveAllFunctionsGenerated: any; // To be called when all functions are generated and executed
const functionExecutionPromises: Promise<any>[] = []; // Track execution promises for function executions
// This promise resolves when the 'allFunctionsGenerated' event is emitted

View File

@@ -56,7 +56,6 @@ export class AzureOpenAIStreamer extends EventEmitter {
// const events = await this.client.listChatCompletions(model, messages, {});
// for await (const event of events) {
// for (const choice of event.choices) {
// const delta = choice.delta?.content;
@@ -76,50 +75,70 @@ export class AzureOpenAIStreamer extends EventEmitter {
// Uses the REST API directly, as described here: https://stackoverflow.com/questions/76137987/openai-completion-stream-with-node-js-and-express-js
const streamRes = await fetch("https://api.openai.com/v1/chat/completions", {method: "POST", headers: {"Authorization": `Bearer ${this.openai_api_key}`, "Content-Type": "application/json"}, body: JSON.stringify({model: model, messages: messages, stream: true})});
const streamRes = await fetch(
"https://api.openai.com/v1/chat/completions",
{
method: "POST",
headers: {
Authorization: `Bearer ${this.openai_api_key}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: model,
messages: messages,
stream: true,
}),
},
);
const reader = streamRes.body?.getReader();
let done = false;
let concenattedJsonStrn = '';
if (!reader) {
console.error("Error initializing reader for OpenAI requests.");
return;
}
while (!done && reader) {
const { value, done: readerDone, } = await reader.read();
let done = false;
let concenattedJsonStrn = "";
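// Accumulate raw stream chunks until the buffer holds at least one complete SSE event ("data: ...\n\n")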
while (!done) {
const { value, done: readerDone } = await reader.read();
done = readerDone;
const buffer = Buffer.from(value as ArrayBuffer);
const textPayload = buffer.toString();
concenattedJsonStrn += textPayload;
if (!concenattedJsonStrn.includes(`data: `) || !concenattedJsonStrn.includes(`\n\n`)) {
continue;
if (
!concenattedJsonStrn.includes(`data: `) ||
!concenattedJsonStrn.includes(`\n\n`)
) {
continue;
}
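// The buffer holds at least one complete event: split on the SSE delimiter and parse each payload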
const payloads = concenattedJsonStrn.toString().split("\n\n");
concenattedJsonStrn = '';
for (const payload of payloads) {
if (payload.includes('[DONE]')) return;
if (payload.startsWith("data:")) {
try {
const data = JSON.parse(payload.replace("data: ", ""));
const delta: undefined | string = data.choices[0].delta?.content;
if (delta !== undefined) {
if (type === "criteria") {
this.processCriteriaDelta(delta);
} else if (type === "llm_eval") {
this.processStringDelta(delta);
} else if (type === "python_fn") {
this.processFunctionDelta(delta);
} else {
throw new Error("Invalid type");
}
}
} catch (error) {
console.log(`Error with JSON.parse and ${payload}.\n${error}`);
concenattedJsonStrn += payload;
}
}
}
}
concenattedJsonStrn = "";
for (const payload of payloads) {
if (payload.includes("[DONE]")) return;
if (payload.startsWith("data:")) {
try {
const data = JSON.parse(payload.replace("data: ", ""));
const delta: undefined | string = data.choices[0].delta?.content;
if (delta !== undefined) {
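// Dispatch the streamed token to the handler matching what is being generated (criteria, llm_eval, or python_fn)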
if (type === "criteria") {
this.processCriteriaDelta(delta);
} else if (type === "llm_eval") {
this.processStringDelta(delta);
} else if (type === "python_fn") {
this.processFunctionDelta(delta);
} else {
throw new Error("Invalid type");
}
}
} catch (error) {
console.log(`Error with JSON.parse and ${payload}.\n${error}`);
concenattedJsonStrn += payload;
}
}
}
}
this.emit("end"); // Signal that streaming is complete
}

View File

@@ -59,7 +59,6 @@ export async function generateLLMEvaluationCriteria(
// Query the LLM (below, we will try this up to 3 times)
async function _query() {
// TODO: Get rid of this hard-coded spec in favor of regular openai (or another model)
const spec = [
{
@@ -140,7 +139,14 @@ export async function generateLLMEvaluationCriteria(
console.log("Parsed", data);
// Double-check the formatting
if (data.every(validEvalCriteriaFormat)) return data;
if (data.every(validEvalCriteriaFormat)) {
// Initialize any required properties
data.forEach(d => {
d.uid = uuid();
d.priority = 0;
});
return data;
}
// Incorrect formatting
else
throw new Error(

View File

@@ -273,4 +273,4 @@ export type TabularDataColType = {
export type PythonInterpreter = "flask" | "pyodide";
export type RatingDict = Record<number, boolean | string | undefined>;
export type RatingDict = Record<number, boolean | string | undefined>;