From 50407b8c9bb9446cf99d9235cc9a81bb6a6a69ce Mon Sep 17 00:00:00 2001
From: Ian Arawjo
Date: Sun, 26 May 2024 15:49:13 -0400
Subject: [PATCH] First working version (no backend)

---
 chainforge/react-server/src/EvalGenModal.tsx  | 169 ++++++++++++++----
 .../src/backend/evalgen/executor.ts           |   2 +-
 .../src/backend/evalgen/oai_utils.ts          |  89 +++++----
 .../react-server/src/backend/evalgen/utils.ts |  10 +-
 chainforge/react-server/src/backend/typing.ts |   2 +-
 5 files changed, 196 insertions(+), 76 deletions(-)

diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index 36f344b..b71b501 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -28,7 +28,6 @@ import React, {
 import { v4 as uuid } from "uuid";
 import {
   ActionIcon,
-  Affix,
   Box,
   Button,
   Card,
@@ -41,7 +40,7 @@ import {
   Menu,
   Modal,
   Radio,
-  ScrollArea,
+  Skeleton,
   Stack,
   Text,
   TextInput,
@@ -51,7 +50,7 @@ import {
   rem,
 } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
-import { LLMResponse, PromptVarsDict, RatingDict } from "./backend/typing";
+import { Dict, LLMResponse, PromptVarsDict, RatingDict } from "./backend/typing";
 import { EvalCriteria } from "./backend/evalgen/typing";
 import {
   IconChevronDown,
@@ -65,11 +64,17 @@ import {
   IconThumbUp,
   IconTrash,
 } from "@tabler/icons-react";
-import { cleanMetavarsFilterFunc, deepcopy, sampleRandomElements, transformDict } from "./backend/utils";
+import {
+  cleanMetavarsFilterFunc,
+  deepcopy,
+  sampleRandomElements,
+  transformDict,
+} from "./backend/utils";
 import useStore from "./store";
 import { getRatingKeyForResponse } from "./ResponseRatingToolbar";
 import StorageCache from "./backend/cache";
 import EvaluationFunctionExecutor from "./backend/evalgen/executor";
+import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
 
 const INIT_CRITERIA: EvalCriteria[] = [
   {
@@ -115,7 +120,7 @@ const ThumbUpDownButtons = ({
         if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
       }}
     >
   );
@@ -323,18 +328,36 @@ export interface EvalGenModalRef {
 const EvalGenModal = forwardRef>(
   function EvalGenModal(props, ref) {
     const [opened, { open, close }] = useDisclosure(false);
+    const apiKeys = useStore((state) => state.apiKeys);
     const [criteria, setCriteria] = useState(INIT_CRITERIA);
     const [responses, setResponses] = useState([]);
-    const [shownResponse, setShownResponse] = useState(undefined);
-    const [pastShownResponses, setPastShownResponses] = useState([]);
+    const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
+      undefined,
+    );
+    const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
+      [],
+    );
     const [shownResponseIdx, setShownResponseIdx] = useState(0);
-    const [annotation, setAnnotation] = useState(null);
-    const [promptReasoning, setPromptReasoning] = useState(null);
+    const [annotation, setAnnotation] = useState<string | undefined>(undefined);
+    const [holisticGrade, setHolisticGrade] = useState<"good" | "bad" | undefined>(undefined);
+
+    // Per-criteria grades (indexed by uid of response, then uid of criteria)
+    const [grades, setGrades] = useState<Dict<Dict<boolean | undefined>>>({});
+    const setPerCriteriaGrade = (responseUID: string, criteriaUID: string, newGrade: boolean | undefined) => {
+      setGrades((grades) => {
+        if (!grades[responseUID]) grades[responseUID] = {};
+        grades[responseUID][criteriaUID] = newGrade;
+        grades[responseUID] = {...grades[responseUID]};
+        return {...grades};
+      });
+    };
 
     // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
-    const [executor, setExecutor] = useState(null);
+    const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
+      null,
+    );
     const [execProgress, setExecProgress] = useState(0);
 
     // For updating the global human ratings state
@@ -354,6 +377,19 @@ const EvalGenModal = forwardRef>(
       // We pass the responses here manually to ensure they remain the same
       // for the duration of one EvalGen operation.
       setResponses(resps);
+      setGrades(resps.reduce((acc: Dict<Dict<boolean | undefined>>, curr) => {
+        acc[curr.uid] = {};
+        return acc;
+      }, {}));
+      setShownResponseIdx(0);
+      if (resps.length > 0) {
+        const first_resp = sampleRandomElements(resps, 1)[0];
+        setShownResponse(first_resp);
+        setPastShownResponses([first_resp]);
+      } else {
+        setShownResponse(undefined);
+        setPastShownResponses([]);
+      }
       open();
     };
     useImperativeHandle(ref, () => ({
@@ -388,6 +424,54 @@ const EvalGenModal = forwardRef>(
       });
     };
 
+    // Synthesize a new criteria according to the feedback given for the shown response
+    const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+    const synthNewCriteriaWithLLM = (response: string, feedback: string, grade: "good" | "bad" | "unknown") => {
+      // Add a loading Skeleton
+      setIsLoadingCriteria((num) => num + 1);
+      // Make async LLM call to expand criteria
+      generateLLMEvaluationCriteria(
+        "",
+        apiKeys,
+        `I've given some feedback on some text output. Use this feedback to decide on a single evaluation criteria with a yes/no answer. I want you to take the criteria and output a JSON object in the format below.
+
+TEXT OUTPUT:
+\`\`\`
+${response}
+\`\`\`
+
+GRADE (whether text was good or bad):
+\`\`\`
+${grade}
+\`\`\`
+
+FEEDBACK:
+\`\`\`
+${feedback}
+\`\`\`
+
+Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, and the "shortname" should be a very brief title for the criteria. The criteria should test a single unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
+        "gpt-4-turbo", // llm
+      )
+        .then((evalCrits) => {
+          // Take only the first
+          setCriteria((crit) =>
+            crit.concat([
+              {
+                ...evalCrits[0],
+                uid: uuid(),
+              },
+            ]),
+          );
+          // Remove a loading Skeleton
+          setIsLoadingCriteria((num) => num - 1);
+        })
+        .catch((err) => {
+          console.error(err);
+          setIsLoadingCriteria((num) => num - 1);
+        });
+    };
+
     // Goto next response in the queue (skipping grading the current one)
     const nextResponse = () => {
       if (responses.length === 0) return;
@@ -400,11 +484,12 @@ const EvalGenModal = forwardRef>(
         typeof annotation === "string" &&
         annotation.trim().length > 0
       ) {
-        // console.log("setting annotation for resp", shownResponse.uid, annotation);
+        console.log("setting annotation for resp", shownResponse.uid, annotation);
         updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
-        setAnnotation(null);
+        setAnnotation("");
       }
-      setPromptReasoning(null);
+      // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
+      setHolisticGrade(null);
 
       if (shownResponseIdx < pastShownResponses.length - 1) {
         // If we are not at the end of the history of shown responses, then show the next response:
@@ -417,7 +502,8 @@ const EvalGenModal = forwardRef>(
       let next_resp = executor?.getNextExampleToGrade();
       while (
         num_tries > 0 &&
-        (!next_resp || pastShownResponses.some((r) => r.uid === next_resp?.uid))
+        (!next_resp ||
+          pastShownResponses.some((r) => r.uid === next_resp?.uid))
       ) {
         // We're presenting a response that's already been shown. Try again.
         // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
@@ -431,7 +517,7 @@ const EvalGenModal = forwardRef>(
       }
       // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
       // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
-      setShownResponse(next_resp ? next_resp : undefined);
+      setShownResponse(next_resp ?? undefined);
       if (next_resp)
         setPastShownResponses(pastShownResponses.concat(next_resp));
       setShownResponseIdx(pastShownResponses.length);
@@ -458,7 +544,11 @@ const EvalGenModal = forwardRef>(
 
         {/* View showing the response the user is currently grading */}
 
         {/* Progress bar */}
         {/*
@@ -494,25 +584,30 @@ const EvalGenModal = forwardRef>(
                   key={e.uid}
                   onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
                   onDelete={() => handleDeleteCriteria(e.uid)}
-                  grade={undefined}
-                  onChangeGrade={() => {
-                    console.log("hi");
+                  grade={shownResponse ? grades[shownResponse.uid][e.uid] : undefined}
+                  onChangeGrade={(newGrade) => {
+                    if (shownResponse)
+                      setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
                   }}
                   initiallyOpen={true}
                 />
               ))}
+              { isLoadingCriteria > 0 ? Array.from({length: isLoadingCriteria}, () => <Skeleton />) : <></>}
@@ -521,12 +616,16 @@ const EvalGenModal = forwardRef>(
             Provide Additional Feedback
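
Not part of the patch: a minimal TypeScript sketch of the single-criteria JSON object that the synthNewCriteriaWithLLM prompt above asks the model to return, and of how the .then() handler folds it into the criteria list. The EvalCriteriaSketch interface and the sample values are assumptions for illustration only; the field names are taken from the prompt text, and the actual EvalCriteria type lives in backend/evalgen/typing.ts and may differ.

import { v4 as uuid } from "uuid";

// Assumed shape of one synthesized criteria entry (field names from the prompt above).
interface EvalCriteriaSketch {
  shortname: string;
  criteria: string;
  eval_method: "code" | "expert";
  uid?: string;
}

// Example of what the model might return inside the ```json fence, once parsed.
const evalCrits: EvalCriteriaSketch[] = [
  {
    shortname: "No apologies",
    criteria:
      "The output answers the question directly. It does not open with an apology or a disclaimer.",
    eval_method: "expert",
  },
];

// Mirrors the .then() handler: take only the first criteria and attach a fresh uid,
// which the UI later uses to key the criteria card and index per-criteria grades.
const newCriteria: EvalCriteriaSketch = { ...evalCrits[0], uid: uuid() };
console.log(newCriteria);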