mirror of https://github.com/ianarawjo/ChainForge.git (synced 2025-03-14 08:16:37 +00:00)

Add Code Processor nodes (#180)

* Add Code Processor nodes.
* Renamed EvaluatorNode to CodeEvaluatorNode.
* Changed the way evaluator nodes pull responses (from grabResponses to pullInputData).
* Fixed SimpleEvalNode to be consistent with CodeEvaluatorNode.
* Fixed Vis Node for the case where no eval responses are connected, but responses are connected.
* Rebuilt react and updated the package version.

This commit is contained in:
parent a861695c87
commit 7223735b7f
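The central plumbing change is how code nodes hand responses to the backend. A minimal TypeScript sketch of the request body before and after this commit (field names are taken from the diffs below; the ids, code strings, and response values are purely illustrative):

// Sketch only: what a code node sends to the executejs/executepy routes.
const oldPayload = {
  id: 'evaluator-node-1',
  code: "function evaluate(response) { return response.text.length; }",
  responses: ['prompt-node-1'],   // before: cache IDs of upstream nodes, re-loaded via grabResponses
  scope: 'response',
};
const newPayload = {
  id: 'processor-node-1',
  code: "function process(response) { return response.text.slice(0, 12); }",
  responses: [{                   // after: full standardized response objects pulled via pullInputData
    prompt: 'What is the capital of France?',
    vars: {}, metavars: {}, llm: 'GPT3.5',
    responses: ['The capital of France is Paris.'],
    tokens: {},
  }],
  scope: 'response',
  process_type: 'processor',      // new field: 'evaluator' scores responses, 'processor' rewrites their text
};
console.log(oldPayload, newPayload);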
@@ -158,12 +158,12 @@ def check_typeof_vals(arr: list) -> MetricType:
else:
return val_type

def run_over_responses(eval_func, responses: list, scope: str) -> list:
def run_over_responses(process_func, responses: list, scope: str, process_type: str) -> list:
for resp_obj in responses:
res = resp_obj['responses']
if scope == 'response':
# Run evaluator func over every individual response text
evals = [eval_func(
# Run process func over every individual response text
proc = [process_func(
ResponseInfo(
text=r,
prompt=resp_obj['prompt'],
@@ -172,51 +172,62 @@ def run_over_responses(eval_func, responses: list, scope: str) -> list:
llm=resp_obj['llm'])
) for r in res]

# Check the type of evaluation results
# NOTE: We assume this is consistent across all evaluations, but it may not be.
eval_res_type = check_typeof_vals(evals)
if process_type == 'processor':
# Response text was just transformed, not evaluated
resp_obj['responses'] = proc
else:
# Responses were evaluated/scored
# Check the type of evaluation results
# NOTE: We assume this is consistent across all evaluations, but it may not be.
eval_res_type = check_typeof_vals(proc)

if eval_res_type == MetricType.Numeric:
# Store items with summary of mean, median, etc
resp_obj['eval_res'] = {
'mean': mean(evals),
'median': median(evals),
'stdev': stdev(evals) if len(evals) > 1 else 0,
'range': (min(evals), max(evals)),
'items': evals,
'dtype': eval_res_type.name,
}
elif eval_res_type in (MetricType.Unknown, MetricType.Empty):
raise Exception('Unsupported types found in evaluation results. Only supported types for metrics are: int, float, bool, str.')
else:
# Categorical, KeyValue, etc, we just store the items:
resp_obj['eval_res'] = {
'items': evals,
'dtype': eval_res_type.name,
}
if eval_res_type == MetricType.Numeric:
# Store items with summary of mean, median, etc
resp_obj['eval_res'] = {
'mean': mean(proc),
'median': median(proc),
'stdev': stdev(proc) if len(proc) > 1 else 0,
'range': (min(proc), max(proc)),
'items': proc,
'dtype': eval_res_type.name,
}
elif eval_res_type in (MetricType.Unknown, MetricType.Empty):
raise Exception('Unsupported types found in evaluation results. Only supported types for metrics are: int, float, bool, str.')
else:
# Categorical, KeyValue, etc, we just store the items:
resp_obj['eval_res'] = {
'items': proc,
'dtype': eval_res_type.name,
}
else:
# Run evaluator func over the entire response batch
ev = eval_func([
# Run process func over the entire response batch
proc = process_func([
ResponseInfo(text=r,
prompt=resp_obj['prompt'],
var=resp_obj['vars'],
llm=resp_obj['llm'])
for r in res])
ev_type = check_typeof_vals([ev])
if ev_type == MetricType.Numeric:
resp_obj['eval_res'] = {
'mean': ev,
'median': ev,
'stdev': 0,
'range': (ev, ev),
'items': [ev],
'type': ev_type.name,
}
else:
resp_obj['eval_res'] = {
'items': [ev],
'type': ev_type.name,
}
for r in res])

if process_type == 'processor':
# Response text was just transformed, not evaluated
resp_obj['responses'] = proc
else:
# Responses were evaluated/scored
ev_type = check_typeof_vals([proc])
if ev_type == MetricType.Numeric:
resp_obj['eval_res'] = {
'mean': proc,
'median': proc,
'stdev': 0,
'range': (proc, proc),
'items': [proc],
'type': ev_type.name,
}
else:
resp_obj['eval_res'] = {
'items': [proc],
'type': ev_type.name,
}
return responses
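To make the evaluator/processor branches above concrete, here is a small sketch of what one response object looks like after run_over_responses in each mode (shapes follow the code above; the values are invented):

// Illustrative only: a single response object before and after processing.
const before = {
  prompt: 'What is 2+2? End with ANSWER:<answer>',
  vars: {}, llm: 'GPT3.5',
  responses: ['Let me think... ANSWER:4'],
};
// process_type === 'evaluator': responses are left alone and a score summary is attached under eval_res
const evaluated = {
  ...before,
  eval_res: { mean: 24, median: 24, stdev: 0, range: [24, 24], items: [24], dtype: 'Numeric' },
};
// process_type === 'processor': the response texts themselves are replaced by the function's return values
const processed = { ...before, responses: ['4'] };
console.log(evaluated, processed);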
async def make_sync_call_async(sync_method, *args, **params):
@@ -266,6 +277,7 @@ def executepy():
'responses': List[StandardizedLLMResponse] # the responses to run on.
'scope': 'response' | 'batch' # the scope of responses to run on --a single response, or all across each batch.
# If batch, evaluator has access to 'responses'. Only matters if n > 1 for each prompt.
'process_type': 'evaluator' | 'processor' # the type of processing to perform. Evaluators only 'score'/annotate responses. Processors change responses (e.g. text).
'script_paths': unspecified | List[str] # the paths to scripts to be added to the path before the lambda function is evaluated
}
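For reference, the two kinds of user functions that 'process_type' selects between look roughly like this (TypeScript sketch; the ResponseInfo interface below is a pared-down stand-in for the real class documented in the node's info modal):

interface ResponseInfo { text: string; prompt: string; var: Record<string, string>; llm: string; }  // simplified stub

function evaluate(response: ResponseInfo): number {
  return response.text.length;        // evaluators return a score, which ends up under eval_res
}
function process(response: ResponseInfo): string {
  return response.text.slice(0, 12);  // processors return replacement text for the response
}

const r: ResponseInfo = { text: 'Paris is the capital of France.', prompt: 'What is the capital of France?', var: {}, llm: 'GPT3.5' };
console.log(evaluate(r), process(r));  // 31 'Paris is the'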
@@ -304,6 +316,9 @@ def executepy():
except Exception as e:
return jsonify({'error': f'Could not add script path to sys.path. Error message:\n{str(e)}'})

# Get processor type, if any
process_type = data['process_type'] if 'process_type' in data else 'evaluator'

# Create the evaluator function
# DANGER DANGER!
try:
@@ -311,7 +326,10 @@ def executepy():

# Double-check that there is an 'evaluate' method in our namespace.
# This will throw a NameError if not:
evaluate # noqa
if process_type == 'evaluator':
evaluate # noqa
else:
process # noqa
except Exception as e:
return jsonify({'error': f'Could not compile evaluator code. Error message:\n{str(e)}'})

@@ -319,7 +337,7 @@ def executepy():
logs = []
try:
HIJACK_PYTHON_PRINT()
evald_responses = run_over_responses(evaluate, responses, scope=data['scope']) # noqa
evald_responses = run_over_responses(evaluate if process_type == 'evaluator' else process, responses, scope=data['scope'], process_type=process_type) # noqa
logs = REVERT_PYTHON_PRINT()
except Exception as e:
logs = REVERT_PYTHON_PRINT()
@@ -1,15 +1,15 @@
{
"files": {
"main.css": "/static/css/main.8665fcca.css",
"main.js": "/static/js/main.72fd301c.js",
"main.js": "/static/js/main.77a6d92d.js",
"static/js/787.4c72bb55.chunk.js": "/static/js/787.4c72bb55.chunk.js",
"index.html": "/index.html",
"main.8665fcca.css.map": "/static/css/main.8665fcca.css.map",
"main.72fd301c.js.map": "/static/js/main.72fd301c.js.map",
"main.77a6d92d.js.map": "/static/js/main.77a6d92d.js.map",
"787.4c72bb55.chunk.js.map": "/static/js/787.4c72bb55.chunk.js.map"
},
"entrypoints": [
"static/css/main.8665fcca.css",
"static/js/main.72fd301c.js"
"static/js/main.77a6d92d.js"
]
}
@@ -1 +1 @@
<!doctype html><html lang="en"><head><meta charset="utf-8"/><script async src="https://www.googletagmanager.com/gtag/js?id=G-RN3FDBLMCR"></script><script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-RN3FDBLMCR")</script><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="A visual programming environment for prompt engineering"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>ChainForge</title><script defer="defer" src="/static/js/main.72fd301c.js"></script><link href="/static/css/main.8665fcca.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
<!doctype html><html lang="en"><head><meta charset="utf-8"/><script async src="https://www.googletagmanager.com/gtag/js?id=G-RN3FDBLMCR"></script><script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-RN3FDBLMCR")</script><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="A visual programming environment for prompt engineering"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>ChainForge</title><script defer="defer" src="/static/js/main.77a6d92d.js"></script><link href="/static/css/main.8665fcca.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long

chainforge/react-server/src/App.js (vendored, 28 lines changed)
@@ -12,7 +12,7 @@ import { IconSettings, IconTextPlus, IconTerminal, IconCsv, IconSettingsAutomati
import RemoveEdge from './RemoveEdge';
import TextFieldsNode from './TextFieldsNode'; // Import a custom node
import PromptNode from './PromptNode';
import EvaluatorNode from './EvaluatorNode';
import CodeEvaluatorNode from './CodeEvaluatorNode';
import VisNode from './VisNode';
import InspectNode from './InspectorNode';
import ScriptNode from './ScriptNode';
@@ -81,7 +81,7 @@ const nodeTypes = {
prompt: PromptNode,
chat: PromptNode,
simpleval: SimpleEvalNode,
evaluator: EvaluatorNode,
evaluator: CodeEvaluatorNode,
llmeval: LLMEvaluatorNode,
vis: VisNode,
inspect: InspectNode,
@@ -91,6 +91,7 @@ const nodeTypes = {
comment: CommentNode,
join: JoinNode,
split: SplitNode,
processor: CodeEvaluatorNode,
};

const edgeTypes = {
@@ -238,6 +239,15 @@ const App = () => {
const { x, y } = getViewportCenter();
addNode({ id: 'split-'+Date.now(), type: 'split', data: {}, position: {x: x-200, y:y-100} });
};
const addProcessorNode = (progLang) => {
const { x, y } = getViewportCenter();
let code = "";
if (progLang === 'python')
code = "def process(response):\n  return response.text;";
else if (progLang === 'javascript')
code = "function process(response) {\n  return response.text;\n}";
addNode({ id: 'process-'+Date.now(), type: 'processor', data: { language: progLang, code: code }, position: {x: x-200, y:y-100} });
};

const onClickExamples = () => {
if (examplesModal && examplesModal.current)
@@ -762,16 +772,16 @@ const App = () => {
<Menu.Divider />
<Menu.Label>Evaluators</Menu.Label>
<MenuTooltip label="Evaluate responses with a simple check (no coding required).">
<Menu.Item onClick={addSimpleEvalNode} icon={<IconRuler2 size="16px" />}> Simple Evaluator Node </Menu.Item>
<Menu.Item onClick={addSimpleEvalNode} icon={<IconRuler2 size="16px" />}> Simple Evaluator </Menu.Item>
</MenuTooltip>
<MenuTooltip label="Evaluate responses by writing JavaScript code.">
<Menu.Item onClick={() => addEvalNode('javascript')} icon={<IconTerminal size="16px" />}> JavaScript Evaluator Node </Menu.Item>
<Menu.Item onClick={() => addEvalNode('javascript')} icon={<IconTerminal size="16px" />}> JavaScript Evaluator </Menu.Item>
</MenuTooltip>
{IS_RUNNING_LOCALLY ? (<MenuTooltip label="Evaluate responses by writing Python code.">
<Menu.Item onClick={() => addEvalNode('python')} icon={<IconTerminal size="16px" />}> Python Evaluator Node </Menu.Item>
<Menu.Item onClick={() => addEvalNode('python')} icon={<IconTerminal size="16px" />}> Python Evaluator </Menu.Item>
</MenuTooltip>): <></>}
<MenuTooltip label="Evaluate responses with an LLM like GPT-4.">
<Menu.Item onClick={addLLMEvalNode} icon={<IconRobot size="16px" />}> LLM Scorer Node</Menu.Item>
<Menu.Item onClick={addLLMEvalNode} icon={<IconRobot size="16px" />}> LLM Scorer </Menu.Item>
</MenuTooltip>
<Menu.Divider />
<Menu.Label>Visualizers</Menu.Label>
@@ -783,6 +793,12 @@ const App = () => {
</MenuTooltip>
<Menu.Divider />
<Menu.Label>Processors</Menu.Label>
<MenuTooltip label="Transform responses by mapping a JavaScript function over them.">
<Menu.Item onClick={() => addProcessorNode('javascript')} icon={<IconTerminal size='14pt' />}> JavaScript Processor </Menu.Item>
</MenuTooltip>
{IS_RUNNING_LOCALLY ? <MenuTooltip label="Transform responses by mapping a Python function over them.">
<Menu.Item onClick={() => addProcessorNode('python')} icon={<IconTerminal size='14pt' />}> Python Processor </Menu.Item>
</MenuTooltip>: <></>}
<MenuTooltip label="Concatenate responses or input data together before passing into later nodes, within or across variables and LLMs.">
<Menu.Item onClick={addJoinNode} icon={<IconArrowMerge size='14pt' />}> Join Node </Menu.Item>
</MenuTooltip>
@ -1,4 +1,4 @@
|
||||
import React, { useState, useRef, useCallback, useEffect } from 'react';
|
||||
import React, { useState, useRef, useCallback, useEffect, useMemo } from 'react';
|
||||
import { Handle } from 'reactflow';
|
||||
import { Button, Code, Modal, Tooltip, Box, Text } from '@mantine/core';
|
||||
import { Prism } from '@mantine/prism';
|
||||
@ -16,8 +16,9 @@ import "ace-builds/src-noconflict/mode-javascript";
|
||||
import "ace-builds/src-noconflict/theme-xcode";
|
||||
import "ace-builds/src-noconflict/ext-language_tools";
|
||||
import fetch_from_backend from './fetch_from_backend';
|
||||
import { APP_IS_RUNNING_LOCALLY } from './backend/utils';
|
||||
import { APP_IS_RUNNING_LOCALLY, stripLLMDetailsFromResponses, toStandardResponseFormat } from './backend/utils';
|
||||
import InspectFooter from './InspectFooter';
|
||||
import { escapeBraces } from './backend/template';
|
||||
|
||||
// Whether we are running on localhost or not, and hence whether
|
||||
// we have access to the Flask backend for, e.g., Python code evaluation.
|
||||
@ -53,6 +54,7 @@ class ResponseInfo:
|
||||
...
|
||||
`;
|
||||
|
||||
// Code evaluator examples for info modal
|
||||
const _info_example_py = `
|
||||
def evaluate(response):
|
||||
# Return the length of the response (num of characters)
|
||||
@@ -78,9 +80,48 @@ function evaluate(response) {
return ... // for instance, true or false
}`;

const EvaluatorNode = ({ data, id }) => {
// Code processor examples for info modal
const _info_proc_example_py = `
def process(response):
# Return the first 12 characters
return response.text[:12]
`;
const _info_proc_example_js = `
function process(response) {
// Return the first 12 characters
return response.text.slice(0, 12);
}`;
const _info_proc_example_var_py = `
def process(response):
# Find the index of the substring "ANSWER:"
answer_index = response.text.find("ANSWER:")

# If "ANSWER:" is in the text, return everything after it
if answer_index != -1:
return response.text[answer_index + len("ANSWER:"):]
else: # return error message
return "NOT FOUND"
`;
const _info_proc_example_var_js = `
function process(response) {
// Find the index of the substring "ANSWER:"
const answerIndex = response.text.indexOf("ANSWER:");

// If "ANSWER:" is in the text, return everything after it
if (answerIndex !== -1)
return response.text.substring(answerIndex + "ANSWER:".length);
else // return error message
return "NOT FOUND";
}`;
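As a quick sanity check of the "first 12 characters" processor example above, here is the same logic in TypeScript applied to an arbitrary input:

function process(response: { text: string }): string {
  // Same logic as the example processor above: keep only the first 12 characters
  return response.text.slice(0, 12);
}
console.log(process({ text: 'The quick brown fox jumps over the lazy dog.' }));  // -> 'The quick br'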
/**
* The Code Evaluator class supports users in writing JavaScript and Python functions that map across LLM responses.
* It has two modes: evaluator and processor mode. Evaluators annotate responses with scores; processors transform response objects themselves.
*/
const CodeEvaluatorNode = ({ data, id, type: node_type }) => {
|
||||
const inputEdgesForNode = useStore((state) => state.inputEdgesForNode);
|
||||
const pullInputData = useStore((state) => state.pullInputData);
|
||||
const pingOutputNodes = useStore((state) => state.pingOutputNodes);
|
||||
const setDataPropsForNode = useStore((state) => state.setDataPropsForNode);
|
||||
const [status, setStatus] = useState('none');
|
||||
@ -129,7 +170,7 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
}).then(function(json) {
|
||||
if (json.responses && json.responses.length > 0) {
|
||||
// Store responses and set status to green checkmark
|
||||
setLastResponses(json.responses);
|
||||
setLastResponses(stripLLMDetailsFromResponses(json.responses));
|
||||
setStatus('ready');
|
||||
}
|
||||
});
|
||||
@ -164,18 +205,22 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
instead. If you'd like to run the Python evaluator, consider installing ChainForge locally.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the ids from the connected input nodes:
|
||||
const input_node_ids = inputEdgesForNode(id).map(e => e.source);
|
||||
if (input_node_ids.length === 0) {
|
||||
console.warn("No inputs for evaluator node.");
|
||||
return;
|
||||
|
||||
// Pull input data
|
||||
let pulled_inputs = pullInputData(["responseBatch"], id);
|
||||
if (!pulled_inputs || !pulled_inputs["responseBatch"]) {
|
||||
console.warn(`No inputs for code ${node_type} node.`);
|
||||
return;
|
||||
}
|
||||
// Convert to standard response format (StandardLLMResponseFormat)
|
||||
pulled_inputs = pulled_inputs["responseBatch"].map(toStandardResponseFormat);
|
||||
|
||||
// Double-check that the code includes an 'evaluate' function:
|
||||
const find_evalfunc_regex = progLang === 'python' ? /def\s+evaluate\s*(.*):/ : /function\s+evaluate\s*(.*)/;
|
||||
if (codeText.search(find_evalfunc_regex) === -1) {
|
||||
const err_msg = `Could not find required function 'evaluate'. Make sure you have defined an 'evaluate' function.`;
|
||||
const find_func_regex = node_type === 'evaluator' ? (progLang === 'python' ? /def\s+evaluate\s*(.*):/ : /function\s+evaluate\s*(.*)/)
|
||||
: (progLang === 'python' ? /def\s+process\s*(.*):/ : /function\s+process\s*(.*)/);
|
||||
if (codeText.search(find_func_regex) === -1) {
|
||||
const req_func_name = node_type === 'evaluator' ? 'evaluate' : 'process';
|
||||
const err_msg = `Could not find required function '${req_func_name}'. Make sure you have defined an '${req_func_name}' function.`;
|
||||
setStatus('error');
|
||||
alertModal.current.trigger(err_msg);
|
||||
return;
|
||||
@ -190,6 +235,8 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
alertModal.current.trigger(err_msg);
|
||||
};
|
||||
|
||||
// const _llmItemsCurrState = getLLMsInPulledInputData(pulled_data);
|
||||
|
||||
// Get all the Python script nodes, and get all the folder paths
|
||||
// NOTE: Python only!
|
||||
let script_paths = [];
|
||||
@ -204,8 +251,9 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
fetch_from_backend(execute_route, {
|
||||
id: id,
|
||||
code: codeTextOnRun,
|
||||
responses: input_node_ids,
|
||||
responses: pulled_inputs,
|
||||
scope: 'response',
|
||||
process_type: node_type,
|
||||
script_paths: script_paths,
|
||||
}).then(function(json) {
|
||||
// Store any Python print output
|
||||
@ -226,12 +274,27 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
|
||||
// Ping any vis + inspect nodes attached to this node to refresh their contents:
|
||||
pingOutputNodes(id);
|
||||
|
||||
console.log(json.responses);
|
||||
setLastResponses(json.responses);
|
||||
setLastResponses(stripLLMDetailsFromResponses(json.responses));
|
||||
setCodeTextOnLastRun(codeTextOnRun);
|
||||
setLastRunSuccess(true);
|
||||
|
||||
setDataPropsForNode(id, {fields: json.responses.map(
|
||||
resp_obj => resp_obj['responses'].map(r => {
|
||||
// Carry over the response text, prompt, prompt fill history (vars), and llm data
|
||||
let o = { text: escapeBraces(r),
|
||||
prompt: resp_obj['prompt'],
|
||||
fill_history: resp_obj['vars'],
|
||||
metavars: resp_obj['metavars'] || {},
|
||||
llm: resp_obj['llm'] };
|
||||
|
||||
// Carry over any chat history
|
||||
if (resp_obj['chat_history'])
|
||||
o.chat_history = resp_obj['chat_history'];
|
||||
|
||||
return o;
|
||||
})).flat()
|
||||
});
|
||||
|
||||
if (status !== 'ready')
|
||||
setUninspectedResponses(true);
|
||||
|
||||
@ -250,10 +313,67 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
}
|
||||
}, [inspectModal, lastResponses]);
|
||||
|
||||
const default_header = (progLang === 'python') ?
|
||||
'Python Evaluator Node'
|
||||
: 'JavaScript Evaluator Node';
|
||||
/* Memoized variables for displaying the UI that depend on the node type (evaluator or processor) and the programming language. */
|
||||
const default_header = useMemo(() => {
|
||||
const capitalized_type = node_type.charAt(0).toUpperCase() + node_type.slice(1);
|
||||
if (progLang === 'python')
|
||||
return `Python ${capitalized_type} Node`;
|
||||
else
|
||||
return `JavaScript ${capitalized_type} Node`;
|
||||
}, [progLang, node_type]);
|
||||
const node_header = data.title || default_header;
|
||||
const run_tooltip = useMemo(() => `Run ${node_type} over inputs`, [node_type]);
|
||||
const code_instruct_header = useMemo(() => {
|
||||
if (node_type === 'evaluator')
|
||||
return <div className="code-mirror-field-header">Define an <Code>evaluate</Code> func to map over each response:</div>;
|
||||
else
|
||||
return <div className="code-mirror-field-header">Define a <Code>process</Code> func to map over each response:</div>;
|
||||
}, [node_type]);
|
||||
const code_info_modal = useMemo(() => {
|
||||
if (node_type === 'evaluator')
|
||||
return <Box m='lg' mt='xl'>
|
||||
<Text mb='sm'>To use a {default_header}, write a function <Code>evaluate</Code> that takes a single argument of class <Code>ResponseInfo</Code>.
|
||||
The function should return a 'score' for that response, which usually is a number or a boolean value (strings as categoricals are supported, but experimental).</Text>
|
||||
<Text mt='sm' mb='sm'>
|
||||
For instance, here is an evaluator that returns the length of a response:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_example_py : _info_example_js}
|
||||
</Prism>
|
||||
<Text mt='md' mb='sm'>This function gets the text of the response via <Code>response.text</Code>, then calculates its length in characters. The full <Code>ResponseInfo</Code> class has the following properties and methods:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_codeblock_py : _info_codeblock_js}
|
||||
</Prism>
|
||||
<Text mt='md' mb='sm'>For instance, say you have a prompt template <Code>What is the capital of {country}?</Code> on a Prompt Node.
|
||||
You want to get the input variable 'country', which filled the prompt that led to the current response. You can use <Code>response.var</Code>:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_example_var_py : _info_example_var_js}
|
||||
</Prism>
|
||||
<Text mt='md'>Note that you are allowed to define variables outside of the function, or define more functions, as long as a function called <Code>evaluate</Code> is defined.
|
||||
For more information on what's possible, see the <a href="https://chainforge.ai/docs/" target='_blank'>documentation</a> or load some Example Flows.</Text>
|
||||
</Box>;
|
||||
else
|
||||
return <Box m='lg' mt='xl'>
|
||||
<Text mb='sm'>To use a {default_header}, write a function <Code>process</Code> that takes a single argument of class <Code>ResponseInfo</Code>.
|
||||
The function should return the <strong>transformed response text</strong>, as a string or number.</Text>
|
||||
<Text mt='sm' mb='sm'>
|
||||
For instance, here is a processor that simply returns the first 12 characters of the response:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_proc_example_py : _info_proc_example_js}
|
||||
</Prism>
|
||||
<Text mt='md' mb='sm'>This function gets the text of the response via <Code>response.text</Code>, then slices it until the 12th-indexed character. The full <Code>ResponseInfo</Code> class has the following properties and methods:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_codeblock_py : _info_codeblock_js}
|
||||
</Prism>
|
||||
<Text mt='md' mb='sm'>For another example, say you have a prompt that requests the LLM output in a consistent format, with "ANSWER:" at the end like Chain-of-Thought.
|
||||
You want to get just the part after 'ANSWER:' Here's how you can do this:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_proc_example_var_py : _info_proc_example_var_js}
|
||||
</Prism>
|
||||
<Text mt='md'>Note that you are allowed to define variables outside of the function, or define more functions, as long as a function called <Code>process</Code> is defined.
|
||||
For more information on what's possible, see the <a href="https://chainforge.ai/docs/" target='_blank'>documentation</a>. Finally, note that currently
|
||||
you cannot change the response metadata itself (i.e., var, meta dictionaries); if you have a use case for that feature, raise an Issue on our GitHub.</Text>
|
||||
</Box>;
|
||||
}, [progLang, node_type])
|
||||
|
||||
return (
|
||||
<BaseNode classNames="evaluator-node" nodeId={id}>
|
||||
@ -264,7 +384,7 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
status={status}
|
||||
alertModal={alertModal}
|
||||
handleRunClick={handleRunClick}
|
||||
runButtonTooltip="Run evaluator over inputs"
|
||||
runButtonTooltip={run_tooltip}
|
||||
customButtons={[
|
||||
<Tooltip label='Info' key="eval-info">
|
||||
<button onClick={openInfoModal} className='custom-button' style={{border:'none'}}>
|
||||
@ -274,26 +394,7 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
/>
|
||||
<LLMResponseInspectorModal ref={inspectModal} jsonResponses={lastResponses} />
|
||||
<Modal title={default_header} size='60%' opened={infoModalOpened} onClose={closeInfoModal} styles={{header: {backgroundColor: '#FFD700'}, root: {position: 'relative', left: '-5%'}}}>
|
||||
<Box m='lg' mt='xl'>
|
||||
<Text mb='sm'>To use a {default_header}, write a function <Code>evaluate</Code> that takes a single argument of class <Code>ResponseInfo</Code>.
|
||||
The function should return a 'score' for that response, which usually is a number or a boolean value (strings as categoricals are supported, but experimental).</Text>
|
||||
<Text mt='sm' mb='sm'>
|
||||
For instance, here is an evaluator that returns the length of a response:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_example_py : _info_example_js}
|
||||
</Prism>
|
||||
<Text mt='md' mb='sm'>This function gets the text of the response via <Code>response.text</Code>, then calculates its length in characters. The full <Code>ResponseInfo</Code> class has the following properties and methods:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_codeblock_py : _info_codeblock_js}
|
||||
</Prism>
|
||||
<Text mt='md' mb='sm'>For instance, say you have a prompt template <Code>What is the capital of {country}?</Code> on a Prompt Node.
|
||||
You want to get the input variable 'country', which filled the prompt that led to the current response. You can use<Code>response.var</Code>:</Text>
|
||||
<Prism language={progLang === 'python' ? 'py' : 'ts'}>
|
||||
{progLang === 'python' ? _info_example_var_py : _info_example_var_js}
|
||||
</Prism>
|
||||
<Text mt='md'>Note that you are allowed to define variables outside of the function, or define more functions, as long as a function called <Code>evaluate</Code> is defined.
|
||||
For more information on what's possible, see the <a href="https://github.com/ianarawjo/ChainForge/blob/main/GUIDE.md#python-evaluator-node" target='_blank'>documentation</a> or load some Example Flows.</Text>
|
||||
</Box>
|
||||
{code_info_modal}
|
||||
</Modal>
|
||||
<iframe style={{display: 'none'}} id={`${id}-iframe`}></iframe>
|
||||
<Handle
|
||||
@ -311,14 +412,7 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
style={{ top: '50%' }}
|
||||
/>
|
||||
<div className="core-mirror-field">
|
||||
<div className="code-mirror-field-header">Define an <Code>evaluate</Code> func to map over each response:
|
||||
{/* <select name="mapscope" id="mapscope" onChange={handleOnMapScopeSelect}>
|
||||
<option value="response">response</option>
|
||||
<option value="batch">batch of responses</option>
|
||||
</select> */}
|
||||
</div>
|
||||
|
||||
{/* <span className="code-style">response</span>: */}
|
||||
{code_instruct_header}
|
||||
<div className="ace-editor-container nodrag">
|
||||
<AceEditor
|
||||
mode={progLang}
|
||||
@ -358,4 +452,4 @@ const EvaluatorNode = ({ data, id }) => {
|
||||
);
|
||||
};
|
||||
|
||||
export default EvaluatorNode;
|
||||
export default CodeEvaluatorNode;
|
chainforge/react-server/src/PromptNode.js (vendored, 13 lines changed)
@@ -13,7 +13,7 @@ import fetch_from_backend from './fetch_from_backend';
import { escapeBraces } from './backend/template';
import ChatHistoryView from './ChatHistoryView';
import InspectFooter from './InspectFooter';
import { countNumLLMs, setsAreEqual } from './backend/utils';
import { countNumLLMs, setsAreEqual, getLLMsInPulledInputData } from './backend/utils';

const getUniqueLLMMetavarKey = (responses) => {
const metakeys = new Set(responses.map(resp_obj => Object.keys(resp_obj.metavars)).flat());
@@ -32,17 +32,6 @@ const bucketChatHistoryInfosByLLM = (chat_hist_infos) => {
});
return chats_by_llm;
}
const getLLMsInPulledInputData = (pulled_data) => {
let found_llms = {};
Object.values(pulled_data).filter(_vs => {
let vs = Array.isArray(_vs) ? _vs : [_vs];
vs.forEach(v => {
if (v?.llm !== undefined && !(v.llm.key in found_llms))
found_llms[v.llm.key] = v.llm;
});
});
return Object.values(found_llms);
};

class PromptInfo {
prompt; // string
chainforge/react-server/src/SimpleEvalNode.js (vendored, 109 lines changed)
@ -8,6 +8,7 @@ import InspectFooter from "./InspectFooter";
|
||||
import LLMResponseInspectorModal from "./LLMResponseInspectorModal";
|
||||
import useStore from "./store";
|
||||
import fetch_from_backend from "./fetch_from_backend";
|
||||
import { stripLLMDetailsFromResponses, toStandardResponseFormat } from "./backend/utils";
|
||||
|
||||
const createJSEvalCodeFor = (responseFormat, operation, value, valueType) => {
|
||||
let responseObj = 'r.text'
|
||||
@ -52,7 +53,7 @@ const createJSEvalCodeFor = (responseFormat, operation, value, valueType) => {
|
||||
const SimpleEvalNode = ({data, id}) => {
|
||||
|
||||
const setDataPropsForNode = useStore((state) => state.setDataPropsForNode);
|
||||
const inputEdgesForNode = useStore((state) => state.inputEdgesForNode);
|
||||
const pullInputData = useStore((state) => state.pullInputData);
|
||||
const pingOutputNodes = useStore((state) => state.pingOutputNodes);
|
||||
const [pastInputs, setPastInputs] = useState([]);
|
||||
|
||||
@ -97,17 +98,25 @@ const SimpleEvalNode = ({data, id}) => {
|
||||
dirtyStatus();
|
||||
}, [lastTextValue, dirtyStatus]);
|
||||
|
||||
const handleRunClick = useCallback(() => {
|
||||
// Get the ids from the connected input nodes:
|
||||
const input_node_ids = inputEdgesForNode(id).map(e => e.source);
|
||||
if (input_node_ids.length === 0) {
|
||||
console.warn("No inputs for simple evaluator node.");
|
||||
return;
|
||||
const handlePullInputs = useCallback(() => {
|
||||
// Pull input data
|
||||
let pulled_inputs = pullInputData(["responseBatch"], id);
|
||||
if (!pulled_inputs || !pulled_inputs["responseBatch"]) {
|
||||
console.warn(`No inputs to the Simple Evaluator node.`);
|
||||
return [];
|
||||
}
|
||||
// Convert to standard response format (StandardLLMResponseFormat)
|
||||
return pulled_inputs["responseBatch"].map(toStandardResponseFormat);
|
||||
}, [pullInputData, id, toStandardResponseFormat]);
|
||||
|
||||
const handleRunClick = useCallback(() => {
|
||||
// Pull inputs to the node
|
||||
let pulled_inputs = handlePullInputs();
|
||||
|
||||
// Set status and created rejection callback
|
||||
setStatus('loading');
|
||||
setLastResponses([]);
|
||||
|
||||
const rejected = (err_msg) => {
|
||||
setStatus('error');
|
||||
alertModal.current.trigger(err_msg);
|
||||
@ -120,31 +129,32 @@ const SimpleEvalNode = ({data, id}) => {
|
||||
|
||||
// Run evaluator in backend
|
||||
fetch_from_backend('executejs', {
|
||||
id: id,
|
||||
code: code,
|
||||
responses: input_node_ids,
|
||||
scope: 'response',
|
||||
id: id,
|
||||
code: code,
|
||||
responses: pulled_inputs,
|
||||
scope: 'response',
|
||||
process_type: 'evaluator'
|
||||
}).then(function(json) {
|
||||
// Check if there's an error; if so, bubble it up to user and exit:
|
||||
if (!json || json.error) {
|
||||
setLastRunSuccess(false);
|
||||
rejected(json ? json.error : 'Unknown error encountered when requesting evaluations: empty response returned.');
|
||||
return;
|
||||
}
|
||||
|
||||
// Ping any vis + inspect nodes attached to this node to refresh their contents:
|
||||
pingOutputNodes(id);
|
||||
// Check if there's an error; if so, bubble it up to user and exit:
|
||||
if (!json || json.error) {
|
||||
setLastRunSuccess(false);
|
||||
rejected(json ? json.error : 'Unknown error encountered when requesting evaluations: empty response returned.');
|
||||
return;
|
||||
}
|
||||
|
||||
// Ping any vis + inspect nodes attached to this node to refresh their contents:
|
||||
pingOutputNodes(id);
|
||||
|
||||
console.log(json.responses);
|
||||
setLastResponses(json.responses);
|
||||
setLastRunSuccess(true);
|
||||
console.log(json.responses);
|
||||
setLastResponses(stripLLMDetailsFromResponses(json.responses));
|
||||
setLastRunSuccess(true);
|
||||
|
||||
if (status !== 'ready')
|
||||
setUninspectedResponses(true);
|
||||
|
||||
setStatus('ready');
|
||||
if (status !== 'ready')
|
||||
setUninspectedResponses(true);
|
||||
|
||||
setStatus('ready');
|
||||
}).catch((err) => rejected(err.message));
|
||||
}, [inputEdgesForNode, pingOutputNodes, setStatus, alertModal, status, varValue, varValueType, responseFormat, textValue, valueFieldDisabled]);
|
||||
}, [handlePullInputs, pingOutputNodes, setStatus, alertModal, status, varValue, varValueType, responseFormat, textValue, valueFieldDisabled]);
|
||||
|
||||
const showResponseInspector = useCallback(() => {
|
||||
if (inspectModal && inspectModal.current && lastResponses) {
|
||||
@ -154,31 +164,24 @@ const SimpleEvalNode = ({data, id}) => {
|
||||
}, [inspectModal, lastResponses]);
|
||||
|
||||
const handleOnConnect = useCallback(() => {
|
||||
// Get the ids from the connected input nodes:
|
||||
const input_node_ids = inputEdgesForNode(id).map(e => e.source);
|
||||
|
||||
// Fetch all input responses
|
||||
fetch_from_backend(
|
||||
'grabResponses',
|
||||
{responses: input_node_ids}
|
||||
).then(function(json) {
|
||||
if (json.responses && json.responses.length > 0) {
|
||||
// Find all vars and metavars in responses
|
||||
let varnames = new Set();
|
||||
let metavars = new Set();
|
||||
json.responses.forEach(resp_obj => {
|
||||
Object.keys(resp_obj.vars).forEach(v => varnames.add(v));
|
||||
if (resp_obj.metavars)
|
||||
Object.keys(resp_obj.metavars).forEach(v => metavars.add(v));
|
||||
});
|
||||
const avs = Array.from(varnames);
|
||||
const amvs = Array.from(metavars).filter(v => !(v.startsWith('LLM_')));
|
||||
setAvailableVars(avs);
|
||||
setAvailableMetavars(amvs);
|
||||
setDataPropsForNode(id, { availableVars: avs, availableMetavars: amvs });
|
||||
}
|
||||
});
|
||||
}, [data, id, inputEdgesForNode, setDataPropsForNode]);
|
||||
// Pull inputs to the node
|
||||
let pulled_inputs = handlePullInputs();
|
||||
if (pulled_inputs && pulled_inputs.length > 0) {
|
||||
// Find all vars and metavars in responses
|
||||
let varnames = new Set();
|
||||
let metavars = new Set();
|
||||
pulled_inputs.forEach(resp_obj => {
|
||||
Object.keys(resp_obj.vars).forEach(v => varnames.add(v));
|
||||
if (resp_obj.metavars)
|
||||
Object.keys(resp_obj.metavars).forEach(v => metavars.add(v));
|
||||
});
|
||||
const avs = Array.from(varnames);
|
||||
const amvs = Array.from(metavars).filter(v => !(v.startsWith('LLM_')));
|
||||
setAvailableVars(avs);
|
||||
setAvailableMetavars(amvs);
|
||||
setDataPropsForNode(id, { availableVars: avs, availableMetavars: amvs });
|
||||
}
|
||||
}, [data, id, handlePullInputs, setDataPropsForNode]);
|
||||
|
||||
if (data.input) {
|
||||
// If there's a change in inputs...
|
||||
|
chainforge/react-server/src/VisNode.js (vendored, 13 lines changed)
@@ -150,11 +150,19 @@ const VisNode = ({ data, id }) => {
useEffect(() => {
if (!responses || responses.length === 0 || !multiSelectValue) return;

// Check if there are evaluation results
if (responses.every(r => r?.eval_res === undefined)) {
setPlaceholderText(<p style={{maxWidth: '220px', backgroundColor: '#f0f0aa', padding: '10px', fontSize: '10pt'}}>
To plot evaluation results, you need to run LLM responses through an Evaluator Node or LLM Scorer Node first.
</p>);
return;
}

setStatus('none');

const get_llm = (resp_obj) => {
if (selectedLLMGroup === 'LLM')
return resp_obj.llm;
return typeof resp_obj.llm === "string" ? resp_obj.llm : resp_obj.llm?.name;
else
return resp_obj.metavars[selectedLLMGroup];
};
@@ -204,7 +212,7 @@ const VisNode = ({ data, id }) => {

// Get the type of evaluation results, if present
// (This is assumed to be consistent across response batches)
let typeof_eval_res = 'dtype' in responses[0].eval_res ? responses[0].eval_res['dtype'] : 'Numeric';
let typeof_eval_res = (responses[0].eval_res && 'dtype' in responses[0].eval_res) ? responses[0].eval_res['dtype'] : 'Numeric';

// If categorical type, check if all binary:
if (typeof_eval_res === 'Categorical') {
@@ -244,6 +252,7 @@ const VisNode = ({ data, id }) => {
};

const get_items = (eval_res_obj) => {
if (eval_res_obj === undefined) return [];
if (typeof_eval_res.includes('KeyValue'))
return eval_res_obj.items.map(item => item[metric_axes_labels[0]]);
return eval_res_obj.items;
@@ -3,7 +3,7 @@
*/
import { NativeLLM } from '../models';
import { expect, test } from '@jest/globals';
import { queryLLM, executejs, countQueries, ResponseInfo } from '../backend';
import { queryLLM, executejs, countQueries, ResponseInfo, grabResponses } from '../backend';
import { StandardizedLLMResponse, Dict } from '../typing';
import StorageCache from '../cache';

@@ -69,6 +69,8 @@ test('run evaluate func over responses', async () => {
return response.text.length;
};

const input_resps = await grabResponses(['dummy_response_id']) as StandardizedLLMResponse[];

// const code = `
// function evaluate(response) {
// console.log('hello there!');
@@ -77,7 +79,7 @@ test('run evaluate func over responses', async () => {
// `;

// Execute the code, and map the evaluate function over all responses
const {responses, logs, error} = await executejs('evalid', code, ['dummy_response_id'], 'response');
const {responses, logs, error} = await executejs('evalid', code, input_resps, 'response', 'evaluator');

// There should be no errors
if (error)
@@ -330,38 +330,47 @@ function check_typeof_vals(arr: Array<any>): MetricType {
return val_type;
}

function run_over_responses(eval_func: (resp: ResponseInfo) => any, responses: Array<StandardizedLLMResponse>): Array<StandardizedLLMResponse> {
function run_over_responses(process_func: (resp: ResponseInfo) => any,
responses: StandardizedLLMResponse[],
process_type: 'evaluator' | 'processor'): StandardizedLLMResponse[] {
return responses.map((_resp_obj: StandardizedLLMResponse) => {
// Deep clone the response object
const resp_obj = JSON.parse(JSON.stringify(_resp_obj));
let resp_obj = JSON.parse(JSON.stringify(_resp_obj));

// Map the evaluator func over every individual response text in each response object
// Map the processor func over every individual response text in each response object
const res = resp_obj.responses;
const evals = res.map(
(r: string) => eval_func(new ResponseInfo(r,
resp_obj.prompt,
resp_obj.vars,
resp_obj.metavars || {},
extract_llm_nickname(resp_obj.llm))
));
const processed = res.map((r: string) =>
process_func(new ResponseInfo(r,
resp_obj.prompt,
resp_obj.vars,
resp_obj.metavars || {},
extract_llm_nickname(resp_obj.llm)))
);

// Check the type of evaluation results
// NOTE: We assume this is consistent across all evaluations, but it may not be.
const eval_res_type = check_typeof_vals(evals);
// If type is just a processor
if (process_type === 'processor') {
// Replace response texts in resp_obj with the transformed ones:
resp_obj.responses = processed;

if (eval_res_type === MetricType.Numeric) {
// Store items with summary of mean, median, etc
resp_obj.eval_res = {
items: evals,
} else { // If type is an evaluator
// Check the type of evaluation results
// NOTE: We assume this is consistent across all evaluations, but it may not be.
const eval_res_type = check_typeof_vals(processed);

if (eval_res_type === MetricType.Numeric) {
// Store items with summary of mean, median, etc
resp_obj.eval_res = {
items: processed,
dtype: getEnumName(MetricType, eval_res_type),
};
} else if ([MetricType.Unknown, MetricType.Empty].includes(eval_res_type)) {
throw new Error('Unsupported types found in evaluation results. Only supported types for metrics are: int, float, bool, str.');
} else {
// Categorical, KeyValue, etc, we just store the items:
resp_obj.eval_res = {
items: processed,
dtype: getEnumName(MetricType, eval_res_type),
};
} else if ([MetricType.Unknown, MetricType.Empty].includes(eval_res_type)) {
throw new Error('Unsupported types found in evaluation results. Only supported types for metrics are: int, float, bool, str.');
} else {
// Categorical, KeyValue, etc, we just store the items:
resp_obj.eval_res = {
items: evals,
dtype: getEnumName(MetricType, eval_res_type),
}
}
}
@@ -773,93 +782,79 @@ export async function queryLLM(id: string,
*
* @param id a unique ID to refer to this information. Used when cache'ing evaluation results.
* @param code the code to evaluate. Must include an 'evaluate()' function that takes a 'response' of type ResponseInfo. Alternatively, can be the evaluate function itself.
* @param response_ids the cache'd response to run on, which must be a unique ID or list of unique IDs of cache'd data
* @param scope the scope of responses to run on --a single response, or all across each batch. (If batch, evaluate() func has access to 'responses'.)
* @param responses the list of StandardizedLLMResponse objects to run on (no longer a unique ID or list of unique IDs of cache'd data)
* @param scope the scope of responses to run on --a single response, or all across each batch. (If batch, evaluate() func has access to 'responses'.) NOTE: Currently this feature is disabled.
* @param process_type the type of processing to perform. Evaluators only 'score'/annotate responses with an 'eval_res' key. Processors change responses (e.g. text).
*/
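A greatly simplified, runnable sketch of the flow executejs implements below: evaluate the user's code, pick the required entry point based on process_type, and map it over the response texts. (There is no iframe sandbox or console hijacking here; new Function stands in for the iframe eval purely for illustration.)

function runUserCode(code: string, processType: 'evaluator' | 'processor', texts: string[]) {
  const reqFuncName = processType === 'evaluator' ? 'evaluate' : 'process';
  // The real implementation evals the code inside a hidden iframe; this only checks the entry point exists.
  const fn = new Function(`${code}; return typeof ${reqFuncName} === 'function' ? ${reqFuncName} : undefined;`)();
  if (fn === undefined) throw new Error(`${reqFuncName}() function is undefined.`);
  return texts.map(t => fn({ text: t }));
}
console.log(runUserCode("function process(response) { return response.text.slice(0, 12); }",
                        'processor', ['The quick brown fox jumps over the lazy dog.']));  // [ 'The quick br' ]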
export async function executejs(id: string,
code: string | ((rinfo: ResponseInfo) => any),
response_ids: string | string[],
scope: 'response' | 'batch'): Promise<Dict> {
// Check format of response_ids
if (!Array.isArray(response_ids))
response_ids = [ response_ids ];
response_ids = response_ids as Array<string>;
responses: StandardizedLLMResponse[],
scope: 'response' | 'batch',
process_type: 'evaluator' | 'processor'): Promise<Dict> {
const req_func_name = (!process_type || process_type === 'evaluator') ? 'evaluate' : 'process';
|
||||
// Instantiate the evaluator function by eval'ing the passed code
|
||||
// DANGER DANGER!!
|
||||
let iframe: HTMLElement | undefined;
|
||||
let process_func: any;
|
||||
if (typeof code === 'string') {
|
||||
try {
|
||||
/*
|
||||
To run Javascript code in a pseudo-'sandbox' environment, we
|
||||
can use an iframe and run eval() inside the iframe, instead of the current environment.
|
||||
This is slightly safer than using eval() directly, doesn't clog our namespace, and keeps
|
||||
multiple Evaluate node execution environments separate.
|
||||
|
||||
The Evaluate node in the front-end has a hidden iframe with the following id.
|
||||
We need to get this iframe element.
|
||||
*/
|
||||
iframe = document.getElementById(`${id}-iframe`);
|
||||
if (!iframe)
|
||||
throw new Error("Could not find iframe sandbox for evaluator node.");
|
||||
/*
|
||||
To run Javascript code in a pseudo-'sandbox' environment, we
|
||||
can use an iframe and run eval() inside the iframe, instead of the current environment.
|
||||
This is slightly safer than using eval() directly, doesn't clog our namespace, and keeps
|
||||
multiple Evaluate node execution environments separate.
|
||||
|
||||
The Evaluate node in the front-end has a hidden iframe with the following id.
|
||||
We need to get this iframe element.
|
||||
*/
|
||||
iframe = document.getElementById(`${id}-iframe`);
|
||||
if (!iframe)
|
||||
throw new Error("Could not find iframe sandbox for evaluator node.");
|
||||
|
||||
// Now run eval() on the 'window' of the iframe:
|
||||
// @ts-ignore
|
||||
iframe.contentWindow.eval(code);
|
||||
// Now run eval() on the 'window' of the iframe:
|
||||
// @ts-ignore
|
||||
iframe.contentWindow.eval(code);
|
||||
|
||||
// Now check that there is an 'evaluate' method in the iframe's scope.
|
||||
// NOTE: We need to tell Typescript to ignore this, since it's a dynamic type check.
|
||||
// @ts-ignore
|
||||
process_func = (!process_type || process_type === 'evaluator') ? iframe.contentWindow.evaluate : iframe.contentWindow.process;
|
||||
if (process_func === undefined)
|
||||
throw new Error(`${req_func_name}() function is undefined.`);
|
||||
|
||||
// Now check that there is an 'evaluate' method in the iframe's scope.
|
||||
// NOTE: We need to tell Typescript to ignore this, since it's a dynamic type check.
|
||||
// @ts-ignore
|
||||
if (iframe.contentWindow.evaluate === undefined) {
|
||||
throw new Error('evaluate() function is undefined.');
|
||||
}
|
||||
} catch (err) {
|
||||
return {'error': `Could not compile evaluator code. Error message:\n${err.message}`};
|
||||
return {'error': `Could not compile code. Error message:\n${err.message}`};
|
||||
}
|
||||
}
|
||||
|
||||
// Load all responses with the given ID:
|
||||
let all_evald_responses: StandardizedLLMResponse[] = [];
|
||||
let all_logs: string[] = [];
|
||||
for (let i = 0; i < response_ids.length; i++) {
|
||||
const cache_id = response_ids[i];
|
||||
const fname = `${cache_id}.json`;
|
||||
if (!StorageCache.has(fname))
|
||||
return {error: `Did not find cache file for id ${cache_id}`, logs: all_logs};
|
||||
let processed_resps: StandardizedLLMResponse[];
|
||||
try {
|
||||
// Intercept any calls to console.log, .warn, or .error, so we can store the calls
|
||||
// and print them in the 'output' footer of the Evaluator Node:
|
||||
// @ts-ignore
|
||||
HIJACK_CONSOLE_LOGGING(id, iframe.contentWindow);
|
||||
|
||||
// Load the raw responses from the cache
|
||||
const responses = load_cache_responses(fname);
|
||||
if (responses.length === 0)
|
||||
continue;
|
||||
// Run the user-defined 'evaluate' function over the responses:
|
||||
// NOTE: 'evaluate' here was defined dynamically from 'eval' above. We've already checked that it exists.
|
||||
// @ts-ignore
|
||||
processed_resps = run_over_responses((iframe ? process_func : code), responses, process_type);
|
||||
|
||||
let evald_responses: StandardizedLLMResponse[];
|
||||
try {
|
||||
// Intercept any calls to console.log, .warn, or .error, so we can store the calls
|
||||
// and print them in the 'output' footer of the Evaluator Node:
|
||||
// @ts-ignore
|
||||
HIJACK_CONSOLE_LOGGING(id, iframe.contentWindow);
|
||||
|
||||
// Run the user-defined 'evaluate' function over the responses:
|
||||
// NOTE: 'evaluate' here was defined dynamically from 'eval' above. We've already checked that it exists.
|
||||
// @ts-ignore
|
||||
evald_responses = run_over_responses((iframe ? iframe.contentWindow.evaluate : code), responses, scope);
|
||||
|
||||
// Revert the console.log, .warn, .error back to browser default:
|
||||
// @ts-ignore
|
||||
all_logs = all_logs.concat(REVERT_CONSOLE_LOGGING(id, iframe.contentWindow));
|
||||
} catch (err) {
|
||||
// @ts-ignore
|
||||
all_logs = all_logs.concat(REVERT_CONSOLE_LOGGING(id, iframe.contentWindow));
|
||||
return { error: `Error encountered while trying to run "evaluate" method:\n${err.message}`, logs: all_logs };
|
||||
}
|
||||
|
||||
all_evald_responses = all_evald_responses.concat(evald_responses);
|
||||
// Revert the console.log, .warn, .error back to browser default:
|
||||
// @ts-ignore
|
||||
all_logs = all_logs.concat(REVERT_CONSOLE_LOGGING(id, iframe.contentWindow));
|
||||
} catch (err) {
|
||||
// @ts-ignore
|
||||
all_logs = all_logs.concat(REVERT_CONSOLE_LOGGING(id, iframe.contentWindow));
|
||||
return { error: `Error encountered while trying to run "evaluate" method:\n${err.message}`, logs: all_logs };
|
||||
}
|
||||
|
||||
// Store the evaluated responses in a new cache json:
|
||||
StorageCache.store(`${id}.json`, all_evald_responses);
|
||||
StorageCache.store(`${id}.json`, processed_resps);
|
||||
|
||||
return {responses: all_evald_responses, logs: all_logs};
|
||||
return {responses: processed_resps, logs: all_logs};
|
||||
}
|
||||
|
||||
/**
|
||||
@ -872,28 +867,19 @@ export async function executejs(id: string,
|
||||
* @param id a unique ID to refer to this information. Used when cache'ing evaluation results.
|
||||
* @param code the code to evaluate. Must include an 'evaluate()' function that takes a 'response' of type ResponseInfo. Alternatively, can be the evaluate function itself.
|
||||
* @param response_ids the cache'd response to run on, which must be a unique ID or list of unique IDs of cache'd data
|
||||
* @param scope the scope of responses to run on --a single response, or all across each batch. (If batch, evaluate() func has access to 'responses'.)
|
||||
* @param scope the scope of responses to run on --a single response, or all across each batch. (If batch, evaluate() func has access to 'responses'.) NOTE: Currently disabled.
|
||||
* @param process_type the type of processing to perform. Evaluators only 'score'/annotate responses with an 'eval_res' key. Processors change responses (e.g. text).
|
||||
*/
|
||||
export async function executepy(id: string,
|
||||
code: string | ((rinfo: ResponseInfo) => any),
|
||||
response_ids: string | string[],
|
||||
responses: StandardizedLLMResponse[],
|
||||
scope: 'response' | 'batch',
|
||||
process_type: 'evaluator' | 'processor',
|
||||
script_paths?: string[]): Promise<Dict> {
|
||||
if (!APP_IS_RUNNING_LOCALLY()) {
|
||||
// We can't execute Python if we're not running the local Flask server. Error out:
|
||||
throw new Error("Cannot evaluate Python code: ChainForge does not appear to be running on localhost.")
|
||||
}
|
||||
|
||||
// Check format of response_ids
|
||||
if (!Array.isArray(response_ids))
|
||||
response_ids = [ response_ids ];
|
||||
response_ids = response_ids as Array<string>;
|
||||
|
||||
// Load cache'd responses for all response_ids:
|
||||
const {responses, error} = await grabResponses(response_ids);
|
||||
|
||||
if (error !== undefined)
|
||||
throw new Error(error);
|
||||
|
||||
// All responses loaded; call our Python server to execute the evaluation code across all responses:
|
||||
const flask_response = await call_flask_backend('executepy', {
|
||||
@ -901,6 +887,7 @@ export async function executepy(id: string,
|
||||
code: code,
|
||||
responses: responses,
|
||||
scope: scope,
|
||||
process_type: process_type,
|
||||
script_paths: script_paths,
|
||||
}).catch(err => {
|
||||
throw new Error(err.message);
|
||||
|
@@ -934,4 +934,34 @@ export const dict_excluding_key = (d, key) => {
const copy_d = {...d};
delete copy_d[key];
return copy_d;
};

export const getLLMsInPulledInputData = (pulled_data: Dict) => {
let found_llms = {};
Object.values(pulled_data).filter(_vs => {
let vs = Array.isArray(_vs) ? _vs : [_vs];
vs.forEach(v => {
if (v?.llm !== undefined && !(v.llm.key in found_llms))
found_llms[v.llm.key] = v.llm;
});
});
return Object.values(found_llms);
};

export const stripLLMDetailsFromResponses = (resps) => resps.map(r => ({...r, llm: (typeof r?.llm === 'string' ? r?.llm : (r?.llm?.name ?? 'undefined'))}));
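stripLLMDetailsFromResponses flattens the llm field to a plain display name before a node stores responses for inspection; a quick illustration (the helper is re-declared here only so the snippet runs standalone, and the llm values are invented):

const stripLLMDetailsFromResponses = (resps: any[]) =>
  resps.map(r => ({ ...r, llm: (typeof r?.llm === 'string' ? r?.llm : (r?.llm?.name ?? 'undefined')) }));

console.log(stripLLMDetailsFromResponses([
  { responses: ['Paris.'], llm: { key: 'a1b2', name: 'GPT3.5', temp: 1.0 } },  // object -> 'GPT3.5'
  { responses: ['Paris!'], llm: 'GPT4' },                                      // string is kept as-is
]));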
export const toStandardResponseFormat = (r) => {
let resp_obj: Dict = {
vars: r?.fill_history ?? {},
metavars: r?.metavars ?? {},
llm: r?.llm ?? undefined,
prompt: r?.prompt ?? "",
responses: [typeof r === 'string' ? r : r?.text],
tokens: r?.raw_response?.usage ?? {},
};
if ('eval_res' in r)
resp_obj.eval_res = r.eval_res;
if ('chat_history' in r)
resp_obj.chat_history = r.chat_history;
return resp_obj;
};
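And an example of the mapping toStandardResponseFormat performs on a single pulled input field (shapes follow the function above; the concrete strings are invented):

const pulledField = {      // as produced by an upstream node's output field
  text: 'The capital of France is Paris.',
  prompt: 'What is the capital of France?',
  fill_history: { country: 'France' },
  metavars: { LLM_0: 'GPT3.5' },
  llm: 'GPT3.5',
};
const standardized = {     // what toStandardResponseFormat(pulledField) returns
  vars: { country: 'France' },                       // fill_history becomes vars
  metavars: { LLM_0: 'GPT3.5' },
  llm: 'GPT3.5',
  prompt: 'What is the capital of France?',
  responses: ['The capital of France is Paris.'],    // text becomes a one-element responses array
  tokens: {},                                        // raw_response.usage, or {} when absent
};
console.log(pulledField, standardized);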
@@ -17,9 +17,9 @@ async function _route_to_js_backend(route, params) {
case 'queryllm':
return queryLLM(params.id, clone(params.llm), params.n, params.prompt, clone(params.vars), params.chat_histories, params.api_keys, params.no_cache, params.progress_listener, params.cont_only_w_prior_llms);
case 'executejs':
return executejs(params.id, params.code, params.responses, params.scope);
return executejs(params.id, params.code, params.responses, params.scope, params.process_type);
case 'executepy':
return executepy(params.id, params.code, params.responses, params.scope, params.script_paths);
return executepy(params.id, params.code, params.responses, params.scope, params.process_type, params.script_paths);
case 'evalWithLLM':
return evalWithLLM(params.id, params.llm, params.root_prompt, params.responses, params.api_keys, params.progress_listener);
case 'importCache':
chainforge/react-server/src/store.js (vendored, 2 lines changed)
@@ -26,7 +26,7 @@ export const colorPalettes = {
var: varColorPalette,
}

const refreshableOutputNodeTypes = new Set(['evaluator', 'prompt', 'inspect', 'vis', 'llmeval', 'textfields', 'chat', 'simpleval', 'join', 'split']);
const refreshableOutputNodeTypes = new Set(['evaluator', 'processor', 'prompt', 'inspect', 'vis', 'llmeval', 'textfields', 'chat', 'simpleval', 'join', 'split']);

export let initLLMProviders = [
{ name: "GPT3.5", emoji: "🤖", model: "gpt-3.5-turbo", base_model: "gpt-3.5-turbo", temp: 1.0 }, // The base_model designates what settings form will be used, and must be unique.