From b517e1c95fbcf084336cd90e4bfa00dbbabdb38c Mon Sep 17 00:00:00 2001
From: Ian Arawjo
Date: Sun, 24 Mar 2024 22:19:39 -0400
Subject: [PATCH] WIP: Better rate limiting

---
 chainforge/react-server/src/backend/models.ts | 82 +++++++++++++++----
 chainforge/react-server/src/backend/utils.ts  |  3 +
 chainforge/react-server/src/store.tsx         | 14 ++--
 3 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/chainforge/react-server/src/backend/models.ts b/chainforge/react-server/src/backend/models.ts
index d90c89a..581ac80 100644
--- a/chainforge/react-server/src/backend/models.ts
+++ b/chainforge/react-server/src/backend/models.ts
@@ -1,6 +1,8 @@
 /**
  * A list of all model APIs natively supported by ChainForge.
  */
+import Bottleneck from "bottleneck";
+import { UserForcedPrematureExit } from "./errors";
 
 export enum NativeLLM {
   // OpenAI Chat
@@ -151,23 +153,24 @@ export function getProvider(llm: LLM): LLMProvider | undefined {
 # This 'cheap' version of controlling for rate limits is to wait a few seconds between batches of requests being sent off.
 # If a model is missing from below, it means we must send and receive only 1 request at a time (synchronous).
 # The following is only a guideline, and a bit on the conservative side. */
-export const RATE_LIMITS: { [key in LLM]?: [number, number] } = {
-  [NativeLLM.OpenAI_ChatGPT]: [30, 10], // max 30 requests a batch; wait 10 seconds between
-  [NativeLLM.OpenAI_ChatGPT_0301]: [30, 10],
-  [NativeLLM.OpenAI_ChatGPT_0613]: [30, 10],
-  [NativeLLM.OpenAI_ChatGPT_16k]: [30, 10],
-  [NativeLLM.OpenAI_ChatGPT_16k_0613]: [30, 10],
-  [NativeLLM.OpenAI_GPT4]: [8, 2], // max 8 requests every 2 seconds
-  [NativeLLM.OpenAI_GPT4_0314]: [4, 15],
-  [NativeLLM.OpenAI_GPT4_0613]: [4, 15],
-  [NativeLLM.OpenAI_GPT4_32k]: [4, 15],
-  [NativeLLM.OpenAI_GPT4_32k_0314]: [4, 15],
-  [NativeLLM.OpenAI_GPT4_32k_0613]: [4, 15],
-  [NativeLLM.OpenAI_DallE_2]: [2, 10], // Should be 5 images per minute (1 img per every 10 seconds); here, we've been a bit lenient with it.
-  [NativeLLM.OpenAI_DallE_3]: [2, 10], // This differs per tier, see https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-one
-  [NativeLLM.Azure_OpenAI]: [30, 10],
-  [NativeLLM.PaLM2_Text_Bison]: [4, 10], // max 30 requests per minute; so do 4 per batch, 10 seconds between (conservative)
-  [NativeLLM.PaLM2_Chat_Bison]: [4, 10],
+export const RATE_LIMITS: { [key in LLM]?: number } = {
+  [NativeLLM.OpenAI_ChatGPT]: 1000, // max RPM (API requests per minute)
+  [NativeLLM.OpenAI_ChatGPT_0301]: 1000,
+  [NativeLLM.OpenAI_ChatGPT_0613]: 1000,
+  [NativeLLM.OpenAI_ChatGPT_16k]: 1000,
+  [NativeLLM.OpenAI_ChatGPT_16k_0613]: 1000,
+  [NativeLLM.OpenAI_GPT4]: 500,
+  [NativeLLM.OpenAI_GPT4_0314]: 500,
+  [NativeLLM.OpenAI_GPT4_0613]: 500,
+  [NativeLLM.OpenAI_GPT4_32k]: 500,
+  [NativeLLM.OpenAI_GPT4_32k_0314]: 500,
+  [NativeLLM.OpenAI_GPT4_32k_0613]: 500,
+  [NativeLLM.OpenAI_DallE_2]: 10, // Should be 5 images per minute (about 1 image every 12 seconds); here, we've been a bit lenient with it.
+  [NativeLLM.OpenAI_DallE_3]: 10, // This differs per tier, see https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-one
+  [NativeLLM.Azure_OpenAI]: 500, // conservative
+  [NativeLLM.PaLM2_Text_Bison]: 60, // max 60 requests per minute as of Mar 2023
+  [NativeLLM.PaLM2_Chat_Bison]: 60,
+  [NativeLLM.GEMINI_PRO]: 60,
   [NativeLLM.Bedrock_Jurassic_Mid]: [20, 5],
   [NativeLLM.Bedrock_Jurassic_Ultra]: [5, 5],
   [NativeLLM.Bedrock_Titan_Light]: [40, 5],
@@ -184,6 +187,51 @@ export const RATE_LIMITS: { [key in LLM]?: [number, number] } = {
   [NativeLLM.Bedrock_Mistral_Mistral]: [40, 5], // 800 RPM
 };
 
+const DEFAULT_RATE_LIMIT = 100; // RPM for any models not listed above
+
+class RateLimiter {
+  // eslint-disable-next-line no-use-before-define
+  private static instance: RateLimiter;
+  private limiters: Record<string, Bottleneck>;
+
+  private constructor() {
+    // Initialize the singleton instance
+    this.limiters = {};
+  }
+
+  /** Gets the rate limiter. Initializes it if the singleton instance does not yet exist. */
+  public static getInstance(): RateLimiter {
+    if (!RateLimiter.instance) {
+      RateLimiter.instance = new RateLimiter();
+    }
+    return RateLimiter.instance;
+  }
+
+  private getLimiter(model: LLM): Bottleneck {
+    // Find if there's an existing limiter for this model
+    if (!(model in this.limiters)) {
+      // If there isn't, make one:
+      const rpm = RATE_LIMITS[model] ?? DEFAULT_RATE_LIMIT;
+      this.limiters[model] = new Bottleneck({
+        reservoir: rpm, // max requests per minute
+        reservoirRefreshAmount: rpm, // refresh up to max requests every minute
+        reservoirRefreshInterval: 60000, // refresh every minute
+        maxConcurrent: Math.ceil(rpm / 2), // throttle max concurrent requests to half, just in case
+        minTime: 20 }); // space out the requests by 20ms, to be safe
+    }
+    return this.limiters[model];
+  }
+
+  /** Throttles the API call for the given model, using Bottleneck */
+  public static throttle<T>(model: LLM, func: () => PromiseLike<T>, should_cancel?: () => boolean): Promise<T> {
+    // Rate limit per model, and abort if the API request takes 3 minutes or more.
+    return this.getInstance().getLimiter(model).schedule({expiration: 180000}, () => {
+      if (should_cancel && should_cancel()) throw new UserForcedPrematureExit();
+      return func();
+    });
+  }
+}
+
 /** Equivalent to a Python enum's .name property */
 export function getEnumName(
   enumObject: any,
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index b5717fd..cb5d2df 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -363,6 +363,9 @@ export async function call_dalle(
   // Since n doesn't work for DALLE3, we must repeat call n times if n > 1, waiting for each response to come in:
   const responses: Array<Dict> = [];
   while (responses.length < n) {
+    // Abort if canceled
+    if (should_cancel && should_cancel()) throw new UserForcedPrematureExit();
+
     let response: Dict = {};
     try {
       const completion = await openai.createImage(query as CreateImageRequest);
diff --git a/chainforge/react-server/src/store.tsx b/chainforge/react-server/src/store.tsx
index 371483b..9d1dfe2 100644
--- a/chainforge/react-server/src/store.tsx
+++ b/chainforge/react-server/src/store.tsx
@@ -108,13 +108,6 @@ export const initLLMProviderMenu: (LLMSpec | {group: string, emoji: string, item
       },
     ],
   },
-  {
-    name: "Dall-E",
-    emoji: "🖼",
-    model: "dall-e-2",
-    base_model: "dall-e",
-    temp: 0.0,
-  },
   {
     name: "Claude",
     emoji: "📚",
@@ -149,6 +142,13 @@ export const initLLMProviderMenu: (LLMSpec | {group: string, emoji: string, item
       },
     ],
   },
+  {
+    name: "Dall-E",
+    emoji: "🖼",
+    model: "dall-e-2",
+    base_model: "dall-e",
+    temp: 0.0,
+  },
   {
     name: "Aleph Alpha",
     emoji: "💡",
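
Reviewer note: the diff adds RateLimiter.throttle() but shows neither an export nor any call sites for it. The sketch below is only an assumption of how a provider call might be wrapped once RateLimiter (or a thin helper around it) is exported from models.ts; callOpenAIChat and shouldCancel are hypothetical stand-ins, not names from this patch.

    import { NativeLLM, RateLimiter } from "./models"; // assumes RateLimiter gets exported

    // Hypothetical request function standing in for the real API call.
    async function callOpenAIChat(prompt: string): Promise<string> {
      return `(response to: ${prompt})`;
    }

    async function queryWithThrottle(prompt: string, shouldCancel: () => boolean) {
      // Bottleneck queues the job so GPT-4 stays under its 500 RPM reservoir.
      // If shouldCancel() returns true when the job is dequeued, the returned
      // promise rejects with UserForcedPrematureExit; jobs that run 3 minutes
      // or more are expired per the schedule() options in the patch.
      return RateLimiter.throttle(
        NativeLLM.OpenAI_GPT4,
        () => callOpenAIChat(prompt),
        shouldCancel,
      );
    }

Design-wise, keeping one Bottleneck per model means requests to different models never block each other, while each per-model reservoir enforces the RPM caps declared in RATE_LIMITS (or DEFAULT_RATE_LIMIT for unlisted models).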