WIP: Better rate limiting

Ian Arawjo 2024-03-24 22:19:39 -04:00
parent 0b8ed6e0e9
commit b517e1c95f
3 changed files with 75 additions and 24 deletions


@@ -1,6 +1,8 @@
/**
* A list of all model APIs natively supported by ChainForge.
*/
import Bottleneck from "bottleneck";
import { UserForcedPrematureExit } from "./errors";
export enum NativeLLM {
// OpenAI Chat
@@ -151,23 +153,24 @@ export function getProvider(llm: LLM): LLMProvider | undefined {
/* This 'cheap' version of controlling for rate limits is to wait a few seconds between batches of requests being sent off.
   If a model is missing from below, it means we must send and receive only 1 request at a time (synchronous).
   The following is only a guideline, and a bit on the conservative side. */
export const RATE_LIMITS: { [key in LLM]?: [number, number] } = {
[NativeLLM.OpenAI_ChatGPT]: [30, 10], // max 30 requests a batch; wait 10 seconds between
[NativeLLM.OpenAI_ChatGPT_0301]: [30, 10],
[NativeLLM.OpenAI_ChatGPT_0613]: [30, 10],
[NativeLLM.OpenAI_ChatGPT_16k]: [30, 10],
[NativeLLM.OpenAI_ChatGPT_16k_0613]: [30, 10],
[NativeLLM.OpenAI_GPT4]: [8, 2], // max 8 requests every 2 seconds
[NativeLLM.OpenAI_GPT4_0314]: [4, 15],
[NativeLLM.OpenAI_GPT4_0613]: [4, 15],
[NativeLLM.OpenAI_GPT4_32k]: [4, 15],
[NativeLLM.OpenAI_GPT4_32k_0314]: [4, 15],
[NativeLLM.OpenAI_GPT4_32k_0613]: [4, 15],
[NativeLLM.OpenAI_DallE_2]: [2, 10], // Should be 5 images per minute (1 image every 12 seconds); here, we've been a bit lenient with it.
[NativeLLM.OpenAI_DallE_3]: [2, 10], // This differs per tier, see https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-one
[NativeLLM.Azure_OpenAI]: [30, 10],
[NativeLLM.PaLM2_Text_Bison]: [4, 10], // max 30 requests per minute; so do 4 per batch, 10 seconds between (conservative)
[NativeLLM.PaLM2_Chat_Bison]: [4, 10],
export const RATE_LIMITS: { [key in LLM]?: number } = {
[NativeLLM.OpenAI_ChatGPT]: 1000, // max RPM (API requests per minute)
[NativeLLM.OpenAI_ChatGPT_0301]: 1000,
[NativeLLM.OpenAI_ChatGPT_0613]: 1000,
[NativeLLM.OpenAI_ChatGPT_16k]: 1000,
[NativeLLM.OpenAI_ChatGPT_16k_0613]: 1000,
[NativeLLM.OpenAI_GPT4]: 500,
[NativeLLM.OpenAI_GPT4_0314]: 500,
[NativeLLM.OpenAI_GPT4_0613]: 500,
[NativeLLM.OpenAI_GPT4_32k]: 500,
[NativeLLM.OpenAI_GPT4_32k_0314]: 500,
[NativeLLM.OpenAI_GPT4_32k_0613]: 500,
[NativeLLM.OpenAI_DallE_2]: 10, // Should be 5 images per minute (1 image every 12 seconds); here, we've been a bit lenient with it.
[NativeLLM.OpenAI_DallE_3]: 10, // This differs per tier, see https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-one
[NativeLLM.Azure_OpenAI]: 500, // conservative
[NativeLLM.PaLM2_Text_Bison]: 60, // max 60 requests per minute as of Mar 2023
[NativeLLM.PaLM2_Chat_Bison]: 60,
[NativeLLM.GEMINI_PRO]: 60,
[NativeLLM.Bedrock_Jurassic_Mid]: [20, 5],
[NativeLLM.Bedrock_Jurassic_Ultra]: [5, 5],
[NativeLLM.Bedrock_Titan_Light]: [40, 5],
@@ -184,6 +187,51 @@ export const RATE_LIMITS: { [key in LLM]?: [number, number] } = {
[NativeLLM.Bedrock_Mistral_Mistral]: [40, 5], // 800 RPM
};
const DEFAULT_RATE_LIMIT = 100; // RPM for any models not listed above
class RateLimiter {
  // eslint-disable-next-line no-use-before-define
  private static instance: RateLimiter;

  // Partial, since limiters are created lazily, per model, on first use
  private limiters: Partial<Record<LLM, Bottleneck>>;

  private constructor() {
    // Initialize the singleton instance
    this.limiters = {};
  }

  /** Gets the rate limiter. Initializes it if the singleton instance does not yet exist. */
  public static getInstance(): RateLimiter {
    if (!RateLimiter.instance) {
      RateLimiter.instance = new RateLimiter();
    }
    return RateLimiter.instance;
  }

  private getLimiter(model: LLM): Bottleneck {
    // Find if there's an existing limiter for this model
    if (!(model in this.limiters)) {
      // If there isn't, make one:
      const rpm = RATE_LIMITS[model] ?? DEFAULT_RATE_LIMIT;
      this.limiters[model] = new Bottleneck({
        reservoir: rpm, // max requests per minute
        reservoirRefreshAmount: rpm, // refresh up to max requests every minute
        reservoirRefreshInterval: 60000, // refresh every minute
        maxConcurrent: Math.ceil(rpm / 2), // throttle max concurrent requests to half, just in case
        minTime: 20, // space out the requests by 20ms, to be safe
      });
    }
    return this.limiters[model]!;
  }

  /** Throttles the API call for the given model, using Bottleneck */
  public static throttle<T>(
    model: LLM,
    func: () => PromiseLike<T>,
    should_cancel?: () => boolean,
  ): Promise<T> {
    // Rate limit per model, and abort if the API request takes 3 minutes or more.
    return this.getInstance()
      .getLimiter(model)
      .schedule({ expiration: 180000 }, () => {
        if (should_cancel && should_cancel()) throw new UserForcedPrematureExit();
        return func();
      });
  }
}
/** Equivalent to a Python enum's .name property */
export function getEnumName(
enumObject: any,
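
For illustration, here is a sketch of how the new limiter might be driven from elsewhere in this module. It is not part of the commit: `callChatGPT` and `cancelRequested` are hypothetical stand-ins, and the hunk above does not show whether RateLimiter is exported. Note the net effect of the Bottleneck config: for GPT-4 at 500 RPM, at most 500 request slots are granted per 60-second window, at most 250 calls run concurrently, and starts are spaced by at least 20 ms.

// Hypothetical usage sketch — not part of this commit.
let cancelRequested = false; // e.g., flipped to true when the user hits Stop

const callChatGPT = async (prompt: string): Promise<string> => {
  // A real implementation would call the OpenAI API here.
  return `echo: ${prompt}`;
};

async function runBatch(prompts: string[]): Promise<string[]> {
  // All calls for the same model share one Bottleneck instance, so a burst of
  // prompts is automatically paced to stay under that model's RPM budget.
  return Promise.all(
    prompts.map((p) =>
      RateLimiter.throttle(
        NativeLLM.OpenAI_ChatGPT,
        () => callChatGPT(p),
        () => cancelRequested, // limiter throws UserForcedPrematureExit when true
      ),
    ),
  );
}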


@@ -363,6 +363,9 @@ export async function call_dalle(
// Since n doesn't work for DALLE3, we must repeat the call n times if n > 1, waiting for each response to come in:
const responses: Array<Dict> = [];
while (responses.length < n) {
// Abort if canceled
if (should_cancel && should_cancel()) throw new UserForcedPrematureExit();
let response: Dict = {};
try {
const completion = await openai.createImage(query as CreateImageRequest);
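
Abstracted out, the repeat-n loop in this hunk follows the minimal sketch below. It is illustrative only: `makeOneImage` is a hypothetical stand-in for a single createImage call, while Dict and UserForcedPrematureExit come from the surrounding code.

// Sketch of the cancel-aware repeat-n loop above (names are illustrative).
async function generateNImages(
  n: number,
  makeOneImage: () => Promise<Dict>,
  should_cancel?: () => boolean,
): Promise<Dict[]> {
  const responses: Dict[] = [];
  while (responses.length < n) {
    // Check for user cancellation before firing each single-image request:
    if (should_cancel && should_cancel()) throw new UserForcedPrematureExit();
    responses.push(await makeOneImage());
  }
  return responses;
}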


@@ -108,13 +108,6 @@ export const initLLMProviderMenu: (LLMSpec | {group: string, emoji: string, item
},
],
},
{
name: "Dall-E",
emoji: "🖼",
model: "dall-e-2",
base_model: "dall-e",
temp: 0.0,
},
{
name: "Claude",
emoji: "📚",
@@ -149,6 +142,13 @@ export const initLLMProviderMenu: (LLMSpec | {group: string, emoji: string, item
},
],
},
{
name: "Dall-E",
emoji: "🖼",
model: "dall-e-2",
base_model: "dall-e",
temp: 0.0,
},
{
name: "Aleph Alpha",
emoji: "💡",