<!-- CURSOR_SUMMARY -->
> [!NOTE]
> Adds a context-limit banner with one-click “summarize into new chat,” refactors token counting with react-query, and persists per-message max token usage.
>
> - **Chat UX**
>   - **Context limit banner** (`ContextLimitBanner.tsx`, `MessagesList.tsx`): shows when within 40k tokens of `contextWindow`, with a tooltip and an action to summarize into a new chat.
>   - **Summarize flow**: extracted to `useSummarizeInNewChat` and used in the chat input and the banner; new summarize system prompt (`summarize_chat_system_prompt.ts`).
> - **Token usage & counting**
>   - **Persist max tokens used per assistant message**: DB migration (`messages.max_tokens_used`), schema updates, and saving usage during streaming (`chat_stream_handlers.ts`).
>   - **Token counting refactor** (`useCountTokens.ts`): react-query with debounce; returns `estimatedTotalTokens` and `actualMaxTokens`; invalidated on model change and stream end; `TokenBar` updated.
>   - **Surfacing usage**: tooltip on the latest assistant message shows total tokens (`ChatMessage.tsx`).
> - **Model/config tweaks**
>   - Set the `auto` model `contextWindow` to `200_000` (`language_model_constants.ts`).
>   - Improve the chat auto-scroll dependency (`ChatPanel.tsx`).
>   - Fix the app path validation regex (`app_handlers.ts`).
> - **Testing & dev server**
>   - E2E tests for the banner and summarize flow (`e2e-tests/context_limit_banner.spec.ts` plus fixtures/snapshot).
>   - Fake LLM server streams usage to simulate high-token scenarios (`testing/fake-llm-server/*`).
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 2ae16a14d50699cc772407426419192c2fdf2ec3. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

<!-- This is an auto-generated description by cubic. -->

---

## Summary by cubic

Adds a “Summarize into new chat” trigger and a context limit banner to help keep conversations focused and avoid hitting model limits. Also tracks and surfaces actual token usage per assistant message, with a token counting refactor for reliability.

- **New Features**
  - Summarize into a new chat from the input or the banner; improved system prompt with a clear output format.
  - Context limit banner shows when within 40k tokens of the model’s context window and offers a one-click summarize action.
  - Tooltip on the latest assistant message shows total tokens used.

- **Refactors**
  - Token counting now uses react-query and returns estimatedTotalTokens and actualMaxTokens; counts are invalidated on model change and when streaming settles.
  - Persist per-message max_tokens_used in the messages table; the backend aggregates model usage during streaming and saves it.
  - Adjusted the default “Auto” model contextWindow to 200k for more realistic limits.
  - Improved chat scrolling while streaming; fixed the app path validation regex.

<sup>Written for commit 2ae16a14d50699cc772407426419192c2fdf2ec3. Summary will update automatically on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
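For orientation, the banner condition described above reduces to a simple check against the model's context window. A minimal sketch, using hypothetical helper and constant names (only the 40k buffer, `contextWindow`, and the persisted max-tokens value come from the summary):

```ts
// Illustrative sketch, not the actual implementation.
const CONTEXT_LIMIT_BUFFER = 40_000; // threshold described in the PR summary

export function shouldShowContextLimitBanner(
  contextWindow: number, // e.g. 200_000 for the "auto" model
  maxTokensUsed: number, // actual usage persisted on the latest assistant message
): boolean {
  return contextWindow - maxTokensUsed <= CONTEXT_LIMIT_BUFFER;
}

// shouldShowContextLimitBanner(200_000, 165_000) === true, so the banner
// renders and offers the one-click "summarize into new chat" action.
```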
---

**Fake LLM server** (`testing/fake-llm-server`, TypeScript, 204 lines, 5.3 KiB):
import express from "express";
import { createServer } from "http";
import cors from "cors";
import { createChatCompletionHandler } from "./chatCompletionHandler";
import {
  handleDeviceCode,
  handleAccessToken,
  handleUser,
  handleUserEmails,
  handleUserRepos,
  handleRepo,
  handleRepoBranches,
  handleOrgRepos,
  handleGitPush,
  handleGetPushEvents,
  handleClearPushEvents,
} from "./githubHandler";

// Create Express app
const app = express();
app.use(cors());
app.use(express.json({ limit: "50mb" }));
app.use(express.urlencoded({ extended: true, limit: "50mb" }));

const PORT = 3500;

// Helper function to create OpenAI-like streaming response chunks
export function createStreamChunk(
  content: string,
  role: string = "assistant",
  isLast: boolean = false,
  usage?: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  },
) {
  const chunk: any = {
    id: `chatcmpl-${Date.now()}`,
    object: "chat.completion.chunk",
    created: Math.floor(Date.now() / 1000),
    model: "fake-model",
    choices: [
      {
        index: 0,
        delta: isLast ? {} : { content, role },
        finish_reason: isLast ? "stop" : null,
      },
    ],
  };

  // Add usage info to the final chunk if provided
  if (isLast && usage) {
    chunk.usage = usage;
  }

  return `data: ${JSON.stringify(chunk)}\n\n${isLast ? "data: [DONE]\n\n" : ""}`;
}
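// Illustrative example (values are arbitrary): sending usage on the final chunk
// is what lets tests simulate high-token scenarios, e.g.
//   createStreamChunk("", "assistant", true, {
//     prompt_tokens: 180_000,
//     completion_tokens: 500,
//     total_tokens: 180_500,
//   });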

export const CANNED_MESSAGE = `
<dyad-write path="file1.txt">
A file (2)
</dyad-write>
More
EOM`;

app.get("/health", (req, res) => {
  res.send("OK");
});

// Ollama-specific endpoints
app.get("/ollama/api/tags", (req, res) => {
  const ollamaModels = {
    models: [
      {
        name: "testollama",
        modified_at: "2024-05-01T10:00:00.000Z",
        size: 4700000000,
        digest: "abcdef123456",
        details: {
          format: "gguf",
          family: "llama",
          families: ["llama"],
          parameter_size: "8B",
          quantization_level: "Q4_0",
        },
      },
      {
        name: "codellama:7b",
        modified_at: "2024-04-25T12:30:00.000Z",
        size: 3800000000,
        digest: "fedcba654321",
        details: {
          format: "gguf",
          family: "llama",
          families: ["llama", "codellama"],
          parameter_size: "7B",
          quantization_level: "Q5_K_M",
        },
      },
    ],
  };
  console.log("* Sending fake Ollama models");
  res.json(ollamaModels);
});

// LM Studio specific endpoints
app.get("/lmstudio/api/v0/models", (req, res) => {
  const lmStudioModels = {
    data: [
      {
        type: "llm",
        id: "lmstudio-model-1",
        object: "model",
        publisher: "lmstudio",
        state: "loaded",
        max_context_length: 4096,
        quantization: "Q4_0",
        compatibility_type: "gguf",
        arch: "llama",
      },
      {
        type: "llm",
        id: "lmstudio-model-2-chat",
        object: "model",
        publisher: "lmstudio",
        state: "not-loaded",
        max_context_length: 8192,
        quantization: "Q5_K_M",
        compatibility_type: "gguf",
        arch: "mixtral",
      },
      {
        type: "embedding", // Should be filtered out by client
        id: "lmstudio-embedding-model",
        object: "model",
        publisher: "lmstudio",
        state: "loaded",
        max_context_length: 2048,
        quantization: "F16",
        compatibility_type: "gguf",
        arch: "bert",
      },
    ],
  };
  console.log("* Sending fake LM Studio models");
  res.json(lmStudioModels);
});
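
// OpenAI-compatible chat completions endpoint, mounted once per fake provider prefix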
["lmstudio", "gateway", "engine", "ollama", "azure"].forEach((provider) => {
  app.post(
    `/${provider}/v1/chat/completions`,
    createChatCompletionHandler(provider),
  );
});

// Azure-specific endpoints (Azure client uses different URL patterns)
app.post("/azure/chat/completions", createChatCompletionHandler("azure"));
app.post(
  "/azure/openai/deployments/:deploymentId/chat/completions",
  createChatCompletionHandler("azure"),
);

// Default test provider handler:
app.post("/v1/chat/completions", createChatCompletionHandler("."));

// GitHub API Mock Endpoints
console.log("Setting up GitHub mock endpoints");

// GitHub OAuth Device Flow
app.post("/github/login/device/code", handleDeviceCode);
app.post("/github/login/oauth/access_token", handleAccessToken);

// GitHub API endpoints
app.get("/github/api/user", handleUser);
app.get("/github/api/user/emails", handleUserEmails);
app.get("/github/api/user/repos", handleUserRepos);
app.post("/github/api/user/repos", handleUserRepos);
app.get("/github/api/repos/:owner/:repo", handleRepo);
app.get("/github/api/repos/:owner/:repo/branches", handleRepoBranches);
app.post("/github/api/orgs/:org/repos", handleOrgRepos);

// GitHub test endpoints for verifying push operations
app.get("/github/api/test/push-events", handleGetPushEvents);
app.post("/github/api/test/clear-push-events", handleClearPushEvents);

// GitHub Git endpoints - intercept all paths with /github/git prefix
app.all("/github/git/*", handleGitPush);

// Start the server
const server = createServer(app);
server.listen(PORT, () => {
  console.log(`Fake LLM server running on http://localhost:${PORT}`);
});

// Handle SIGINT (Ctrl+C)
process.on("SIGINT", () => {
  console.log("Shutting down fake LLM server");
  server.close(() => {
    console.log("Server closed");
    process.exit(0);
  });
});