Will Chen 6235f7bb9d Summarize chat trigger (#1890)
<!-- CURSOR_SUMMARY -->
> [!NOTE]
> Adds a context-limit banner with a one-click “summarize into new chat” action, refactors token counting with react-query, and persists per-message max token usage.
>
> - **Chat UX**
>   - **Context limit banner** (`ContextLimitBanner.tsx`, `MessagesList.tsx`): shows when within 40k tokens of `contextWindow`, with a tooltip and an action to summarize into a new chat (sketched below this summary).
>   - **Summarize flow**: extracted to `useSummarizeInNewChat` and used in the chat input and the banner; new summarize system prompt (`summarize_chat_system_prompt.ts`).
> - **Token usage & counting**
>   - **Persist max tokens used per assistant message**: DB migration (`messages.max_tokens_used`), schema updates, and saving usage during streaming (`chat_stream_handlers.ts`).
>   - **Token counting refactor** (`useCountTokens.ts`): react-query with debounce; returns `estimatedTotalTokens` and `actualMaxTokens`; invalidated on model change and stream end; `TokenBar` updated.
>   - **Surfacing usage**: tooltip on the latest assistant message shows total tokens (`ChatMessage.tsx`).
> - **Model/config tweaks**
>   - Set the `auto` model `contextWindow` to `200_000` (`language_model_constants.ts`).
>   - Improve the chat auto-scroll dependency (`ChatPanel.tsx`).
>   - Fix the app path validation regex (`app_handlers.ts`).
> - **Testing & dev server**
>   - E2E tests for the banner and the summarize flow (`e2e-tests/context_limit_banner.spec.ts` + fixtures/snapshot).
>   - Fake LLM server streams usage to simulate high-token scenarios (`testing/fake-llm-server/*`).
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 2ae16a14d50699cc772407426419192c2fdf2ec3. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
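
A minimal sketch of the banner trigger condition described above, assuming hypothetical names (`shouldShowContextLimitBanner`, `estimatedTotalTokens`) rather than the component's actual props:

```ts
// Hedged sketch: show the banner once the conversation is within 40k tokens
// of the model's context window. Names below are illustrative.
const CONTEXT_LIMIT_WARNING_MARGIN = 40_000;

function shouldShowContextLimitBanner(
  estimatedTotalTokens: number,
  contextWindow: number,
): boolean {
  return contextWindow - estimatedTotalTokens <= CONTEXT_LIMIT_WARNING_MARGIN;
}

// e.g. with the `auto` model's 200_000-token window, the banner would appear
// once the conversation is estimated at 160_000 tokens or more.
```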
<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Adds a “Summarize into new chat” trigger and a context limit banner to help keep conversations focused and avoid hitting model limits. Also tracks and surfaces actual token usage per assistant message, with a token counting refactor for reliability.

- **New Features**
  - Summarize into new chat from the input or the banner; improved system prompt with a clear output format.
  - Context limit banner shows when within 40k tokens of the model’s context window and offers a one-click summarize action.
  - Tooltip on the latest assistant message shows total tokens used.

- **Refactors**
  - Token counting now uses react-query and returns `estimatedTotalTokens` and `actualMaxTokens`; counts are invalidated on model change and when streaming settles (see the sketch after this list).
  - Persist per-message `max_tokens_used` in the messages table; the backend aggregates model usage during streaming and saves it.
  - Adjusted the default “Auto” model `contextWindow` to 200k for more realistic limits.
  - Improved chat scrolling while streaming; fixed the app path validation regex.
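
A minimal sketch of the debounced react-query counting hook referenced above; `countTokensIpc` and `useDebouncedValue` are illustrative stand-ins, not the project's real API:

```ts
import { useQuery } from "@tanstack/react-query";
import { useEffect, useState } from "react";

// Hypothetical IPC call returning token estimates for the current chat + draft input.
declare function countTokensIpc(args: {
  chatId: number;
  input: string;
}): Promise<{ estimatedTotalTokens: number; actualMaxTokens: number }>;

// Simple debounce so typing does not trigger a count on every keystroke.
function useDebouncedValue<T>(value: T, delayMs: number): T {
  const [debounced, setDebounced] = useState(value);
  useEffect(() => {
    const id = setTimeout(() => setDebounced(value), delayMs);
    return () => clearTimeout(id);
  }, [value, delayMs]);
  return debounced;
}

export function useCountTokensSketch(chatId: number, input: string) {
  const debouncedInput = useDebouncedValue(input, 300);
  return useQuery({
    // Including the model (or bumping a counter when a stream settles) in the key,
    // or calling queryClient.invalidateQueries, refreshes the count at those points.
    queryKey: ["countTokens", chatId, debouncedInput],
    queryFn: () => countTokensIpc({ chatId, input: debouncedInput }),
    enabled: chatId != null,
  });
}
```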

<sup>Written for commit 2ae16a14d50699cc772407426419192c2fdf2ec3.
Summary will update automatically on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
2025-12-04 23:00:28 -08:00

204 lines
5.3 KiB
TypeScript

import express from "express";
import { createServer } from "http";
import cors from "cors";
import { createChatCompletionHandler } from "./chatCompletionHandler";
import {
  handleDeviceCode,
  handleAccessToken,
  handleUser,
  handleUserEmails,
  handleUserRepos,
  handleRepo,
  handleRepoBranches,
  handleOrgRepos,
  handleGitPush,
  handleGetPushEvents,
  handleClearPushEvents,
} from "./githubHandler";
// Create Express app
const app = express();
app.use(cors());
app.use(express.json({ limit: "50mb" }));
app.use(express.urlencoded({ extended: true, limit: "50mb" }));
const PORT = 3500;
// Helper function to create OpenAI-like streaming response chunks
export function createStreamChunk(
  content: string,
  role: string = "assistant",
  isLast: boolean = false,
  usage?: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  },
) {
  const chunk: any = {
    id: `chatcmpl-${Date.now()}`,
    object: "chat.completion.chunk",
    created: Math.floor(Date.now() / 1000),
    model: "fake-model",
    choices: [
      {
        index: 0,
        delta: isLast ? {} : { content, role },
        finish_reason: isLast ? "stop" : null,
      },
    ],
  };
  // Add usage info to the final chunk if provided
  if (isLast && usage) {
    chunk.usage = usage;
  }
  return `data: ${JSON.stringify(chunk)}\n\n${isLast ? "data: [DONE]\n\n" : ""}`;
}
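// Illustrative usage (not part of the real handlers): a provider route can stream
// a few content chunks and finish with a usage-bearing chunk, so the app sees a
// near-context-limit conversation. The token figures below are made up.
//
//   res.write(createStreamChunk("Hello "));
//   res.write(createStreamChunk("world"));
//   res.write(
//     createStreamChunk("", "assistant", true, {
//       prompt_tokens: 190_000,
//       completion_tokens: 500,
//       total_tokens: 190_500,
//     }),
//   );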
export const CANNED_MESSAGE = `
<dyad-write path="file1.txt">
A file (2)
</dyad-write>
More
EOM`;
app.get("/health", (req, res) => {
res.send("OK");
});
// Ollama-specific endpoints
app.get("/ollama/api/tags", (req, res) => {
const ollamaModels = {
models: [
{
name: "testollama",
modified_at: "2024-05-01T10:00:00.000Z",
size: 4700000000,
digest: "abcdef123456",
details: {
format: "gguf",
family: "llama",
families: ["llama"],
parameter_size: "8B",
quantization_level: "Q4_0",
},
},
{
name: "codellama:7b",
modified_at: "2024-04-25T12:30:00.000Z",
size: 3800000000,
digest: "fedcba654321",
details: {
format: "gguf",
family: "llama",
families: ["llama", "codellama"],
parameter_size: "7B",
quantization_level: "Q5_K_M",
},
},
],
};
console.log("* Sending fake Ollama models");
res.json(ollamaModels);
});
// LM Studio specific endpoints
app.get("/lmstudio/api/v0/models", (req, res) => {
const lmStudioModels = {
data: [
{
type: "llm",
id: "lmstudio-model-1",
object: "model",
publisher: "lmstudio",
state: "loaded",
max_context_length: 4096,
quantization: "Q4_0",
compatibility_type: "gguf",
arch: "llama",
},
{
type: "llm",
id: "lmstudio-model-2-chat",
object: "model",
publisher: "lmstudio",
state: "not-loaded",
max_context_length: 8192,
quantization: "Q5_K_M",
compatibility_type: "gguf",
arch: "mixtral",
},
{
type: "embedding", // Should be filtered out by client
id: "lmstudio-embedding-model",
object: "model",
publisher: "lmstudio",
state: "loaded",
max_context_length: 2048,
quantization: "F16",
compatibility_type: "gguf",
arch: "bert",
},
],
};
console.log("* Sending fake LM Studio models");
res.json(lmStudioModels);
});
["lmstudio", "gateway", "engine", "ollama", "azure"].forEach((provider) => {
app.post(
`/${provider}/v1/chat/completions`,
createChatCompletionHandler(provider),
);
});
// Azure-specific endpoints (Azure client uses different URL patterns)
app.post("/azure/chat/completions", createChatCompletionHandler("azure"));
app.post(
  "/azure/openai/deployments/:deploymentId/chat/completions",
  createChatCompletionHandler("azure"),
);
// Default test provider handler:
app.post("/v1/chat/completions", createChatCompletionHandler("."));
// GitHub API Mock Endpoints
console.log("Setting up GitHub mock endpoints");
// GitHub OAuth Device Flow
app.post("/github/login/device/code", handleDeviceCode);
app.post("/github/login/oauth/access_token", handleAccessToken);
// GitHub API endpoints
app.get("/github/api/user", handleUser);
app.get("/github/api/user/emails", handleUserEmails);
app.get("/github/api/user/repos", handleUserRepos);
app.post("/github/api/user/repos", handleUserRepos);
app.get("/github/api/repos/:owner/:repo", handleRepo);
app.get("/github/api/repos/:owner/:repo/branches", handleRepoBranches);
app.post("/github/api/orgs/:org/repos", handleOrgRepos);
// GitHub test endpoints for verifying push operations
app.get("/github/api/test/push-events", handleGetPushEvents);
app.post("/github/api/test/clear-push-events", handleClearPushEvents);
// GitHub Git endpoints - intercept all paths with /github/git prefix
app.all("/github/git/*", handleGitPush);
// Start the server
const server = createServer(app);
server.listen(PORT, () => {
  console.log(`Fake LLM server running on http://localhost:${PORT}`);
});
// Handle SIGINT (Ctrl+C)
process.on("SIGINT", () => {
console.log("Shutting down fake LLM server");
server.close(() => {
console.log("Server closed");
process.exit(0);
});
});