Fuzzy turbo edits (#1700)

<!-- CURSOR_SUMMARY -->
> [!NOTE]
> Makes search-replace tolerant to typos/whitespace via two-pass fuzzy
matching with Levenshtein and smart-quote normalization, with clearer
errors and tests.
> 
> - **Search/Replace Processor (`src/pro/main/ipc/processors/search_replace_processor.ts`)**
>   - Add two-pass fuzzy matching: quick exact-line prefilter + Levenshtein scoring (`fastest-levenshtein`).
>   - Match flow: exact > whitespace-insensitive > fuzzy; detect ambiguity and improve error messages with similarity %.
>   - Introduce thresholds and limits: `FUZZY_MATCH_THRESHOLD=0.9`, `EARLY_STOP_THRESHOLD=0.95`, `MAX_FUZZY_SEARCH_TIME_MS=10000`.
>   - Normalize text before scoring using `normalizeString` (handles quotes, dashes, ellipsis, NBSP, soft hyphen, BOM).
>   - Preserve indentation for replacements; keep existing unescape/validation logic.
> - **Utils (`src/utils/text_normalization.ts`)**
>   - New `normalizeString` helper for Unicode normalization used by fuzzy matching.
> - **Tests (`src/pro/main/ipc/processors/search_replace_processor.test.ts`)**
>   - Add cases for typos, smart quotes, below-threshold failure, exact-over-fuzzy preference, whitespace differences, and ambiguity.
> - **Dependencies**
>   - Add `fastest-levenshtein` to `package.json`.
> 
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
289dab915c37bc4f9ab4bf0209ff3f95a57341fc. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->



<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Makes applySearchReplace more tolerant by preferring exact and
whitespace-insensitive matches, then falling back to fast two-pass
Levenshtein-based fuzzy matching with Unicode normalization. Improves
failure messages and adds tests to cover common edit scenarios.

- **New Features**
  - Exact > whitespace-insensitive > fuzzy matching flow (threshold 0.9).
  - Two-pass fuzzy search with exact-line prefilter; early stop at 0.95 and 10s timeout.
  - normalizeString handles smart quotes, dashes, ellipsis, NBSPs, soft hyphen, and BOM.
  - Errors now report best fuzzy similarity when below threshold.
  - Tests for typos, smart quotes, below-threshold cases, exact-over-fuzzy, whitespace differences, and ambiguity.

- **Dependencies**
  - Add fastest-levenshtein.

<sup>Written for commit 289dab915c37bc4f9ab4bf0209ff3f95a57341fc.
Summary will update automatically on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
This commit is contained in:
Will Chen
2025-11-04 09:30:25 -08:00
committed by GitHub
parent 369149b202
commit ae1ec68453
5 changed files with 374 additions and 6 deletions

14
package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "dyad",
"version": "0.25.0-beta.1",
"version": "0.27.0-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "dyad",
"version": "0.25.0-beta.1",
"version": "0.27.0-beta.1",
"license": "MIT",
"dependencies": {
"@ai-sdk/amazon-bedrock": "^3.0.15",
@@ -62,6 +62,7 @@
"electron-playwright-helpers": "^1.7.1",
"electron-squirrel-startup": "^1.0.1",
"esbuild-register": "^3.6.0",
"fastest-levenshtein": "^1.0.16",
"fix-path": "^4.0.0",
"framer-motion": "^12.6.3",
"geist": "^1.3.1",
@@ -11830,6 +11831,15 @@
"dev": true,
"license": "MIT"
},
"node_modules/fastest-levenshtein": {
"version": "1.0.16",
"resolved": "https://registry.npmjs.org/fastest-levenshtein/-/fastest-levenshtein-1.0.16.tgz",
"integrity": "sha512-eRnCtTTtGZFpQCwhJiUOuxPQWRXVKYDn0b2PeHfXL6/Zi53SLAzAHfVhVWK2AryC/WH05kGfxhFIPvTF0SXQzg==",
"license": "MIT",
"engines": {
"node": ">= 4.9.1"
}
},
"node_modules/fastq": {
"version": "1.19.1",
"resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz",

View File

@@ -138,6 +138,7 @@
"electron-playwright-helpers": "^1.7.1",
"electron-squirrel-startup": "^1.0.1",
"esbuild-register": "^3.6.0",
"fastest-levenshtein": "^1.0.16",
"fix-path": "^4.0.0",
"framer-motion": "^12.6.3",
"geist": "^1.3.1",

View File

@@ -0,0 +1,169 @@
import { describe, it, expect } from "vitest";
import { applySearchReplace } from "./search_replace_processor";
// Behavioural tests for applySearchReplace. Each case builds a small source
// snippet plus a SEARCH/REPLACE diff and checks the returned
// { success, content, error } result.
//
// NOTE: text inside the template literals is significant (it is the file
// content / diff under test), so continuation lines are deliberately kept at
// column 0 unless a test is specifically about indentation.
describe("applySearchReplace", () => {
  describe("fuzzy matching with Levenshtein distance", () => {
    it("should match content with minor typos", () => {
      const originalContent = `function hello() {
console.log("Hello, World!");
return true;
}`;
      // Search block has a typo: "consle" instead of "console"
      const diffContent = `<<<<<<< SEARCH
function hello() {
consle.log("Hello, World!");
return true;
}
=======
function hello() {
console.log("Hello, Universe!");
return true;
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("Hello, Universe!");
    });

    it("should match content with smart quotes normalized", () => {
      const originalContent = `function greet() {
console.log("Hello");
}`;
      // Search block uses smart quotes (U+201C / U+201D). They are written as
      // escapes so the test cannot silently degrade to ASCII quotes, which
      // would turn this into a plain exact-match test.
      const diffContent = `<<<<<<< SEARCH
function greet() {
console.log(\u201CHello\u201D);
}
=======
function greet() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("Goodbye");
    });

    it("should fail when similarity is below threshold", () => {
      const originalContent = `function hello() {
console.log("Hello, World!");
return true;
}`;
      // Search block is too different (multiple typos and changes)
      const diffContent = `<<<<<<< SEARCH
function goodbye() {
consle.error("Bye, Earth!");
return false;
}
=======
function hello() {
console.log("Hello, Universe!");
return true;
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(false);
      expect(result.error).toContain("Best fuzzy match had similarity");
    });

    it("should prefer exact match over fuzzy match", () => {
      const originalContent = `function hello() {
console.log("Hello");
}
function hello() {
consle.log("Hello");
}`;
      // Should match the first exact occurrence, not the fuzzy one
      const diffContent = `<<<<<<< SEARCH
function hello() {
console.log("Hello");
}
=======
function hello() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      // Should only replace the first exact match
      expect(result.content).toContain('console.log("Goodbye")');
      expect(result.content).toContain('consle.log("Hello")');
    });

    it("should handle whitespace differences with lenient matching before fuzzy", () => {
      // Original body is indented with two spaces...
      const originalContent = `function test() {
  console.log("test");
}`;
      // ...while the search block uses four, so an exact match is impossible
      // and the test really exercises the whitespace-tolerant path.
      const diffContent = `<<<<<<< SEARCH
function test() {
    console.log("test");
}
=======
function test() {
    console.log("updated");
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("updated");
    });
  });

  describe("existing functionality", () => {
    it("should handle exact matches", () => {
      const originalContent = `function hello() {
console.log("Hello");
}`;
      const diffContent = `<<<<<<< SEARCH
function hello() {
console.log("Hello");
}
=======
function hello() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("Goodbye");
    });

    it("should detect ambiguous matches", () => {
      // The same block appears twice, so the search cannot pick one safely.
      const originalContent = `function hello() {
console.log("Hello");
}
function hello() {
console.log("Hello");
}`;
      const diffContent = `<<<<<<< SEARCH
function hello() {
console.log("Hello");
}
=======
function hello() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;
      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(false);
      expect(result.error).toContain("ambiguous");
    });
  });
});

View File

@@ -1,6 +1,17 @@
/* eslint-disable no-irregular-whitespace */
import { parseSearchReplaceBlocks } from "@/pro/shared/search_replace_parser";
import { distance } from "fastest-levenshtein";
import { normalizeString } from "@/utils/text_normalization";
// Minimum similarity threshold for fuzzy matching (0 to 1, where 1 is exact match).
// applySearchReplace accepts the best fuzzy candidate only when its score is >= this value;
// otherwise it fails and reports the best score in the error message.
const FUZZY_MATCH_THRESHOLD = 0.9;
// Early termination threshold - stop searching if we find a match this good.
// Checked in fastFuzzySearch's detailed (Levenshtein) pass.
const EARLY_STOP_THRESHOLD = 0.95;
// Maximum time to spend on fuzzy matching (in milliseconds).
// The budget is shared by both passes of fastFuzzySearch (measured from its start).
const MAX_FUZZY_SEARCH_TIME_MS = 10_000; // 10 seconds
function unescapeMarkers(content: string): string {
return content
@@ -9,6 +20,148 @@ function unescapeMarkers(content: string): string {
.replace(/^\\>>>>>>>/gm, ">>>>>>>");
}
/**
 * Scores how closely `search` resembles `original`.
 *
 * Both inputs are passed through `normalizeString` first, then compared with
 * Levenshtein distance scaled by the longer of the two normalized strings.
 *
 * @param original - Text taken from the target file.
 * @param search - Text from the search block.
 * @returns A ratio in [0, 1]; 1 means the normalized strings are identical,
 *   and an empty `search` always yields 0.
 */
function getSimilarity(original: string, search: string): number {
  if (search.length === 0) {
    // Empty searches are no longer supported
    return 0;
  }

  // Normalization folds smart quotes and similar characters before scoring.
  const left = normalizeString(original);
  const right = normalizeString(search);

  if (left !== right) {
    // Scale the edit distance by the longer string so the result stays in [0, 1].
    const longest = left.length >= right.length ? left.length : right.length;
    return 1 - distance(left, right) / longest;
  }

  return 1;
}
/**
 * Cheap pre-filter score: the fraction of search lines that equal the
 * corresponding target line (after `normalizeString`) when the search block is
 * aligned at `startIdx`.
 *
 * @param targetLines - Lines of the file being searched.
 * @param searchLines - Lines of the search block.
 * @param startIdx - Index in `targetLines` where the alignment starts.
 * @returns Matching line count divided by `searchLines.length`; lines that
 *   would fall past the end of `targetLines` count as non-matching.
 */
function quickScoreByExactLines(
  targetLines: string[],
  searchLines: string[],
  startIdx: number,
): number {
  // Compare only as many lines as both sides actually have.
  const comparable = Math.min(
    searchLines.length,
    targetLines.length - startIdx,
  );

  let hits = 0;
  let offset = 0;
  while (offset < comparable) {
    const targetLine = normalizeString(targetLines[startIdx + offset]);
    const searchLine = normalizeString(searchLines[offset]);
    if (targetLine === searchLine) {
      hits += 1;
    }
    offset += 1;
  }

  return hits / searchLines.length;
}
/**
 * Two-pass fuzzy search for `searchChunk` within `lines`.
 *
 * Pass 1 slides a window of the search block's height over
 * [startIndex, endIndex) and keeps every position whose exact-line ratio
 * (see quickScoreByExactLines) reaches 60%.
 * Pass 2 runs the expensive Levenshtein comparison (getSimilarity) on at most
 * the ten best-ranked positions and keeps the highest score.
 *
 * Both passes share one time budget (MAX_FUZZY_SEARCH_TIME_MS), and pass 2
 * returns immediately once a score reaches EARLY_STOP_THRESHOLD.
 *
 * @param lines - Lines of the file being searched.
 * @param searchChunk - Search block as a single newline-joined string.
 * @param startIndex - First window position to consider.
 * @param endIndex - Exclusive upper bound; windows must end at or before it.
 * @returns `bestScore` (0 when nothing was scored) and `bestMatchIndex`
 *   (-1 when nothing was scored).
 */
function fastFuzzySearch(
  lines: string[],
  searchChunk: string,
  startIndex: number,
  endIndex: number,
) {
  // Minimum share of exactly-matching lines for a position to be shortlisted.
  const MIN_EXACT_LINE_RATIO = 0.6;
  // Upper bound on how many shortlisted positions get the Levenshtein pass.
  const DETAILED_PASS_LIMIT = 10;

  const searchLines = searchChunk.split(/\r?\n/);
  const windowSize = searchLines.length;

  // One clock for both passes so the timeout covers the whole search.
  const startedAt = performance.now();

  // ---- Pass 1: cheap exact-line pre-filter ----
  const shortlist: Array<{ index: number; quickScore: number }> = [];
  const lastWindowStart = endIndex - windowSize;
  let position = startIndex;
  while (position <= lastWindowStart) {
    const spent = performance.now() - startedAt;
    if (spent > MAX_FUZZY_SEARCH_TIME_MS) {
      console.warn(
        `Fast fuzzy search timed out during pre-filter after ${(spent / 1000).toFixed(1)}s`,
      );
      break;
    }

    const quickScore = quickScoreByExactLines(lines, searchLines, position);
    if (quickScore >= MIN_EXACT_LINE_RATIO) {
      shortlist.push({ index: position, quickScore });
    }
    position += 1;
  }

  // Highest quick score first.
  shortlist.sort((a, b) => b.quickScore - a.quickScore);

  // ---- Pass 2: Levenshtein on the most promising positions only ----
  let topScore = 0;
  let topIndex = -1;
  for (const { index } of shortlist.slice(0, DETAILED_PASS_LIMIT)) {
    const spent = performance.now() - startedAt;
    if (spent > MAX_FUZZY_SEARCH_TIME_MS) {
      console.warn(
        `Fast fuzzy search timed out during detailed pass after ${(spent / 1000).toFixed(1)}s. Best match: ${(topScore * 100).toFixed(1)}%`,
      );
      break;
    }

    const windowText = lines.slice(index, index + windowSize).join("\n");
    const score = getSimilarity(windowText, searchChunk);
    if (score > topScore) {
      topScore = score;
      topIndex = index;
      // Good enough: skip the remaining candidates.
      if (topScore >= EARLY_STOP_THRESHOLD) {
        return { bestScore: topScore, bestMatchIndex: topIndex };
      }
    }
  }

  return { bestScore: topScore, bestMatchIndex: topIndex };
}
export function applySearchReplace(
originalContent: string,
diffContent: string,
@@ -113,14 +266,29 @@ export function applySearchReplace(
};
}
if (candidates.length === 0) {
if (candidates.length === 1) {
matchIndex = candidates[0];
}
}
// If still no match, try fuzzy matching with Levenshtein distance
if (matchIndex === -1) {
const searchChunk = searchLines.join("\n");
const { bestScore, bestMatchIndex } = fastFuzzySearch(
resultLines,
searchChunk,
0,
resultLines.length,
);
if (bestScore >= FUZZY_MATCH_THRESHOLD) {
matchIndex = bestMatchIndex;
} else {
return {
success: false,
error: "Search block did not match any content in the target file",
error: `Search block did not match any content in the target file. Best fuzzy match had similarity of ${(bestScore * 100).toFixed(1)}% (threshold: ${(FUZZY_MATCH_THRESHOLD * 100).toFixed(1)}%)`,
};
}
matchIndex = candidates[0];
}
const matchedLines = resultLines.slice(

View File

@@ -0,0 +1,20 @@
// Typographic characters and the plain text each one is folded to.
const UNICODE_FOLDS: Record<string, string> = {
  "\u2018": "'", // left single quote
  "\u2019": "'", // right single quote
  "\u201C": '"', // left double quote
  "\u201D": '"', // right double quote
  "\u2013": "-", // en dash
  "\u2014": "-", // em dash
  "\u2026": "...", // horizontal ellipsis
  "\u00A0": " ", // non-breaking space
  "\u00AD": "", // soft hyphen (removed)
  "\uFEFF": "", // zero-width no-break space / BOM (removed)
};

// Matches exactly the characters listed in UNICODE_FOLDS.
const UNICODE_FOLD_PATTERN =
  /[\u2018\u2019\u201C\u201D\u2013\u2014\u2026\u00A0\u00AD\uFEFF]/g;

/**
 * Normalizes text for comparison by folding smart quotes, en/em dashes, the
 * ellipsis character and non-breaking spaces to their ASCII equivalents, and
 * by removing soft hyphens and zero-width no-break spaces.
 *
 * @param text - Text to normalize.
 * @returns The normalized text; all other characters are left untouched.
 */
export function normalizeString(text: string): string {
  // Single pass: no replacement output contains a character that is itself
  // folded, so this is equivalent to applying the folds one after another.
  return text.replace(UNICODE_FOLD_PATTERN, (ch) => UNICODE_FOLDS[ch] ?? ch);
}