Fuzzy turbo edits (#1700)
<!-- CURSOR_SUMMARY --> > [!NOTE] > Makes search-replace tolerant to typos/whitespace via two-pass fuzzy matching with Levenshtein and smart-quote normalization, with clearer errors and tests. > > - **Search/Replace Processor (`src/pro/main/ipc/processors/search_replace_processor.ts`)** > - Add two-pass fuzzy matching: quick exact-line prefilter + Levenshtein scoring (`fastest-levenshtein`). > - Match flow: exact > whitespace-insensitive > fuzzy; detect ambiguity and improve error messages with similarity %. > - Introduce thresholds and limits: `FUZZY_MATCH_THRESHOLD=0.9`, `EARLY_STOP_THRESHOLD=0.95`, `MAX_FUZZY_SEARCH_TIME_MS=10000`. > - Normalize text before scoring using `normalizeString` (handles quotes, dashes, ellipsis, NBSP, soft hyphen, BOM). > - Preserve indentation for replacements; keep existing unescape/validation logic. > - **Utils (`src/utils/text_normalization.ts`)** > - New `normalizeString` helper for Unicode normalization used by fuzzy matching. > - **Tests (`src/pro/main/ipc/processors/search_replace_processor.test.ts`)** > - Add cases for typos, smart quotes, below-threshold failure, exact-over-fuzzy preference, whitespace differences, and ambiguity. > - **Dependencies** > - Add `fastest-levenshtein` to `package.json`. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 289dab915c37bc4f9ab4bf0209ff3f95a57341fc. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY --> <!-- This is an auto-generated description by cubic. --> --- ## Summary by cubic Makes applySearchReplace more tolerant by preferring exact and whitespace-insensitive matches, then falling back to fast two-pass Levenshtein-based fuzzy matching with Unicode normalization. Improves failure messages and adds tests to cover common edit scenarios. - **New Features** - Exact > whitespace-insensitive > fuzzy matching flow (threshold 0.9). 
- Two-pass fuzzy search with exact-line prefilter; early stop at 0.95 and 10s timeout. - normalizeString handles smart quotes, dashes, ellipsis, NBSPs, soft hyphen, and BOM. - Errors now report best fuzzy similarity when below threshold. - Tests for typos, smart quotes, below-threshold cases, exact-over-fuzzy, whitespace differences, and ambiguity. - **Dependencies** - Add fastest-levenshtein. <sup>Written for commit 289dab915c37bc4f9ab4bf0209ff3f95a57341fc. Summary will update automatically on new commits.</sup> <!-- End of auto-generated description by cubic. -->
This commit is contained in:
169
src/pro/main/ipc/processors/search_replace_processor.test.ts
Normal file
169
src/pro/main/ipc/processors/search_replace_processor.test.ts
Normal file
@@ -0,0 +1,169 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { applySearchReplace } from "./search_replace_processor";
|
||||
|
||||
// Behavioural tests for applySearchReplace. Each test feeds an original text
// plus a single SEARCH/REPLACE diff block and checks the reported outcome.
describe("applySearchReplace", () => {
  describe("fuzzy matching with Levenshtein distance", () => {
    it("should match content with minor typos", () => {
      const originalContent = `function hello() {
console.log("Hello, World!");
return true;
}`;

      // Search block has a typo: "consle" instead of "console"
      const diffContent = `<<<<<<< SEARCH
function hello() {
consle.log("Hello, World!");
return true;
}
=======
function hello() {
console.log("Hello, Universe!");
return true;
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("Hello, Universe!");
    });

    it("should match content with smart quotes normalized", () => {
      const originalContent = `function greet() {
console.log("Hello");
}`;

      // Search block uses smart quotes (U+201C / U+201D) around Hello. They are
      // written as escapes so the test cannot silently degrade to plain ASCII
      // quotes, which would only exercise the exact-match path.
      const diffContent = `<<<<<<< SEARCH
function greet() {
console.log(\u201CHello\u201D);
}
=======
function greet() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("Goodbye");
    });

    it("should fail when similarity is below threshold", () => {
      const originalContent = `function hello() {
console.log("Hello, World!");
return true;
}`;

      // Search block is too different (multiple typos and changes)
      const diffContent = `<<<<<<< SEARCH
function goodbye() {
consle.error("Bye, Earth!");
return false;
}
=======
function hello() {
console.log("Hello, Universe!");
return true;
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(false);
      expect(result.error).toContain("Best fuzzy match had similarity");
    });

    it("should prefer exact match over fuzzy match", () => {
      const originalContent = `function hello() {
console.log("Hello");
}

function hello() {
consle.log("Hello");
}`;

      // Should match the first exact occurrence, not the fuzzy one
      const diffContent = `<<<<<<< SEARCH
function hello() {
console.log("Hello");
}
=======
function hello() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      // Should only replace the first exact match
      expect(result.content).toContain('console.log("Goodbye")');
      expect(result.content).toContain('consle.log("Hello")');
    });

    it("should handle whitespace differences with lenient matching before fuzzy", () => {
      const originalContent = `function test() {
console.log("test");
}`;

      // Different indentation: the body line is indented in the search block
      // but not in the original, so an exact match is impossible.
      const diffContent = `<<<<<<< SEARCH
function test() {
    console.log("test");
}
=======
function test() {
console.log("updated");
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("updated");
    });
  });

  describe("existing functionality", () => {
    it("should handle exact matches", () => {
      const originalContent = `function hello() {
console.log("Hello");
}`;

      const diffContent = `<<<<<<< SEARCH
function hello() {
console.log("Hello");
}
=======
function hello() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(true);
      expect(result.content).toContain("Goodbye");
    });

    it("should detect ambiguous matches", () => {
      // The same function appears twice, so the search block matches both.
      const originalContent = `function hello() {
console.log("Hello");
}

function hello() {
console.log("Hello");
}`;

      const diffContent = `<<<<<<< SEARCH
function hello() {
console.log("Hello");
}
=======
function hello() {
console.log("Goodbye");
}
>>>>>>> REPLACE`;

      const result = applySearchReplace(originalContent, diffContent);
      expect(result.success).toBe(false);
      expect(result.error).toContain("ambiguous");
    });
  });
});
|
||||
@@ -1,6 +1,17 @@
|
||||
/* eslint-disable no-irregular-whitespace */
|
||||
|
||||
import { parseSearchReplaceBlocks } from "@/pro/shared/search_replace_parser";
|
||||
import { distance } from "fastest-levenshtein";
|
||||
import { normalizeString } from "@/utils/text_normalization";
|
||||
|
||||
/** Lowest similarity ratio (0 to 1, with 1 meaning identical) accepted as a fuzzy match. */
const FUZZY_MATCH_THRESHOLD = 0.9;

/** Similarity ratio at which the fuzzy search stops looking for a better match. */
const EARLY_STOP_THRESHOLD = 0.95;

/** Time budget for one fuzzy search, in milliseconds (10 seconds). */
const MAX_FUZZY_SEARCH_TIME_MS = 10 * 1000;
|
||||
|
||||
function unescapeMarkers(content: string): string {
|
||||
return content
|
||||
@@ -9,6 +20,148 @@ function unescapeMarkers(content: string): string {
|
||||
.replace(/^\\>>>>>>>/gm, ">>>>>>>");
|
||||
}
|
||||
|
||||
/**
 * Scores how closely `search` resembles `original`.
 *
 * Both inputs are passed through `normalizeString` first; the score is then
 * derived from the Levenshtein edit distance between the normalized strings.
 *
 * @param original - Text taken from the target content.
 * @param search - Text being looked for.
 * @returns 1 when the normalized strings are identical, otherwise
 *   `1 - distance / longerLength`. An empty `search` always scores 0.
 */
function getSimilarity(original: string, search: string): number {
  // An empty search block never counts as a match.
  if (search.length === 0) return 0;

  const left = normalizeString(original);
  const right = normalizeString(search);

  // Identical after normalization: skip the edit-distance computation.
  if (left === right) return 1;

  const editDistance = distance(left, right);
  const longerLength = left.length >= right.length ? left.length : right.length;

  return 1 - editDistance / longerLength;
}
|
||||
|
||||
/**
 * Cheap pre-filter score for a candidate window: the fraction of search lines
 * that equal the target line at the same offset once both have been passed
 * through `normalizeString`. No edit distance is computed here.
 *
 * @param targetLines - Lines of the content being searched.
 * @param searchLines - Lines of the search block.
 * @param startIdx - Index in `targetLines` where the candidate window starts.
 * @returns Matching line count divided by `searchLines.length`. Search lines
 *   that would fall past the end of `targetLines` are never counted as matches.
 */
function quickScoreByExactLines(
  targetLines: string[],
  searchLines: string[],
  startIdx: number,
): number {
  // Number of search lines that still have a target line to compare against.
  const comparable = Math.max(
    0,
    Math.min(searchLines.length, targetLines.length - startIdx),
  );

  let matched = 0;
  for (let offset = 0; offset < comparable; offset += 1) {
    const targetLine = normalizeString(targetLines[startIdx + offset]);
    const searchLine = normalizeString(searchLines[offset]);
    if (targetLine === searchLine) {
      matched += 1;
    }
  }

  return matched / searchLines.length;
}
|
||||
|
||||
/**
 * Finds the window of `lines` that most closely resembles `searchChunk`.
 *
 * Works in two passes:
 * 1. Pre-filter: every window start in range is scored with
 *    `quickScoreByExactLines`; windows scoring below 60% are discarded.
 * 2. Detailed: at most the 10 best pre-filter candidates are scored with
 *    `getSimilarity`, and the highest score wins.
 *
 * Both passes share one time budget of `MAX_FUZZY_SEARCH_TIME_MS`. When it is
 * exceeded a warning is logged and the best result found so far is returned.
 *
 * @param lines - Target content, already split into lines.
 * @param searchChunk - Search block as one string; LF and CRLF line endings
 *   are both accepted.
 * @param startIndex - First line index at which a window may start.
 * @param endIndex - Exclusive upper bound; a window must end at or before it.
 * @returns The best similarity found and the line index where that window
 *   starts. When no candidate scores above 0 the result is
 *   `{ bestScore: 0, bestMatchIndex: -1 }`.
 */
function fastFuzzySearch(
  lines: string[],
  searchChunk: string,
  startIndex: number,
  endIndex: number,
): { bestScore: number; bestMatchIndex: number } {
  const searchLines = searchChunk.split(/\r?\n/);
  const searchLen = searchLines.length;

  // Candidate windows are rebuilt below with "\n" joins, so score them against
  // an LF-only copy of the search text. Comparing against the raw chunk would
  // count every "\r" of a CRLF search block as an edit and lower the score.
  const lfSearchChunk = searchLines.join("\n");

  // Track start time for timeout
  const startTime = performance.now();

  // Quick threshold: require at least 60% exact line matches to be a candidate
  const QUICK_THRESHOLD = 0.6;

  // Only the top 10 pre-filter candidates get the expensive comparison
  const MAX_CANDIDATES_TO_CHECK = 10;

  // First pass: collect windows with a high exact-line match ratio
  const candidates: Array<{ index: number; quickScore: number }> = [];

  for (let i = startIndex; i <= endIndex - searchLen; i++) {
    // Check time limit
    const elapsed = performance.now() - startTime;
    if (elapsed > MAX_FUZZY_SEARCH_TIME_MS) {
      console.warn(
        `Fast fuzzy search timed out during pre-filter after ${(elapsed / 1000).toFixed(1)}s`,
      );
      break;
    }

    const quickScore = quickScoreByExactLines(lines, searchLines, i);

    if (quickScore >= QUICK_THRESHOLD) {
      candidates.push({ index: i, quickScore });
    }
  }

  // Sort candidates by quick score (best first)
  candidates.sort((a, b) => b.quickScore - a.quickScore);

  // Second pass: only compute Levenshtein similarity on the top candidates
  let bestScore = 0;
  let bestMatchIndex = -1;

  for (const candidate of candidates.slice(0, MAX_CANDIDATES_TO_CHECK)) {
    // Check time limit
    const elapsed = performance.now() - startTime;
    if (elapsed > MAX_FUZZY_SEARCH_TIME_MS) {
      console.warn(
        `Fast fuzzy search timed out during detailed pass after ${(elapsed / 1000).toFixed(1)}s. Best match: ${(bestScore * 100).toFixed(1)}%`,
      );
      break;
    }

    const originalChunk = lines
      .slice(candidate.index, candidate.index + searchLen)
      .join("\n");

    const similarity = getSimilarity(originalChunk, lfSearchChunk);

    if (similarity > bestScore) {
      bestScore = similarity;
      bestMatchIndex = candidate.index;

      // Early exit if we found a very good match
      if (bestScore >= EARLY_STOP_THRESHOLD) {
        return { bestScore, bestMatchIndex };
      }
    }
  }

  return { bestScore, bestMatchIndex };
}
|
||||
|
||||
export function applySearchReplace(
|
||||
originalContent: string,
|
||||
diffContent: string,
|
||||
@@ -113,14 +266,29 @@ export function applySearchReplace(
|
||||
};
|
||||
}
|
||||
|
||||
if (candidates.length === 0) {
|
||||
if (candidates.length === 1) {
|
||||
matchIndex = candidates[0];
|
||||
}
|
||||
}
|
||||
|
||||
// If still no match, try fuzzy matching with Levenshtein distance
|
||||
if (matchIndex === -1) {
|
||||
const searchChunk = searchLines.join("\n");
|
||||
const { bestScore, bestMatchIndex } = fastFuzzySearch(
|
||||
resultLines,
|
||||
searchChunk,
|
||||
0,
|
||||
resultLines.length,
|
||||
);
|
||||
|
||||
if (bestScore >= FUZZY_MATCH_THRESHOLD) {
|
||||
matchIndex = bestMatchIndex;
|
||||
} else {
|
||||
return {
|
||||
success: false,
|
||||
error: "Search block did not match any content in the target file",
|
||||
error: `Search block did not match any content in the target file. Best fuzzy match had similarity of ${(bestScore * 100).toFixed(1)}% (threshold: ${(FUZZY_MATCH_THRESHOLD * 100).toFixed(1)}%)`,
|
||||
};
|
||||
}
|
||||
|
||||
matchIndex = candidates[0];
|
||||
}
|
||||
|
||||
const matchedLines = resultLines.slice(
|
||||
|
||||
20
src/utils/text_normalization.ts
Normal file
20
src/utils/text_normalization.ts
Normal file
@@ -0,0 +1,20 @@
|
||||
/**
 * Rewrites typographic characters to plain equivalents so that two strings
 * can be compared without those differences getting in the way.
 *
 * @param text - The text to normalize.
 * @returns `text` with every substitution listed below applied, in order.
 */
export function normalizeString(text: string): string {
  // Ordered [pattern, replacement] pairs.
  const substitutions: ReadonlyArray<readonly [RegExp, string]> = [
    [/[\u2018\u2019]/g, "'"], // curly single quotes
    [/[\u201C\u201D]/g, '"'], // curly double quotes
    [/[\u2013\u2014]/g, "-"], // en dash and em dash to hyphen
    [/\u2026/g, "..."], // ellipsis to three dots
    [/\u00A0/g, " "], // non-breaking space to regular space
    [/\u00AD/g, ""], // soft hyphen (removed)
    [/[\uFEFF]/g, ""], // zero-width no-break space (removed)
  ];

  return substitutions.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    text,
  );
}
|
||||
Reference in New Issue
Block a user