Fuzzy turbo edits (#1700)

> [!NOTE] > Makes search-replace tolerant to typos/whitespace via two-pass fuzzy matching with Levenshtein and smart-quote normalization, with clearer errors and tests. > > - **Search/Replace Processor (`src/pro/main/ipc/processors/search_replace_processor.ts`)** > - Add two-pass fuzzy matching: quick exact-line prefilter + Levenshtein scoring (`fastest-levenshtein`). > - Match flow: exact > whitespace-insensitive > fuzzy; detect ambiguity and improve error messages with similarity %. > - Introduce thresholds and limits: `FUZZY_MATCH_THRESHOLD=0.9`, `EARLY_STOP_THRESHOLD=0.95`, `MAX_FUZZY_SEARCH_TIME_MS=10000`. > - Normalize text before scoring using `normalizeString` (handles quotes, dashes, ellipsis, NBSP, soft hyphen, BOM). > - Preserve indentation for replacements; keep existing unescape/validation logic. > - **Utils (`src/utils/text_normalization.ts`)** > - New `normalizeString` helper for Unicode normalization used by fuzzy matching. > - **Tests (`src/pro/main/ipc/processors/search_replace_processor.test.ts`)** > - Add cases for typos, smart quotes, below-threshold failure, exact-over-fuzzy preference, whitespace differences, and ambiguity. > - **Dependencies** > - Add `fastest-levenshtein` to `package.json`. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 289dab915c37bc4f9ab4bf0209ff3f95a57341fc. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>   --- ## Summary by cubic Makes applySearchReplace more tolerant by preferring exact and whitespace-insensitive matches, then falling back to fast two-pass Levenshtein-based fuzzy matching with Unicode normalization. Improves failure messages and adds tests to cover common edit scenarios. - **New Features** - Exact > whitespace-insensitive > fuzzy matching flow (threshold 0.9). - Two-pass fuzzy search with exact-line prefilter; early stop at 0.95 and 10s timeout. - normalizeString handles smart quotes, dashes, ellipsis, NBSPs, soft hyphen, and BOM. - Errors now report best fuzzy similarity when below threshold. - Tests for typos, smart quotes, below-threshold cases, exact-over-fuzzy, whitespace differences, and ambiguity. - **Dependencies** - Add fastest-levenshtein. <sup>Written for commit 289dab915c37bc4f9ab4bf0209ff3f95a57341fc. Summary will update automatically on new commits.</sup>
2025-11-04 09:30:25 -08:00
parent 369149b202
commit ae1ec68453
5 changed files with 374 additions and 6 deletions
--- a/src/pro/main/ipc/processors/search_replace_processor.test.ts
+++ b/src/pro/main/ipc/processors/search_replace_processor.test.ts
@@ -0,0 +1,169 @@
+import { describe, it, expect } from "vitest";
+import { applySearchReplace } from "./search_replace_processor";
+
+describe("applySearchReplace", () => {
+  describe("fuzzy matching with Levenshtein distance", () => {
+    it("should match content with minor typos", () => {
+      const originalContent = `function hello() {
+  console.log("Hello, World!");
+  return true;
+}`;
+
+      // Search block has a typo: "consle" instead of "console"
+      const diffContent = `<<<<<<< SEARCH
+function hello() {
+  consle.log("Hello, World!");
+  return true;
+}
+=======
+function hello() {
+  console.log("Hello, Universe!");
+  return true;
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(true);
+      expect(result.content).toContain("Hello, Universe!");
+    });
+
+    it("should match content with smart quotes normalized", () => {
+      const originalContent = `function greet() {
+  console.log("Hello");
+}`;
+
+      // Search block uses smart quotes
+      const diffContent = `<<<<<<< SEARCH
+function greet() {
+  console.log("Hello");
+}
+=======
+function greet() {
+  console.log("Goodbye");
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(true);
+      expect(result.content).toContain("Goodbye");
+    });
+
+    it("should fail when similarity is below threshold", () => {
+      const originalContent = `function hello() {
+  console.log("Hello, World!");
+  return true;
+}`;
+
+      // Search block is too different (multiple typos and changes)
+      const diffContent = `<<<<<<< SEARCH
+function goodbye() {
+  consle.error("Bye, Earth!");
+  return false;
+}
+=======
+function hello() {
+  console.log("Hello, Universe!");
+  return true;
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(false);
+      expect(result.error).toContain("Best fuzzy match had similarity");
+    });
+
+    it("should prefer exact match over fuzzy match", () => {
+      const originalContent = `function hello() {
+  console.log("Hello");
+}
+
+function hello() {
+  consle.log("Hello");
+}`;
+
+      // Should match the first exact occurrence, not the fuzzy one
+      const diffContent = `<<<<<<< SEARCH
+function hello() {
+  console.log("Hello");
+}
+=======
+function hello() {
+  console.log("Goodbye");
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(true);
+      // Should only replace the first exact match
+      expect(result.content).toContain('console.log("Goodbye")');
+      expect(result.content).toContain('consle.log("Hello")');
+    });
+
+    it("should handle whitespace differences with lenient matching before fuzzy", () => {
+      const originalContent = `function test() {
+    console.log("test");
+}`;
+
+      // Different indentation
+      const diffContent = `<<<<<<< SEARCH
+function test() {
+  console.log("test");
+}
+=======
+function test() {
+  console.log("updated");
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(true);
+      expect(result.content).toContain("updated");
+    });
+  });
+
+  describe("existing functionality", () => {
+    it("should handle exact matches", () => {
+      const originalContent = `function hello() {
+  console.log("Hello");
+}`;
+
+      const diffContent = `<<<<<<< SEARCH
+function hello() {
+  console.log("Hello");
+}
+=======
+function hello() {
+  console.log("Goodbye");
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(true);
+      expect(result.content).toContain("Goodbye");
+    });
+
+    it("should detect ambiguous matches", () => {
+      const originalContent = `function hello() {
+  console.log("Hello");
+}
+
+function hello() {
+  console.log("Hello");
+}`;
+
+      const diffContent = `<<<<<<< SEARCH
+function hello() {
+  console.log("Hello");
+}
+=======
+function hello() {
+  console.log("Goodbye");
+}
+>>>>>>> REPLACE`;
+
+      const result = applySearchReplace(originalContent, diffContent);
+      expect(result.success).toBe(false);
+      expect(result.error).toContain("ambiguous");
+    });
+  });
+});
--- a/src/pro/main/ipc/processors/search_replace_processor.ts
+++ b/src/pro/main/ipc/processors/search_replace_processor.ts
@@ -1,6 +1,17 @@
 /* eslint-disable no-irregular-whitespace */

 import { parseSearchReplaceBlocks } from "@/pro/shared/search_replace_parser";
+import { distance } from "fastest-levenshtein";
+import { normalizeString } from "@/utils/text_normalization";
+
+// Minimum similarity threshold for fuzzy matching (0 to 1, where 1 is exact match)
+const FUZZY_MATCH_THRESHOLD = 0.9;
+
+// Early termination threshold - stop searching if we find a match this good
+const EARLY_STOP_THRESHOLD = 0.95;
+
+// Maximum time to spend on fuzzy matching (in milliseconds)
+const MAX_FUZZY_SEARCH_TIME_MS = 10_000; // 10 seconds

 function unescapeMarkers(content: string): string {
  return content
@@ -9,6 +20,148 @@ function unescapeMarkers(content: string): string {
    .replace(/^\\>>>>>>>/gm, ">>>>>>>");
 }

+/**
+ * Calculate similarity between two strings using Levenshtein distance
+ * Returns a value between 0 and 1, where 1 is an exact match
+ */
+function getSimilarity(original: string, search: string): number {
+  // Empty searches are no longer supported
+  if (search === "") {
+    return 0;
+  }
+
+  // Use the normalizeString utility to handle smart quotes and other special characters
+  const normalizedOriginal = normalizeString(original);
+  const normalizedSearch = normalizeString(search);
+
+  if (normalizedOriginal === normalizedSearch) {
+    return 1;
+  }
+
+  // Calculate Levenshtein distance using fastest-levenshtein's distance function
+  const dist = distance(normalizedOriginal, normalizedSearch);
+
+  // Calculate similarity ratio (0 to 1, where 1 is an exact match)
+  const maxLength = Math.max(
+    normalizedOriginal.length,
+    normalizedSearch.length,
+  );
+  return 1 - dist / maxLength;
+}
+
+/**
+ * Quick scoring function that counts how many lines exactly match.
+ * This is much faster than Levenshtein and serves as a good pre-filter.
+ */
+function quickScoreByExactLines(
+  targetLines: string[],
+  searchLines: string[],
+  startIdx: number,
+): number {
+  let exactMatches = 0;
+
+  for (let i = 0; i < searchLines.length; i++) {
+    if (startIdx + i >= targetLines.length) break;
+
+    if (
+      normalizeString(targetLines[startIdx + i]) ===
+      normalizeString(searchLines[i])
+    ) {
+      exactMatches++;
+    }
+  }
+
+  return exactMatches / searchLines.length;
+}
+
+/**
+ * Fast fuzzy search using a two-pass approach:
+ * 1. Quick pre-filter pass: Count exact line matches (fast)
+ * 2. Detailed pass: Only compute Levenshtein on promising candidates (expensive)
+ *
+ * The key insight: If two blocks are similar enough for fuzzy matching (e.g., 90%),
+ * then likely at least 60% of their lines will match exactly.
+ */
+function fastFuzzySearch(
+  lines: string[],
+  searchChunk: string,
+  startIndex: number,
+  endIndex: number,
+) {
+  const searchLines = searchChunk.split(/\r?\n/);
+  const searchLen = searchLines.length;
+
+  // Track start time for timeout
+  const startTime = performance.now();
+
+  // Quick threshold: require at least 60% exact line matches to be a candidate
+  const QUICK_THRESHOLD = 0.6;
+
+  // First pass: find candidates with high exact line match ratio (very fast)
+  const candidates: Array<{ index: number; quickScore: number }> = [];
+
+  for (let i = startIndex; i <= endIndex - searchLen; i++) {
+    // Check time limit
+    const elapsed = performance.now() - startTime;
+    if (elapsed > MAX_FUZZY_SEARCH_TIME_MS) {
+      console.warn(
+        `Fast fuzzy search timed out during pre-filter after ${(elapsed / 1000).toFixed(1)}s`,
+      );
+      break;
+    }
+
+    const quickScore = quickScoreByExactLines(lines, searchLines, i);
+
+    if (quickScore >= QUICK_THRESHOLD) {
+      candidates.push({ index: i, quickScore });
+    }
+  }
+
+  // Sort candidates by quick score (best first)
+  candidates.sort((a, b) => b.quickScore - a.quickScore);
+
+  // Second pass: only compute expensive Levenshtein on top candidates
+  let bestScore = 0;
+  let bestMatchIndex = -1;
+
+  const MAX_CANDIDATES_TO_CHECK = 10; // Only check top 10 candidates
+
+  for (
+    let i = 0;
+    i < Math.min(candidates.length, MAX_CANDIDATES_TO_CHECK);
+    i++
+  ) {
+    const candidate = candidates[i];
+
+    // Check time limit
+    const elapsed = performance.now() - startTime;
+    if (elapsed > MAX_FUZZY_SEARCH_TIME_MS) {
+      console.warn(
+        `Fast fuzzy search timed out during detailed pass after ${(elapsed / 1000).toFixed(1)}s. Best match: ${(bestScore * 100).toFixed(1)}%`,
+      );
+      break;
+    }
+
+    const originalChunk = lines
+      .slice(candidate.index, candidate.index + searchLen)
+      .join("\n");
+
+    const similarity = getSimilarity(originalChunk, searchChunk);
+
+    if (similarity > bestScore) {
+      bestScore = similarity;
+      bestMatchIndex = candidate.index;
+
+      // Early exit if we found a very good match
+      if (bestScore >= EARLY_STOP_THRESHOLD) {
+        return { bestScore, bestMatchIndex };
+      }
+    }
+  }
+
+  return { bestScore, bestMatchIndex };
+}
+
 export function applySearchReplace(
  originalContent: string,
  diffContent: string,
@@ -113,14 +266,29 @@ export function applySearchReplace(
        };
      }

-      if (candidates.length === 0) {
+      if (candidates.length === 1) {
+        matchIndex = candidates[0];
+      }
+    }
+
+    // If still no match, try fuzzy matching with Levenshtein distance
+    if (matchIndex === -1) {
+      const searchChunk = searchLines.join("\n");
+      const { bestScore, bestMatchIndex } = fastFuzzySearch(
+        resultLines,
+        searchChunk,
+        0,
+        resultLines.length,
+      );
+
+      if (bestScore >= FUZZY_MATCH_THRESHOLD) {
+        matchIndex = bestMatchIndex;
+      } else {
        return {
          success: false,
-          error: "Search block did not match any content in the target file",
+          error: `Search block did not match any content in the target file. Best fuzzy match had similarity of ${(bestScore * 100).toFixed(1)}% (threshold: ${(FUZZY_MATCH_THRESHOLD * 100).toFixed(1)}%)`,
        };
      }
-
-      matchIndex = candidates[0];
    }

    const matchedLines = resultLines.slice(
--- a/src/utils/text_normalization.ts
+++ b/src/utils/text_normalization.ts
@@ -0,0 +1,20 @@
+/**
+ * Normalizes text for comparison by handling smart quotes and other special characters
+ */
+export function normalizeString(text: string): string {
+  return (
+    text
+      // Normalize smart quotes to regular quotes
+      .replace(/[\u2018\u2019]/g, "'") // Single quotes
+      .replace(/[\u201C\u201D]/g, '"') // Double quotes
+      // Normalize different types of dashes
+      .replace(/[\u2013\u2014]/g, "-") // En dash and em dash to hyphen
+      // Normalize ellipsis
+      .replace(/\u2026/g, "...") // Ellipsis to three dots
+      // Normalize non-breaking spaces
+      .replace(/\u00A0/g, " ") // Non-breaking space to regular space
+      // Normalize other common Unicode variants
+      .replace(/\u00AD/g, "") // Soft hyphen (remove)
+      .replace(/[\uFEFF]/g, "")
+  ); // Zero-width no-break space (remove)
+}