Initial commit: New MoreminiMore website with fresh design
This commit is contained in:
281
node_modules/diff/libesm/diff/word.js
generated
vendored
Normal file
281
node_modules/diff/libesm/diff/word.js
generated
vendored
Normal file
@@ -0,0 +1,281 @@
|
||||
import Diff from './base.js';
|
||||
import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs, leadingAndTrailingWs, segment } from '../util/string.js';
|
||||
// Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
|
||||
//
|
||||
// Chars/ranges counted as "word" characters by this regex are as follows:
|
||||
//
|
||||
// + U+00AD Soft hyphen
|
||||
// + 00C0–00FF (letters with diacritics from the Latin-1 Supplement), except:
|
||||
// - U+00D7 × Multiplication sign
|
||||
// - U+00F7 ÷ Division sign
|
||||
// + Latin Extended-A, 0100–017F
|
||||
// + Latin Extended-B, 0180–024F
|
||||
// + IPA Extensions, 0250–02AF
|
||||
// + Spacing Modifier Letters, 02B0–02FF, except:
|
||||
// - U+02C7 ˇ ˇ Caron
|
||||
// - U+02D8 ˘ ˘ Breve
|
||||
// - U+02D9 ˙ ˙ Dot Above
|
||||
// - U+02DA ˚ ˚ Ring Above
|
||||
// - U+02DB ˛ ˛ Ogonek
|
||||
// - U+02DC ˜ ˜ Small Tilde
|
||||
// - U+02DD ˝ ˝ Double Acute Accent
|
||||
// + Latin Extended Additional, 1E00–1EFF
|
||||
const extendedWordChars = 'a-zA-Z0-9_\\u{AD}\\u{C0}-\\u{D6}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}';
|
||||
// Each token is one of the following:
|
||||
// - A punctuation mark plus the surrounding whitespace
|
||||
// - A word plus the surrounding whitespace
|
||||
// - Pure whitespace (but only in the special case where the entire text
|
||||
// is just whitespace)
|
||||
//
|
||||
// We have to include surrounding whitespace in the tokens because the two
|
||||
// alternative approaches produce horribly broken results:
|
||||
// * If we just discard the whitespace, we can't fully reproduce the original
|
||||
// text from the sequence of tokens and any attempt to render the diff will
|
||||
// get the whitespace wrong.
|
||||
// * If we have separate tokens for whitespace, then in a typical text every
|
||||
// second token will be a single space character. But this often results in
|
||||
// the optimal diff between two texts being a perverse one that preserves
|
||||
// the spaces between words but deletes and reinserts actual common words.
|
||||
// See https://github.com/kpdecker/jsdiff/issues/160#issuecomment-1866099640
|
||||
// for an example.
|
||||
//
|
||||
// Keeping the surrounding whitespace of course has implications for .equals
|
||||
// and .join, not just .tokenize.
|
||||
// This regex does NOT fully implement the tokenization rules described above.
|
||||
// Instead, it gives runs of whitespace their own "token". The tokenize method
|
||||
// then handles stitching whitespace tokens onto adjacent word or punctuation
|
||||
// tokens.
|
||||
const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug');
|
||||
class WordDiff extends Diff {
|
||||
equals(left, right, options) {
|
||||
if (options.ignoreCase) {
|
||||
left = left.toLowerCase();
|
||||
right = right.toLowerCase();
|
||||
}
|
||||
return left.trim() === right.trim();
|
||||
}
|
||||
tokenize(value, options = {}) {
|
||||
let parts;
|
||||
if (options.intlSegmenter) {
|
||||
const segmenter = options.intlSegmenter;
|
||||
if (segmenter.resolvedOptions().granularity != 'word') {
|
||||
throw new Error('The segmenter passed must have a granularity of "word"');
|
||||
}
|
||||
// We want `parts` to be an array whose elements alternate between being
|
||||
// pure whitespace and being pure non-whitespace. This is ALMOST what the
|
||||
// segments returned by a word-based Intl.Segmenter already look like,
|
||||
// but not quite - see explanation in the docs of our custom segment()
|
||||
// function.
|
||||
parts = segment(value, segmenter);
|
||||
}
|
||||
else {
|
||||
parts = value.match(tokenizeIncludingWhitespace) || [];
|
||||
}
|
||||
const tokens = [];
|
||||
let prevPart = null;
|
||||
parts.forEach(part => {
|
||||
if ((/\s/).test(part)) {
|
||||
if (prevPart == null) {
|
||||
tokens.push(part);
|
||||
}
|
||||
else {
|
||||
tokens.push(tokens.pop() + part);
|
||||
}
|
||||
}
|
||||
else if (prevPart != null && (/\s/).test(prevPart)) {
|
||||
if (tokens[tokens.length - 1] == prevPart) {
|
||||
tokens.push(tokens.pop() + part);
|
||||
}
|
||||
else {
|
||||
tokens.push(prevPart + part);
|
||||
}
|
||||
}
|
||||
else {
|
||||
tokens.push(part);
|
||||
}
|
||||
prevPart = part;
|
||||
});
|
||||
return tokens;
|
||||
}
|
||||
join(tokens) {
|
||||
// Tokens being joined here will always have appeared consecutively in the
|
||||
// same text, so we can simply strip off the leading whitespace from all the
|
||||
// tokens except the first (and except any whitespace-only tokens - but such
|
||||
// a token will always be the first and only token anyway) and then join them
|
||||
// and the whitespace around words and punctuation will end up correct.
|
||||
return tokens.map((token, i) => {
|
||||
if (i == 0) {
|
||||
return token;
|
||||
}
|
||||
else {
|
||||
return token.replace((/^\s+/), '');
|
||||
}
|
||||
}).join('');
|
||||
}
|
||||
postProcess(changes, options) {
|
||||
if (!changes || options.oneChangePerToken) {
|
||||
return changes;
|
||||
}
|
||||
let lastKeep = null;
|
||||
// Change objects representing any insertion or deletion since the last
|
||||
// "keep" change object. There can be at most one of each.
|
||||
let insertion = null;
|
||||
let deletion = null;
|
||||
changes.forEach(change => {
|
||||
if (change.added) {
|
||||
insertion = change;
|
||||
}
|
||||
else if (change.removed) {
|
||||
deletion = change;
|
||||
}
|
||||
else {
|
||||
if (insertion || deletion) { // May be false at start of text
|
||||
dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change, options.intlSegmenter);
|
||||
}
|
||||
lastKeep = change;
|
||||
insertion = null;
|
||||
deletion = null;
|
||||
}
|
||||
});
|
||||
if (insertion || deletion) {
|
||||
dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null, options.intlSegmenter);
|
||||
}
|
||||
return changes;
|
||||
}
|
||||
}
|
||||
export const wordDiff = new WordDiff();
|
||||
export function diffWords(oldStr, newStr, options) {
|
||||
// This option has never been documented and never will be (it's clearer to
|
||||
// just call `diffWordsWithSpace` directly if you need that behavior), but
|
||||
// has existed in jsdiff for a long time, so we retain support for it here
|
||||
// for the sake of backwards compatibility.
|
||||
if ((options === null || options === void 0 ? void 0 : options.ignoreWhitespace) != null && !options.ignoreWhitespace) {
|
||||
return diffWordsWithSpace(oldStr, newStr, options);
|
||||
}
|
||||
return wordDiff.diff(oldStr, newStr, options);
|
||||
}
|
||||
function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep, segmenter) {
|
||||
// Before returning, we tidy up the leading and trailing whitespace of the
|
||||
// change objects to eliminate cases where trailing whitespace in one object
|
||||
// is repeated as leading whitespace in the next.
|
||||
// Below are examples of the outcomes we want here to explain the code.
|
||||
// I=insert, K=keep, D=delete
|
||||
// 1. diffing 'foo bar baz' vs 'foo baz'
|
||||
// Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz'
|
||||
// After cleanup, we want: K:'foo ' D:'bar ' K:'baz'
|
||||
//
|
||||
// 2. Diffing 'foo bar baz' vs 'foo qux baz'
|
||||
// Prior to cleanup, we have K:'foo ' D:' bar ' I:' qux ' K:' baz'
|
||||
// After cleanup, we want K:'foo ' D:'bar' I:'qux' K:' baz'
|
||||
//
|
||||
// 3. Diffing 'foo\nbar baz' vs 'foo baz'
|
||||
// Prior to cleanup, we have K:'foo ' D:'\nbar ' K:' baz'
|
||||
// After cleanup, we want K'foo' D:'\nbar' K:' baz'
|
||||
//
|
||||
// 4. Diffing 'foo baz' vs 'foo\nbar baz'
|
||||
// Prior to cleanup, we have K:'foo\n' I:'\nbar ' K:' baz'
|
||||
// After cleanup, we ideally want K'foo' I:'\nbar' K:' baz'
|
||||
// but don't actually manage this currently (the pre-cleanup change
|
||||
// objects don't contain enough information to make it possible).
|
||||
//
|
||||
// 5. Diffing 'foo bar baz' vs 'foo baz'
|
||||
// Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz'
|
||||
// After cleanup, we want K:'foo ' D:' bar ' K:'baz'
|
||||
//
|
||||
// Our handling is unavoidably imperfect in the case where there's a single
|
||||
// indel between keeps and the whitespace has changed. For instance, consider
|
||||
// diffing 'foo\tbar\nbaz' vs 'foo baz'. Unless we create an extra change
|
||||
// object to represent the insertion of the space character (which isn't even
|
||||
// a token), we have no way to avoid losing information about the texts'
|
||||
// original whitespace in the result we return. Still, we do our best to
|
||||
// output something that will look sensible if we e.g. print it with
|
||||
// insertions in green and deletions in red.
|
||||
// Between two "keep" change objects (or before the first or after the last
|
||||
// change object), we can have either:
|
||||
// * A "delete" followed by an "insert"
|
||||
// * Just an "insert"
|
||||
// * Just a "delete"
|
||||
// We handle the three cases separately.
|
||||
if (deletion && insertion) {
|
||||
const [oldWsPrefix, oldWsSuffix] = leadingAndTrailingWs(deletion.value, segmenter);
|
||||
const [newWsPrefix, newWsSuffix] = leadingAndTrailingWs(insertion.value, segmenter);
|
||||
if (startKeep) {
|
||||
const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix);
|
||||
startKeep.value = replaceSuffix(startKeep.value, newWsPrefix, commonWsPrefix);
|
||||
deletion.value = removePrefix(deletion.value, commonWsPrefix);
|
||||
insertion.value = removePrefix(insertion.value, commonWsPrefix);
|
||||
}
|
||||
if (endKeep) {
|
||||
const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix);
|
||||
endKeep.value = replacePrefix(endKeep.value, newWsSuffix, commonWsSuffix);
|
||||
deletion.value = removeSuffix(deletion.value, commonWsSuffix);
|
||||
insertion.value = removeSuffix(insertion.value, commonWsSuffix);
|
||||
}
|
||||
}
|
||||
else if (insertion) {
|
||||
// The whitespaces all reflect what was in the new text rather than
|
||||
// the old, so we essentially have no information about whitespace
|
||||
// insertion or deletion. We just want to dedupe the whitespace.
|
||||
// We do that by having each change object keep its trailing
|
||||
// whitespace and deleting duplicate leading whitespace where
|
||||
// present.
|
||||
if (startKeep) {
|
||||
const ws = leadingWs(insertion.value, segmenter);
|
||||
insertion.value = insertion.value.substring(ws.length);
|
||||
}
|
||||
if (endKeep) {
|
||||
const ws = leadingWs(endKeep.value, segmenter);
|
||||
endKeep.value = endKeep.value.substring(ws.length);
|
||||
}
|
||||
// otherwise we've got a deletion and no insertion
|
||||
}
|
||||
else if (startKeep && endKeep) {
|
||||
const newWsFull = leadingWs(endKeep.value, segmenter), [delWsStart, delWsEnd] = leadingAndTrailingWs(deletion.value, segmenter);
|
||||
// Any whitespace that comes straight after startKeep in both the old and
|
||||
// new texts, assign to startKeep and remove from the deletion.
|
||||
const newWsStart = longestCommonPrefix(newWsFull, delWsStart);
|
||||
deletion.value = removePrefix(deletion.value, newWsStart);
|
||||
// Any whitespace that comes straight before endKeep in both the old and
|
||||
// new texts, and hasn't already been assigned to startKeep, assign to
|
||||
// endKeep and remove from the deletion.
|
||||
const newWsEnd = longestCommonSuffix(removePrefix(newWsFull, newWsStart), delWsEnd);
|
||||
deletion.value = removeSuffix(deletion.value, newWsEnd);
|
||||
endKeep.value = replacePrefix(endKeep.value, newWsFull, newWsEnd);
|
||||
// If there's any whitespace from the new text that HASN'T already been
|
||||
// assigned, assign it to the start:
|
||||
startKeep.value = replaceSuffix(startKeep.value, newWsFull, newWsFull.slice(0, newWsFull.length - newWsEnd.length));
|
||||
}
|
||||
else if (endKeep) {
|
||||
// We are at the start of the text. Preserve all the whitespace on
|
||||
// endKeep, and just remove whitespace from the end of deletion to the
|
||||
// extent that it overlaps with the start of endKeep.
|
||||
const endKeepWsPrefix = leadingWs(endKeep.value, segmenter);
|
||||
const deletionWsSuffix = trailingWs(deletion.value, segmenter);
|
||||
const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix);
|
||||
deletion.value = removeSuffix(deletion.value, overlap);
|
||||
}
|
||||
else if (startKeep) {
|
||||
// We are at the END of the text. Preserve all the whitespace on
|
||||
// startKeep, and just remove whitespace from the start of deletion to
|
||||
// the extent that it overlaps with the end of startKeep.
|
||||
const startKeepWsSuffix = trailingWs(startKeep.value, segmenter);
|
||||
const deletionWsPrefix = leadingWs(deletion.value, segmenter);
|
||||
const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix);
|
||||
deletion.value = removePrefix(deletion.value, overlap);
|
||||
}
|
||||
}
|
||||
class WordsWithSpaceDiff extends Diff {
|
||||
tokenize(value) {
|
||||
// Slightly different to the tokenizeIncludingWhitespace regex used above in
|
||||
// that this one treats each individual newline as a distinct token, rather
|
||||
// than merging them into other surrounding whitespace. This was requested
|
||||
// in https://github.com/kpdecker/jsdiff/issues/180 &
|
||||
// https://github.com/kpdecker/jsdiff/issues/211
|
||||
const regex = new RegExp(`(\\r?\\n)|[${extendedWordChars}]+|[^\\S\\n\\r]+|[^${extendedWordChars}]`, 'ug');
|
||||
return value.match(regex) || [];
|
||||
}
|
||||
}
|
||||
export const wordsWithSpaceDiff = new WordsWithSpaceDiff();
|
||||
export function diffWordsWithSpace(oldStr, newStr, options) {
|
||||
return wordsWithSpaceDiff.diff(oldStr, newStr, options);
|
||||
}
|
||||
Reference in New Issue
Block a user