Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"dependencies": {
"@bundle-stats/plugin-webpack-filter": "^4.21.4",
"@bundle-stats/plugin-webpack-validate": "^4.21.4",
"serialize-query-params": "2.0.2"
"serialize-query-params": "2.0.2",
"text-similarity-node": "^1.0.1"
},
"devDependencies": {
"@types/jest": "29.5.14",
Expand Down
237 changes: 237 additions & 0 deletions packages/utils/src/utils/__tests__/string-similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
import { compareTwoStrings, extractBestCandidates, compareWithCosine } from '../string-similarity';

describe('string-similarity', () => {
describe('compareTwoStrings', () => {
  // Identical inputs must score exactly 1.
  test('should return 1 for identical strings', () => {
    expect(compareTwoStrings('hello', 'hello')).toBe(1);
    expect(compareTwoStrings('test-file.js', 'test-file.js')).toBe(1);
  });

  // The assertions only bound the score to [0, 0.5); the title states that
  // instead of promising an exact 0.
  test('should return low similarity for completely different strings', () => {
    const result = compareTwoStrings('abc', 'xyz');
    expect(result).toBeGreaterThanOrEqual(0);
    expect(result).toBeLessThan(0.5);
  });

  test('should return high similarity for similar strings', () => {
    const result = compareTwoStrings('hello', 'hallo');
    expect(result).toBeGreaterThan(0.8); // Jaro-Winkler is good at detecting typos
  });

  test('should handle case sensitivity', () => {
    // Case-insensitive (default): inputs differing only by case count as identical.
    const caseInsensitive = compareTwoStrings('Hello', 'hello', false);
    expect(caseInsensitive).toBe(1);

    // Case-sensitive: the same pair must no longer be a perfect match.
    const caseSensitive = compareTwoStrings('Hello', 'hello', true);
    expect(caseSensitive).toBeLessThan(1);
  });

  // An empty string on either side (or both) short-circuits to 0.
  test('should return 0 for empty strings', () => {
    expect(compareTwoStrings('', 'hello')).toBe(0);
    expect(compareTwoStrings('hello', '')).toBe(0);
    expect(compareTwoStrings('', '')).toBe(0);
  });

  test('should work with file paths', () => {
    const path1 = 'static/js/main.abc123.chunk.js';
    const path2 = 'static/js/main.def456.chunk.js';
    const result = compareTwoStrings(path1, path2);
    expect(result).toBeGreaterThan(0.8); // High similarity despite different hashes
  });

  test('should work with webpack module names', () => {
    const module1 = './node_modules/react/index.js';
    const module2 = './node_modules/react/index.jsx';
    const result = compareTwoStrings(module1, module2);
    expect(result).toBeGreaterThan(0.9); // Very similar, just different extension
  });
});

describe('extractBestCandidates', () => {
  test('should find the best match from candidates', () => {
    const { bestMatch, bestMatchIndex } = extractBestCandidates('hello', [
      'hallo',
      'world',
      'help',
    ]);

    expect(bestMatch.target).toBe('hallo');
    expect(bestMatch.rating).toBeGreaterThan(0.8);
    expect(bestMatchIndex).toBe(0);
  });

  test('should return all ratings', () => {
    const { ratings } = extractBestCandidates('test', ['test', 'best', 'rest', 'west']);

    expect(ratings).toHaveLength(4);
    // The first candidate equals the main string, so it is an exact match.
    expect(ratings[0].target).toBe('test');
    expect(ratings[0].rating).toBe(1);
  });

  test('should handle file paths with hashes', () => {
    const { bestMatch } = extractBestCandidates('main.abc123.js', [
      'main.def456.js',
      'vendor.abc123.js',
      'runtime.xyz789.js',
    ]);

    expect(bestMatch.target).toBe('main.def456.js');
    expect(bestMatch.rating).toBeGreaterThan(0.7);
  });

  test('should handle webpack chunk names', () => {
    const { bestMatch, ratings } = extractBestCandidates('static/js/2.abc123.chunk.js', [
      'static/js/2.def456.chunk.js',
      'static/js/3.abc123.chunk.js',
      'static/css/2.abc123.chunk.css',
    ]);

    // Either JS candidate may win; only the overall quality of the winner is pinned.
    expect(bestMatch.rating).toBeGreaterThan(0.8);
    expect(ratings).toHaveLength(3);

    // The better of the two static/js candidates must outrank the static/css one.
    const sameIdJsRating = ratings[0].rating;
    const sameHashJsRating = ratings[1].rating;
    const cssRating = ratings[2].rating;

    expect(Math.max(sameIdJsRating, sameHashJsRating)).toBeGreaterThan(cssRating);
  });

  test('should handle module paths', () => {
    const { bestMatch } = extractBestCandidates('./node_modules/lodash/get.js', [
      './node_modules/lodash/set.js',
      './node_modules/lodash/get.js',
      './node_modules/react/index.js',
    ]);

    // The identical path is among the candidates, so it must win with a perfect score.
    expect(bestMatch.target).toBe('./node_modules/lodash/get.js');
    expect(bestMatch.rating).toBe(1);
  });

  test('should handle empty candidates array', () => {
    const { ratings, bestMatch, bestMatchIndex } = extractBestCandidates('test', []);

    expect(ratings).toHaveLength(0);
    expect(bestMatch.target).toBe('');
    expect(bestMatch.rating).toBe(0);
    expect(bestMatchIndex).toBe(-1);
  });

  test('should handle empty main string', () => {
    const { ratings, bestMatch, bestMatchIndex } = extractBestCandidates('', ['test', 'best']);

    expect(ratings).toHaveLength(0);
    expect(bestMatch.target).toBe('');
    expect(bestMatch.rating).toBe(0);
    expect(bestMatchIndex).toBe(-1);
  });

  test('should handle case sensitivity', () => {
    const variants = ['Hello', 'hello', 'HELLO'];

    // Case-insensitive (default): some variant must be a perfect match.
    const relaxed = extractBestCandidates('hello', variants, false);
    expect(relaxed.bestMatch.rating).toBe(1);

    // Case-sensitive: only the lowercase variant is a perfect match.
    const strict = extractBestCandidates('hello', variants, true);
    expect(strict.bestMatch.target).toBe('hello');
    expect(strict.bestMatch.rating).toBe(1);
  });

  test('should work with real-world Next.js build output', () => {
    const { bestMatch } = extractBestCandidates('_next/static/chunks/pages/index-abc123def.js', [
      '_next/static/chunks/pages/index-xyz789abc.js',
      '_next/static/chunks/pages/about-abc123def.js',
      '_next/static/chunks/framework-123456789.js',
    ]);

    expect(bestMatch.target).toBe('_next/static/chunks/pages/index-xyz789abc.js');
    expect(bestMatch.rating).toBeGreaterThan(0.7);
  });

  test('should work with webpack module paths with loaders', () => {
    const { bestMatch } = extractBestCandidates('./src/components/Button.jsx', [
      './src/components/Button.jsx',
      './src/components/Button.css',
      './src/components/Link.jsx',
    ]);

    expect(bestMatch.target).toBe('./src/components/Button.jsx');
    expect(bestMatch.rating).toBe(1);
  });
});

describe('compareWithCosine', () => {
  test('should return 1 for identical strings', () => {
    const score = compareWithCosine('hello world', 'hello world');
    expect(score).toBe(1);
  });

  test('should handle word-based tokenization', () => {
    // Third argument `true` requests word tokens; swapping the two words should barely matter.
    const score = compareWithCosine('hello world', 'world hello', true);
    expect(score).toBeGreaterThan(0.9);
  });

  test('should handle character-based tokenization', () => {
    // Third argument `false` requests character tokens.
    const score = compareWithCosine('hello', 'hallo', false);
    expect(score).toBeGreaterThanOrEqual(0.5);
  });

  test('should return 0 for empty strings', () => {
    const emptyFirst = compareWithCosine('', 'hello');
    expect(emptyFirst).toBe(0);

    const emptySecond = compareWithCosine('hello', '');
    expect(emptySecond).toBe(0);
  });

  test('should work with long file paths', () => {
    const chartPath = './src/components/features/dashboard/analytics/ChartComponent.jsx';
    const tablePath = './src/components/features/dashboard/analytics/TableComponent.jsx';

    // The two paths differ only in the file name, so the score should stay high.
    const score = compareWithCosine(chartPath, tablePath, true);
    expect(score).toBeGreaterThan(0.6);
  });
});

describe('Performance characteristics', () => {
  test('should handle large candidate lists efficiently', () => {
    const needle = 'test-file-123.js';

    // 1000 dissimilar names followed by one close match at the very end.
    const haystack = Array.from({ length: 1000 }, (_, i) => `file-${i}.js`);
    haystack.push('test-file-456.js');

    const startedAt = performance.now();
    const result = extractBestCandidates(needle, haystack);
    const elapsedMs = performance.now() - startedAt;

    expect(result.bestMatch.target).toBe('test-file-456.js');
    // Wall-clock budget: the whole scan must finish in under one second.
    expect(elapsedMs).toBeLessThan(1000);
  });
});

describe('Edge cases', () => {
  test('should handle strings with special characters', () => {
    // NOTE(review): the scraped source showed the same placeholder literal for both
    // arguments, which only exercises the equality fast path. Two distinct names
    // containing '@' and '.' are used here instead — confirm against the intended inputs.
    const result = compareTwoStrings('file@1.0.0.js', 'file@1.0.1.js');
    expect(result).toBeGreaterThan(0.8);
  });

  test('should handle unicode characters', () => {
    // Accented vs. unaccented spelling should still be reasonably similar.
    const result = compareTwoStrings('café', 'cafe', false);
    expect(result).toBeGreaterThan(0.5);
  });

  test('should handle very long strings', () => {
    // 1000-character inputs that differ only in the final character.
    const longString1 = 'a'.repeat(1000);
    const longString2 = `${'a'.repeat(999)}b`;
    const result = compareTwoStrings(longString1, longString2);
    expect(result).toBeGreaterThan(0.9); // Should be very similar
  });
});
});
1 change: 1 addition & 0 deletions packages/utils/src/utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ export * from './insights';
export * from './file-types';
export * from './format';
export * from './metrics';
export * from './string-similarity';
118 changes: 118 additions & 0 deletions packages/utils/src/utils/string-similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import * as textSimilarity from 'text-similarity-node';

/**
 * A single candidate string paired with its similarity score.
 */
export interface BestMatch {
/** The candidate string that was scored. */
target: string;
/** Similarity score computed for `target` (the value returned by `compareTwoStrings`). */
rating: number;
}

/**
 * Result of scoring a list of candidates against one string, as returned by
 * `extractBestCandidates`.
 */
export interface BestMatchResult {
/** One entry per candidate, in the same order as the input list; empty when nothing was compared. */
ratings: BestMatch[];
/** The highest-rated entry; `{ target: '', rating: 0 }` when nothing was compared. */
bestMatch: BestMatch;
/** Index of `bestMatch` within `ratings`, or -1 when nothing was compared. */
bestMatchIndex: number;
}

/**
 * Compare two strings and return a similarity score between 0 and 1.
 *
 * Trivial cases are answered locally: an empty input on either side scores 0,
 * and inputs that are equal (honouring `caseSensitive`) score exactly 1.
 * Everything else is delegated to the Jaro-Winkler implementation of
 * `text-similarity-node`.
 *
 * @param str1 - First string to compare
 * @param str2 - Second string to compare
 * @param caseSensitive - Whether comparison should be case-sensitive (default: false)
 * @returns Similarity score between 0 (completely different) and 1 (identical)
 */
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
  if (!str1 || !str2) {
    return 0;
  }

  // Fast path. The strict comparison runs first so the common "same string" case
  // does not allocate lowercase copies; the lowercase comparison makes the fast
  // path respect `caseSensitive = false` instead of relying on the library to
  // return exactly 1 for inputs that differ only by case.
  const isSameString =
    str1 === str2 || (!caseSensitive && str1.toLowerCase() === str2.toLowerCase());
  if (isSameString) {
    return 1;
  }

  // The original strings are forwarded; case handling for the fuzzy comparison is
  // left to the library via the `caseSensitive` flag.
  return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
};
Comment on lines +23 to +34
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical bug: Case-insensitive comparison broken by early exact equality check.

The exact equality check on line 28 uses ===, which is always case-sensitive, so the fast path never fires for inputs that differ only by case. For example, compareTwoStrings('Hello', 'hello', false) should return 1 (as expected by the test at lines 23-24 of the test file), but the current implementation will:

  1. Pass through the === check (returns false)
  2. Call jaroWinkler('Hello', 'hello', false) which returns a value < 1

Apply this diff to fix the issue:

 export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
   if (!str1 || !str2) {
     return 0;
   }
 
-  if (str1 === str2) {
+  // Normalize for case-insensitive comparison
+  const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
+  const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
+
+  if (normalizedStr1 === normalizedStr2) {
     return 1;
   }
 
   // Use Jaro-Winkler algorithm for better performance with file paths and names
   return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
 };
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
if (!str1 || !str2) {
return 0;
}
if (str1 === str2) {
return 1;
}
// Use Jaro-Winkler algorithm for better performance with file paths and names
return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
};
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
if (!str1 || !str2) {
return 0;
}
// Normalize for case-insensitive comparison
const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
if (normalizedStr1 === normalizedStr2) {
return 1;
}
// Use Jaro-Winkler algorithm for better performance with file paths and names
return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
};
🤖 Prompt for AI Agents
In packages/utils/src/utils/string-similarity.ts around lines 23 to 34, the
early exact equality check uses === which is always case-sensitive and breaks
case-insensitive mode; change that check to respect the caseSensitive parameter
(i.e., if caseSensitive is true use ===, otherwise compare str1.toLowerCase()
=== str2.toLowerCase()) so case-insensitive equal strings return 1 and only fall
through to jaroWinkler when they truly differ.


/**
 * Score every candidate against `mainString` and report which one is closest.
 *
 * Each candidate is rated with `compareTwoStrings`. When several candidates share
 * the top rating, the one that appears first in `targetStrings` wins.
 *
 * @param mainString - The string to compare against
 * @param targetStrings - Array of candidate strings to compare
 * @param caseSensitive - Whether comparison should be case-sensitive (default: false)
 * @returns All ratings (in input order), the best match, and its index. When
 *   `mainString` is empty or there are no candidates, the result is
 *   `{ ratings: [], bestMatch: { target: '', rating: 0 }, bestMatchIndex: -1 }`.
 *
 * @example
 * ```typescript
 * const result = extractBestCandidates('hello', ['hallo', 'world', 'help']);
 * // result.ratings has one { target, rating } entry per candidate, in input order
 * // result.bestMatch is the entry with the highest rating
 * // result.bestMatchIndex is that entry's position in result.ratings
 * ```
 */
export const extractBestCandidates = (
  mainString: string,
  targetStrings: string[],
  caseSensitive = false,
): BestMatchResult => {
  const nothingToCompare = !mainString || !targetStrings || targetStrings.length === 0;
  if (nothingToCompare) {
    return { ratings: [], bestMatch: { target: '', rating: 0 }, bestMatchIndex: -1 };
  }

  // Rate each candidate, keeping the input order.
  const ratings = targetStrings.map(
    (target): BestMatch => ({
      target,
      rating: compareTwoStrings(mainString, target, caseSensitive),
    }),
  );

  // Arg-max over the ratings; the strict `>` keeps the earliest entry on ties.
  const bestMatchIndex = ratings.reduce(
    (leader, { rating }, position) => (rating > ratings[leader].rating ? position : leader),
    0,
  );

  return { ratings, bestMatch: ratings[bestMatchIndex], bestMatchIndex };
};

/**
 * Alternative comparison using cosine similarity.
 *
 * Trivial cases are answered locally: an empty input on either side scores 0,
 * and inputs that are equal (honouring `caseSensitive`) score exactly 1.
 * Everything else is delegated to the cosine implementation of
 * `text-similarity-node`.
 *
 * @param str1 - First string to compare
 * @param str2 - Second string to compare
 * @param useWords - Whether to tokenize by words (true) or characters (false)
 * @param caseSensitive - Whether comparison should be case-sensitive (default: false)
 * @returns Similarity score between 0 and 1
 */
export const compareWithCosine = (
  str1: string,
  str2: string,
  useWords = true,
  caseSensitive = false,
): number => {
  if (!str1 || !str2) {
    return 0;
  }

  // Fast path. The strict comparison runs first to avoid allocating lowercase
  // copies; the lowercase comparison makes the fast path respect
  // `caseSensitive = false` instead of relying on the library to return exactly 1
  // for inputs that differ only by case.
  const isSameString =
    str1 === str2 || (!caseSensitive && str1.toLowerCase() === str2.toLowerCase());
  if (isSameString) {
    return 1;
  }

  // The original strings are forwarded; tokenization and case handling are left to the library.
  return textSimilarity.similarity.cosine(str1, str2, useWords, caseSensitive);
};
Comment on lines +103 to +118
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical bug: Same case-insensitive comparison issue.

This function has the same bug as compareTwoStrings. The exact equality check on line 113 uses ===, which breaks case-insensitive comparison.

Apply this diff to fix the issue:

 export const compareWithCosine = (
   str1: string,
   str2: string,
   useWords = true,
   caseSensitive = false,
 ): number => {
   if (!str1 || !str2) {
     return 0;
   }
 
-  if (str1 === str2) {
+  // Normalize for case-insensitive comparison
+  const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
+  const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
+
+  if (normalizedStr1 === normalizedStr2) {
     return 1;
   }
 
   return textSimilarity.similarity.cosine(str1, str2, useWords, caseSensitive);
 };
🤖 Prompt for AI Agents
In packages/utils/src/utils/string-similarity.ts around lines 103 to 118, the
exact-equality check uses === which ignores the caseSensitive parameter and
breaks case-insensitive comparisons; change the equality check to compare
normalized strings: if caseSensitive is false, compare str1.toLowerCase() ===
str2.toLowerCase(), otherwise keep the existing strict equality, then proceed to
call textSimilarity.similarity.cosine with the original parameters.