First string to compare.
Second string to compare.
Optional custom similarity configuration (weights, thresholds, debug).
Similarity score between 0 and 100.
/**
 * Scores how similar two strings are on a 0–100 integer scale by blending
 * several similarity metrics into a weighted average.
 *
 * Flow:
 *  1. Short-circuit on empty input (0) and on raw or normalized equality (100).
 *  2. Unless `debug` is enabled, look the pair up in the similarity cache.
 *  3. If the normalized lengths differ by more than the configured ratio,
 *     return a basic comparison scaled down by that length ratio.
 *  4. Otherwise compute every metric and combine them using the configured weights.
 *
 * @param str1 - First string to compare.
 * @param str2 - Second string to compare.
 * @param config - Optional overrides merged on top of `DEFAULT_SIMILARITY_CONFIG`
 *   (weights, length threshold, debug flag).
 * @returns An integer score clamped to the range 0–100. If the configured
 *   weights sum to zero (or to a non-finite value) the weighted path yields 0
 *   rather than NaN.
 */
export function calculateEnhancedSimilarity(
  str1: string,
  str2: string,
  config: Partial<SimilarityConfig> = {},
): number {
  const finalConfig = { ...DEFAULT_SIMILARITY_CONFIG, ...config };

  // Trivial cases that need no normalization.
  if (!str1 || !str2) return 0;
  if (str1 === str2) return 100;

  const norm1 = enhancedNormalize(str1);
  const norm2 = enhancedNormalize(str2);

  // Equality is checked before emptiness, so two inputs that both normalize
  // to the empty string count as identical.
  if (norm1 === norm2) return 100;
  if (norm1.length === 0 || norm2.length === 0) return 0;

  // Debug runs bypass the cache entirely so that the diagnostics below are
  // always produced.
  // NOTE(review): the key is derived from the normalized strings, while five
  // of the metrics below receive the raw strings — confirm those helpers
  // normalize internally, otherwise distinct raw inputs can share a cached score.
  let cacheKey: string | null = null;
  if (!finalConfig.debug) {
    const pairKey = makeOrderedPairKey(norm1, norm2);
    const configKey = createConfigKey(finalConfig);
    cacheKey = `${pairKey}::${configKey}`;

    const cached = getCacheEntry(enhancedSimilarityCache, cacheKey);
    if (cached !== undefined) {
      return cached;
    }
  }

  // Extreme length differences: keep a basic similarity but scale it by the
  // length ratio instead of discarding the match outright.
  const shorter = Math.min(norm1.length, norm2.length);
  const longer = Math.max(norm1.length, norm2.length);
  const lengthRatio = shorter / longer;

  if (lengthRatio < finalConfig.lengthDifferenceThreshold) {
    const basicSimilarity = stringSimilarity.compareTwoStrings(norm1, norm2);
    const penalizedScore = basicSimilarity * lengthRatio;
    const roundedPenaltyScore = Math.round(penalizedScore * 100);

    if (finalConfig.debug) {
      console.debug(
        `[Similarity] Length penalty applied: ${str1} vs ${str2}, ratio: ${lengthRatio.toFixed(2)}, score: ${(penalizedScore * 100).toFixed(1)}`,
      );
    }

    const boundedPenaltyScore = Math.min(100, Math.max(0, roundedPenaltyScore));
    if (cacheKey) {
      setCacheEntry(
        enhancedSimilarityCache,
        cacheKey,
        boundedPenaltyScore,
        PAIR_SIMILARITY_CACHE_LIMIT,
      );
    }
    return boundedPenaltyScore;
  }

  // Individual metrics. The first five are given the raw inputs; Jaro-Winkler
  // and n-gram operate on the normalized forms.
  const exactMatch = calculateExactMatch(str1, str2);
  const substringMatch = calculateSubstringMatch(str1, str2);
  const wordOrderSim = calculateWordOrderSimilarity(str1, str2);
  const characterSim = calculateCharacterSimilarity(str1, str2);
  const semanticSim = calculateSemanticSimilarity(str1, str2);
  const jaroWinklerSim = calculateJaroWinklerSimilarity(norm1, norm2);
  const ngramSim = calculateNgramSimilarity(norm1, norm2);

  const totalWeight =
    finalConfig.exactMatchWeight +
    finalConfig.substringMatchWeight +
    finalConfig.wordOrderWeight +
    finalConfig.characterSimilarityWeight +
    finalConfig.semanticWeight +
    finalConfig.jaroWinklerWeight +
    finalConfig.ngramWeight;

  const weightedSum =
    exactMatch * finalConfig.exactMatchWeight +
    substringMatch * finalConfig.substringMatchWeight +
    wordOrderSim * finalConfig.wordOrderWeight +
    characterSim * finalConfig.characterSimilarityWeight +
    semanticSim * finalConfig.semanticWeight +
    jaroWinklerSim * finalConfig.jaroWinklerWeight +
    ngramSim * finalConfig.ngramWeight;

  // A custom config can zero out every weight (or pass an explicit `undefined`
  // that overrides a default), which would make the division produce NaN and
  // poison both the return value and the cache. Treat that as "no signal".
  const weightedScore =
    Number.isFinite(totalWeight) && totalWeight !== 0
      ? weightedSum / totalWeight
      : 0;

  const roundedScore = Math.round(weightedScore * 100);
  // Math.min/Math.max propagate NaN, so coerce it to 0 before clamping.
  const boundedScore = Number.isNaN(roundedScore)
    ? 0
    : Math.min(100, Math.max(0, roundedScore));

  if (finalConfig.debug) {
    console.debug(
      `[Similarity] Similarity calculation for "${str1}" vs "${str2}":`,
    );
    console.debug(`[Similarity] Exact: ${(exactMatch * 100).toFixed(1)}%`);
    console.debug(
      `[Similarity] Substring: ${(substringMatch * 100).toFixed(1)}%`,
    );
    console.debug(
      `[Similarity] Word Order: ${(wordOrderSim * 100).toFixed(1)}%`,
    );
    console.debug(
      `[Similarity] Character: ${(characterSim * 100).toFixed(1)}%`,
    );
    console.debug(
      `[Similarity] Semantic: ${(semanticSim * 100).toFixed(1)}%`,
    );
    console.debug(
      `[Similarity] Jaro-Winkler: ${(jaroWinklerSim * 100).toFixed(1)}%`,
    );
    console.debug(`[Similarity] N-gram: ${(ngramSim * 100).toFixed(1)}%`);
    console.debug(`[Similarity] Final: ${boundedScore}%`);
  }

  if (cacheKey) {
    setCacheEntry(
      enhancedSimilarityCache,
      cacheKey,
      boundedScore,
      PAIR_SIMILARITY_CACHE_LIMIT,
    );
  }
  return boundedScore;
}
Calculates enhanced title similarity as a weighted average of several metrics: exact match, substring match, word order, character similarity, semantic similarity, Jaro-Winkler, and n-gram similarity. Results are cached for performance (caching is skipped when debug is enabled), and the weights and thresholds can be overridden through a custom configuration.