Function enhancedNormalize

enhancedNormalize(text: string): string

Normalizes text for similarity comparison using enhanced Unicode and punctuation handling.

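Strips ignorable patterns and diacritics, converts full-width characters and Japanese punctuation to their ASCII equivalents, expands abbreviations, and removes dashes, remaining symbols, and all whitespace before lowercasing, so that differently formatted text compares consistently. Empty input returns an empty string. Results are cached for performance.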

Parameters

text: string
The text to normalize.

Returns string

The normalized text string.

Source

/**
 * Normalizes text into a compact, lowercase key for similarity comparison.
 *
 * Steps, in order:
 * 1. Trim, then strip every entry of `IGNORABLE_PATTERNS`.
 * 2. NFD-decompose and fold full-width ASCII variants (U+FF01–U+FF5E) to
 *    their half-width forms.
 * 3. Drop combining diacritics (U+0300–U+036F).
 * 4. Unify typographic punctuation (curly quotes, dashes, ellipsis, CJK
 *    period and corner brackets) to ASCII.
 * 5. Expand the entries of `ABBREVIATION_MAP` (case-insensitive, whole word).
 * 6. Replace every remaining character other than `[A-Za-z0-9_]`, whitespace,
 *    `-` and `'` with a space, then remove dashes and all whitespace and
 *    lowercase the result.
 *
 * The result is looked up in and stored into `normalizeCache`, keyed by the
 * raw (untrimmed) input.
 *
 * @param text - The text to normalize.
 * @returns The normalized string; `""` when `text` is empty.
 */
export function enhancedNormalize(text: string): string {
  if (!text) return "";

  const cached = getCacheEntry(normalizeCache, text);
  if (cached !== undefined) {
    return cached;
  }

  let normalized = text.trim();

  // Remove common ignorable patterns
  for (const pattern of IGNORABLE_PATTERNS) {
    normalized = normalized.replaceAll(pattern, "");
  }

  // Decompose so that accents become separate combining marks
  normalized = normalized.normalize("NFD");

  // Convert full-width characters to half-width. This already covers the
  // full-width forms of ! ? : ; , ( ), so they need no individual handling.
  normalized = normalized.replaceAll(/[\uff01-\uff5e]/g, (char) =>
    String.fromCodePoint((char.codePointAt(0) ?? 0) - 0xfee0),
  );

  // Remove diacritics/accents (the combining marks produced by NFD)
  normalized = normalized.replaceAll(/[\u0300-\u036f]/g, "");

  // Normalize punctuation and special characters
  normalized = normalized
    // Typographic apostrophes must become the straight apostrophe, which is
    // the only apostrophe form the final cleanup preserves; otherwise
    // "don’t" and "don't" would normalize to different strings.
    .replaceAll(/[\u2018\u2019]/g, "'")
    .replaceAll(/[\u201c\u201d]/g, '"') // Typographic double quotes
    .replaceAll(/[\u2013\u2014]/g, "-") // En and em dashes
    .replaceAll("\u2026", "...") // Ellipsis
    .replaceAll("\u00d7", "x") // Multiplication sign
    .replaceAll("\u3002", ".") // Japanese period
    .replaceAll(/[\u300c-\u300f]/g, '"'); // Japanese corner brackets (single and double)

  // Handle common abbreviations.
  // NOTE(review): `abbrev` is interpolated into the pattern unescaped, so it
  // is treated as regex source — confirm the map keys contain no unintended
  // metacharacters.
  for (const [abbrev, expansion] of ABBREVIATION_MAP) {
    const regex = new RegExp(String.raw`\b${abbrev}\b`, "gi");
    normalized = normalized.replaceAll(regex, expansion);
  }

  // Normalize whitespace and special characters.
  // NOTE(review): without the `u`/`v` flag, `\w` matches only [A-Za-z0-9_],
  // so every non-ASCII letter (e.g. kana, kanji) is replaced here and then
  // removed with the whitespace — confirm this is intended.
  normalized = normalized
    .replaceAll(/[^\w\s\-']/g, " ") // Replace most special chars with space
    .replaceAll("-", "") // Remove dashes to match manga-search-service normalization
    .replaceAll(/\s+/g, "") // Remove all whitespace for more consistent matching
    .toLowerCase();

  setCacheEntry(normalizeCache, text, normalized, NORMALIZE_CACHE_LIMIT);

  return normalized;
}

Function enhancedNormalize

Parameters

Returns string

Source

Settings