jumble/src/lib/read-aloud-content-language.ts

/**
 * Heuristic language guess for read-aloud / Piper when there is no persisted translation `lang`.
 * Keep in sync with `services/piper-tts-proxy/server.ts` `detectLanguage` (same script / ratio logic),
 * plus `en-gb` hints and a few extra Latin scripts (pl, cs, tr) that have Piper voices in-app.
 *
 * Only the first 500 UTF-16 code units of `text` are inspected. Each ratio is the number of
 * matching characters divided by the sample length, so whitespace and punctuation count in the
 * denominator. Checks run in a fixed order and the first one over its threshold wins; the accent
 * classes for de / fr / es / it overlap, so that order is what breaks ties between them.
 *
 * @param text Content to classify; an empty string yields `'en'`.
 * @returns One of `'ru'`, `'zh'`, `'ar'`, `'tr'`, `'de'`, `'fr'`, `'pl'`, `'cs'`, `'es'`, `'it'`,
 *   `'en-gb'` or `'en'` (the fallback, also returned for Hangul- or kana-heavy text).
 */
export function detectReadAloudContentLanguage(text: string): string {
  if (!text) return 'en'

  /** `slice` clamps to the string length, so short inputs are used whole. */
  const sample = text.slice(0, 500)
  const total = sample.length
  /** Share of `sample` matched by `pattern`; `pattern` must carry the `g` flag. */
  const ratioOf = (pattern: RegExp): number => (sample.match(pattern) || []).length / total

  /** Non-Latin scripts first: they are unambiguous and use coarser thresholds. */
  if (ratioOf(/[а-яёА-ЯЁ]/g) > 0.1) return 'ru'
  /**
   * Hangul / kana map to 'en', and because this runs before the Han check, kana-bearing text
   * never reaches 'zh'. NOTE(review): presumably there is no ko / ja voice to select — confirm.
   */
  if (ratioOf(/[\uac00-\ud7af]/g) > 0.06 || ratioOf(/[\u3040-\u309f\u30a0-\u30ff]/g) > 0.02) {
    return 'en'
  }
  if (ratioOf(/[\u4e00-\u9fff]/g) > 0.1) return 'zh'
  if (ratioOf(/[\u0600-\u06ff]/g) > 0.1) return 'ar'

  /**
   * Exclude üöç (shared with German / French); rely on ğ/ı/ş/İ so “Grüße” is not Turkish.
   * Checked before German / French: Turkish text also contains ü / ö / ç, which would otherwise
   * trip those checks first and never reach this one.
   */
  if (ratioOf(/[ğĞıİşŞ]/g) > 0.02) return 'tr'

  if (ratioOf(/[äöüßÄÖÜ]/g) > 0.02) return 'de'
  if (ratioOf(/[éèêëàâäçôùûüÉÈÊËÀÂÄÇÔÙÛÜ]/g) > 0.02) return 'fr'

  /**
   * Before Spanish: shared letters like `ó` (Polish) would otherwise count as Spanish.
   * `ó` only counts once at least one unambiguous Polish letter is present, so Spanish text
   * whose only accents are `ó` is not reported as Polish.
   */
  if (/[ąćęłńśźżĄĆĘŁŃŚŹŻ]/.test(sample) && ratioOf(/[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]/g) > 0.02) {
    return 'pl'
  }
  /** Only ř / ů are counted for Czech, hence the slightly lower threshold. */
  if (ratioOf(/[řůŘŮ]/g) > 0.015) return 'cs'
  if (ratioOf(/[ñáéíóúüÑÁÉÍÓÚÜ¿¡]/g) > 0.02) return 'es'
  if (ratioOf(/[àèéìòùÀÈÉÌÒÙ]/g) > 0.02) return 'it'

  /** Nothing distinctive found: treat as English and pick the variant from spelling hints. */
  return preferBritishEnglish(sample) ? 'en-gb' : 'en'
}

/**
 * Weak signal for choosing between `en-gb` and `en`: compares how many whole-word,
 * case-insensitive UK spellings occur in `sample` against their US counterparts.
 *
 * @param sample Text to scan.
 * @returns `true` only when UK spellings strictly outnumber US spellings; ties and
 *   "no hits at all" return `false`.
 */
function preferBritishEnglish(sample: string): boolean {
  /** Number of matches of a global pattern in `sample` (0 when there are none). */
  const countHits = (pattern: RegExp): number => (sample.match(pattern) || []).length

  const britishHits = countHits(
    /\b(colour|behaviour|realise|realising|centre|defence|favour|favourite|organised|travelling|neighbour|humour|labour)\b/gi
  )
  const americanHits = countHits(
    /\b(color|behavior|realize|realizing|center|defense|favor|favorite|organized|traveling|neighbor|humor|labor)\b/gi
  )

  return britishHits > americanHits
}