You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

69 lines
3.2 KiB

/**
 * Heuristic language guess for read-aloud / Piper when there is no persisted translation `lang`.
 * Keep in sync with `services/piper-tts-proxy/server.ts` `detectLanguage` (same script / ratio logic),
 * plus `en-gb` hints and a few extra Latin scripts (pl, cs, tr) that have Piper voices in-app.
 *
 * Only the first 500 UTF-16 code units of `text` are inspected. For each candidate language the
 * share of language-typical characters in that sample is compared against a threshold, and the
 * first rule that fires wins, so rule order matters wherever character sets overlap.
 *
 * Known limitation: the French set shares `é` / `ü` with the Spanish set and `à` / `è` / `é` / `ù`
 * with the Italian set, and French is tested first, so accent-heavy Spanish or Italian text can
 * come back as `'fr'`.
 *
 * @param text - Content to classify; empty input yields `'en'`.
 * @returns One of `'ru'`, `'zh'`, `'ar'`, `'de'`, `'fr'`, `'pl'`, `'cs'`, `'es'`, `'it'`, `'tr'`,
 *   `'en-gb'` or `'en'` (the fallback).
 */
export function detectReadAloudContentLanguage(text: string): string {
  if (!text) return 'en'

  const sample = text.slice(0, 500)
  // `text` is non-empty here, so `sample.length` is at least 1 and safe to divide by.
  const total = sample.length
  const count = (pattern: RegExp): number => sample.match(pattern)?.length ?? 0
  const ratio = (pattern: RegExp): number => count(pattern) / total

  // --- Non-Latin scripts -------------------------------------------------------------------
  // Cyrillic: а-я, ё, А-Я, Ё.
  if (ratio(/[\u0430-\u044f\u0451\u0410-\u042f\u0401]/g) > 0.1) return 'ru'
  // Hangul / kana text is reported as 'en', and is tested before Han so that Japanese mixing
  // kana with kanji is not labelled 'zh'.
  // NOTE(review): presumably there is no Korean / Japanese voice to route to — confirm.
  if (ratio(/[\uac00-\ud7af]/g) > 0.06 || ratio(/[\u3040-\u309f\u30a0-\u30ff]/g) > 0.02) return 'en'
  if (ratio(/[\u4e00-\u9fff]/g) > 0.1) return 'zh'
  if (ratio(/[\u0600-\u06ff]/g) > 0.1) return 'ar'

  // --- Latin scripts, told apart by their diacritics ---------------------------------------
  if (ratio(/[äöüßÄÖÜ]/g) > 0.02) return 'de'
  if (ratio(/[éèêëàâäçôùûüÉÈÊËÀÂÄÇÔÙÛÜ]/g) > 0.02) return 'fr'

  // Polish is tested before Spanish so that Polish text rich in `ó` is not taken for Spanish.
  // `ó` is ambiguous on its own, though: it only counts toward Polish when a letter unique to
  // Polish is present, or when nothing marks the text as Spanish (`ñ á í ú ¿ ¡`). Without this,
  // Spanish such as "¿Cómo estás?" was returned as 'pl'.
  const polishOnlyChars = count(/[ąćęłńśźżĄĆĘŁŃŚŹŻ]/g)
  const spanishOnlyChars = count(/[ñáíúÑÁÍÚ¿¡]/g)
  const sharedOChars = count(/[óÓ]/g)
  const polishChars =
    polishOnlyChars + (polishOnlyChars > 0 || spanishOnlyChars === 0 ? sharedOChars : 0)
  if (polishChars / total > 0.02) return 'pl'

  if (ratio(/[řůŘŮ]/g) > 0.015) return 'cs'
  if (ratio(/[ñáéíóúüÑÁÉÍÓÚÜ¿¡]/g) > 0.02) return 'es'
  if (ratio(/[àèéìòùÀÈÉÌÒÙ]/g) > 0.02) return 'it'
  /** Exclude üöç (shared with German / French); rely on ğ/ı/ş/İ so “Grüße” is not Turkish. */
  if (ratio(/[ğĞıİşŞ]/g) > 0.02) return 'tr'

  // Nothing distinctive found: treat as English and pick the variant from spelling hints.
  return preferBritishEnglish(sample) ? 'en-gb' : 'en'
}
/**
 * Weak signal: UK spellings vs US spellings when the rest looks like Latin “English”.
 *
 * Counts whole-word, case-insensitive hits from a short UK word list and from the matching US
 * list. Base forms are also accepted with a trailing `s` ("colours", "centres", "realizes"), so
 * ordinary plurals and third-person verbs contribute to the tally instead of being ignored.
 *
 * @param sample - Text to inspect.
 * @returns `true` only when UK spellings strictly outnumber US spellings; a tie (including no
 *   hits at all) yields `false`.
 */
function preferBritishEnglish(sample: string): boolean {
  const uk =
    /\b(?:(?:colour|behaviour|realise|centre|defence|favour|favourite|neighbour|humour|labour)s?|realising|organised|travelling)\b/gi
  const us =
    /\b(?:(?:color|behavior|realize|center|defense|favor|favorite|neighbor|humor|labor)s?|realizing|organized|traveling)\b/gi
  // With the `g` flag `match` returns every hit, or `null` when there is none.
  const hits = (pattern: RegExp): number => sample.match(pattern)?.length ?? 0
  return hits(uk) > hits(us)
}