You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
661 lines
20 KiB
661 lines
20 KiB
/** |
|
* TTS Service |
|
* Manages text-to-speech with multiple provider support |
|
*/ |
|
|
|
import type { TTSProvider, TTSProviderInterface, TTSOptions, TTSState, TTSVoice, TTSEventCallbacks } from './types.js'; |
|
|
|
/**
 * Base class for audio-based TTS providers.
 *
 * Owns at most one `HTMLAudioElement` at a time and translates its media
 * events into provider state changes and callback invocations. Subclasses
 * produce an audio URL, pass it to `setupAudioElement`, and start playback on
 * `audioElement` themselves.
 */
abstract class AudioProvider implements TTSProviderInterface {
  protected state: TTSState = 'idle';
  protected callbacks: TTSEventCallbacks = {};
  protected audioElement: HTMLAudioElement | null = null;

  /** URL backing the current element; revoked whenever the element is released. */
  private currentAudioUrl: string | null = null;

  abstract readonly name: string;
  abstract readonly type: TTSProvider;

  /**
   * Stores the new state and notifies `onStateChange`, but only when the state
   * actually changes.
   */
  protected updateState(newState: TTSState): void {
    if (this.state !== newState) {
      this.state = newState;
      this.callbacks.onStateChange?.(newState);
    }
  }

  /**
   * Creates the audio element for `audioUrl` and wires its events to the
   * provider state. Any element that is still installed is released first.
   *
   * @param audioUrl URL to play; it is passed to `URL.revokeObjectURL` when the
   *   element is released.
   * @param volume Value assigned to the element's `volume` property.
   */
  protected setupAudioElement(audioUrl: string, volume: number = 1.0): void {
    this.releaseAudio();

    const audio = new Audio(audioUrl);
    this.audioElement = audio;
    this.currentAudioUrl = audioUrl;
    audio.volume = volume;

    // Events of an element that has been replaced or stopped must never touch
    // the provider again, even if they were queued before it was released.
    const whenCurrent = (handler: () => void) => (): void => {
      if (this.audioElement === audio) handler();
    };

    audio.onplay = whenCurrent(() => this.updateState('playing'));
    audio.onpause = whenCurrent(() => this.updateState('paused'));
    audio.onended = whenCurrent(() => {
      this.updateState('idle');
      this.releaseAudio();
      this.callbacks.onEnd?.();
    });
    audio.onerror = whenCurrent(() => {
      this.updateState('error');
      // Release the element and its URL: a failed element cannot be resumed.
      this.releaseAudio();
      this.callbacks.onError?.(new Error('Audio playback failed'));
    });
    audio.ontimeupdate = whenCurrent(() => {
      if (audio.duration) {
        this.callbacks.onProgress?.(audio.currentTime / audio.duration);
      }
    });
  }

  /** Pauses playback; does nothing unless the provider is currently playing. */
  async pause(): Promise<void> {
    if (this.audioElement && this.state === 'playing') {
      this.audioElement.pause();
    }
  }

  /** Continues playback; does nothing unless the provider is currently paused. */
  async resume(): Promise<void> {
    if (this.audioElement && this.state === 'paused') {
      await this.audioElement.play();
    }
  }

  /**
   * Stops playback, discards the audio element and returns to `idle`.
   *
   * The element's handlers are detached before it is paused; otherwise the
   * `pause` event that follows would move the state to `paused` after it was
   * already set to `idle`.
   */
  async stop(): Promise<void> {
    this.releaseAudio();
    this.updateState('idle');
  }

  /** Returns the current provider state. */
  getState(): TTSState {
    return this.state;
  }

  /** Returns playback progress as `currentTime / duration`, or 0 when unknown. */
  async getProgress(): Promise<number> {
    if (this.audioElement?.duration) {
      return this.audioElement.currentTime / this.audioElement.duration;
    }
    return 0;
  }

  /**
   * Seeks to a fraction of the total duration.
   *
   * @param position Target position; clamped to the range 0..1. Non-finite
   *   values are ignored, as are calls made while the duration is unknown.
   */
  async setProgress(position: number): Promise<void> {
    const audio = this.audioElement;
    if (!audio || !Number.isFinite(position)) return;

    const duration = audio.duration;
    if (!duration || !Number.isFinite(duration)) return;

    audio.currentTime = Math.min(1, Math.max(0, position)) * duration;
  }

  /** Merges `callbacks` into the already registered callbacks. */
  setCallbacks(callbacks: TTSEventCallbacks): void {
    this.callbacks = { ...this.callbacks, ...callbacks };
  }

  /** Detaches, pauses and forgets the current element and revokes its URL. */
  private releaseAudio(): void {
    const audio = this.audioElement;
    if (audio) {
      audio.onplay = null;
      audio.onpause = null;
      audio.onended = null;
      audio.onerror = null;
      audio.ontimeupdate = null;
      audio.pause();
    }
    this.audioElement = null;

    if (this.currentAudioUrl !== null) {
      URL.revokeObjectURL(this.currentAudioUrl);
      this.currentAudioUrl = null;
    }
  }

  abstract isAvailable(): Promise<boolean>;
  abstract getVoices(): Promise<TTSVoice[]>;
  abstract speak(text: string, options?: TTSOptions): Promise<void>;
  abstract destroy(): void;
}
|
|
|
/**
 * Web Speech API TTS provider.
 *
 * Speaks through `window.speechSynthesis`. Only one utterance is tracked at a
 * time; starting a new one cancels whatever is currently queued or speaking.
 */
class WebSpeechProvider implements TTSProviderInterface {
  readonly name = 'Web Speech API';
  readonly type: TTSProvider = 'webspeech';

  private synth: SpeechSynthesis;
  private utterance: SpeechSynthesisUtterance | null = null;
  private state: TTSState = 'idle';
  private callbacks: TTSEventCallbacks = {};

  /**
   * @throws Error when `window` or `window.speechSynthesis` does not exist.
   */
  constructor() {
    if (typeof window === 'undefined' || !('speechSynthesis' in window)) {
      throw new Error('Web Speech API not available');
    }
    this.synth = window.speechSynthesis;
    this.synth.onvoiceschanged = () => this.loadVoices();
    this.loadVoices();
  }

  /**
   * Hook for the `voiceschanged` event. Nothing is cached here: voices are
   * read from `synth.getVoices()` each time they are needed.
   */
  private loadVoices(): void {
    // Voices loaded asynchronously
  }

  /** Reports whether `window.speechSynthesis` exists. */
  async isAvailable(): Promise<boolean> {
    return typeof window !== 'undefined' && 'speechSynthesis' in window;
  }

  /**
   * Lists the voices the browser currently exposes.
   *
   * Gender is guessed from the voice name: "female" is tested before "male"
   * because the former contains the latter.
   */
  async getVoices(): Promise<TTSVoice[]> {
    return this.synth.getVoices().map((voice): TTSVoice => {
      const lowerName = voice.name.toLowerCase();
      const gender = lowerName.includes('female')
        ? 'female'
        : lowerName.includes('male')
          ? 'male'
          : 'neutral';

      return {
        id: voice.voiceURI,
        name: voice.name,
        lang: voice.lang,
        gender,
        provider: 'webspeech'
      };
    });
  }

  /**
   * Picks a default voice for `lang`: a Google voice if present, then a voice
   * whose name contains "neural", then the first voice of that language. Falls
   * back to the first voice overall when the language has none.
   */
  private getBestVoice(lang: string = 'en'): SpeechSynthesisVoice | null {
    const voices = this.synth.getVoices();
    const langPrefix = lang.split('-')[0];
    const langVoices = voices.filter(v => v.lang.startsWith(langPrefix));

    if (langVoices.length === 0) return voices[0] || null;

    const mentions = (value: string, needle: string): boolean =>
      value.toLowerCase().includes(needle);

    return langVoices.find(v => mentions(v.name, 'google') || mentions(v.voiceURI, 'google'))
      || langVoices.find(v => mentions(v.name, 'neural'))
      || langVoices[0];
  }

  /**
   * Cancels any current speech and speaks `text`.
   *
   * Whitespace-only text just stops the current speech. A voice from
   * `options` is used only when it belongs to this provider; otherwise the
   * best default voice is chosen.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    void this.stop();
    if (!text.trim()) return;

    const utterance = new SpeechSynthesisUtterance(text);
    this.utterance = utterance;

    const requested = options?.voice;
    if (requested?.provider === 'webspeech') {
      const match = this.synth.getVoices().find(v => v.voiceURI === requested.id);
      if (match) utterance.voice = match;
    } else {
      const bestVoice = this.getBestVoice();
      if (bestVoice) utterance.voice = bestVoice;
    }

    utterance.rate = options?.speed ?? 1.0;
    utterance.pitch = options?.pitch ?? 1.0;
    utterance.volume = options?.volume ?? 1.0;
    if (utterance.voice) {
      utterance.lang = utterance.voice.lang;
    }

    // Cancelling an utterance still delivers events for it later. Those late
    // events must not alter the state of, or fire callbacks for, the
    // utterance that replaced it.
    const isCurrent = (): boolean => this.utterance === utterance;

    utterance.onstart = () => {
      if (isCurrent()) this.updateState('playing');
    };
    utterance.onend = () => {
      if (!isCurrent()) return;
      this.updateState('idle');
      this.utterance = null;
      this.callbacks.onEnd?.();
    };
    utterance.onerror = (event) => {
      if (!isCurrent()) return;
      this.updateState('error');
      this.callbacks.onError?.(new Error(`Speech synthesis error: ${event.error}`));
    };

    this.synth.speak(utterance);
    this.updateState('playing');
  }

  /** Pauses speech; does nothing unless currently playing. */
  async pause(): Promise<void> {
    if (this.state === 'playing' && this.synth.speaking) {
      this.synth.pause();
      this.updateState('paused');
    }
  }

  /** Resumes speech; does nothing unless currently paused. */
  async resume(): Promise<void> {
    if (this.state === 'paused' && this.synth.paused) {
      this.synth.resume();
      this.updateState('playing');
    }
  }

  /**
   * Cancels speech and returns to `idle`. The tracked utterance's handlers are
   * detached first so the events produced by the cancellation are ignored.
   */
  async stop(): Promise<void> {
    const current = this.utterance;
    if (current) {
      current.onstart = null;
      current.onend = null;
      current.onerror = null;
    }
    this.utterance = null;

    if (this.synth.speaking || this.synth.paused) {
      this.synth.cancel();
    }
    this.updateState('idle');
  }

  /** Returns the current provider state. */
  getState(): TTSState {
    return this.state;
  }

  /** Always 0: this provider does not track progress. */
  async getProgress(): Promise<number> {
    return 0; // Web Speech API doesn't provide progress
  }

  /** No-op: seeking is not supported by this provider. */
  async setProgress(position: number): Promise<void> {
    // Not supported - would need to stop and restart
  }

  /** Stops speech and drops all registered callbacks. */
  destroy(): void {
    void this.stop();
    this.callbacks = {};
  }

  /** Merges `callbacks` into the already registered callbacks. */
  setCallbacks(callbacks: TTSEventCallbacks): void {
    this.callbacks = { ...this.callbacks, ...callbacks };
  }

  /** Stores the new state and notifies `onStateChange` when it changed. */
  private updateState(newState: TTSState): void {
    if (this.state !== newState) {
      this.state = newState;
      this.callbacks.onStateChange?.(newState);
    }
  }
}
|
|
|
/**
 * OpenAI TTS provider.
 *
 * Requests synthesized audio from the OpenAI speech endpoint and plays the
 * returned blob through the audio element managed by the base class.
 */
class OpenAIProvider extends AudioProvider {
  readonly name = 'OpenAI TTS';
  readonly type: TTSProvider = 'openai';

  /** Voice used when the caller supplies none, or one of another provider. */
  private static readonly DEFAULT_VOICE: TTSVoice =
    { id: 'alloy', name: 'Alloy', lang: 'en', provider: 'openai' };

  private static readonly VOICES: readonly TTSVoice[] = [
    OpenAIProvider.DEFAULT_VOICE,
    { id: 'echo', name: 'Echo', lang: 'en', provider: 'openai' },
    { id: 'fable', name: 'Fable', lang: 'en', provider: 'openai' },
    { id: 'onyx', name: 'Onyx', lang: 'en', provider: 'openai' },
    { id: 'nova', name: 'Nova', lang: 'en', provider: 'openai' },
    { id: 'shimmer', name: 'Shimmer', lang: 'en', provider: 'openai' }
  ];

  private apiKey: string | null = null;

  /** Cancels the in-flight synthesis request, if any. */
  private abortController: AbortController | null = null;

  constructor(apiKey?: string) {
    super();
    this.apiKey = apiKey || null;
  }

  /** Replaces the API key used for subsequent requests. */
  setApiKey(apiKey: string): void {
    this.apiKey = apiKey;
  }

  /** Available when an API key is set and `window` exists. */
  async isAvailable(): Promise<boolean> {
    return this.apiKey !== null && typeof window !== 'undefined';
  }

  /** Returns a fresh copy of the built-in voice list. */
  async getVoices(): Promise<TTSVoice[]> {
    return [...OpenAIProvider.VOICES];
  }

  /**
   * Synthesizes `text` and plays it.
   *
   * Any earlier request or playback is cancelled first. Whitespace-only text
   * just stops the current playback. A request cancelled by `stop()` or by a
   * newer `speak()` resolves silently without playing anything.
   *
   * @throws Error when no API key is set, when the API responds with an error
   *   (quota errors carry `isQuotaError = true`), or when playback fails.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    if (!this.apiKey) {
      throw new Error('OpenAI API key not set');
    }

    void this.stop();
    if (!text.trim()) return;

    // Register the controller before the first await so that a later stop()
    // or speak() can always cancel this request.
    const controller = new AbortController();
    this.abortController = controller;

    // A voice that belongs to another provider has an id this API does not
    // know, so fall back to the default voice instead of sending it.
    const requested = options?.voice;
    const voice = requested?.provider === 'openai' ? requested : OpenAIProvider.DEFAULT_VOICE;
    const speed = Math.max(0.25, Math.min(4.0, options?.speed ?? 1.0));
    let audioUrl: string | null = null;

    try {
      this.updateState('synthesizing');

      const response = await fetch('https://api.openai.com/v1/audio/speech', {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${this.apiKey}`,
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({
          model: 'tts-1',
          input: text,
          voice: voice.id,
          speed
        }),
        signal: controller.signal
      });

      if (!response.ok) {
        throw await OpenAIProvider.toRequestError(response);
      }

      const audioBlob = await response.blob();
      if (controller.signal.aborted) return;

      audioUrl = URL.createObjectURL(audioBlob);
      this.setupAudioElement(audioUrl, options?.volume ?? 1.0);
      await this.audioElement!.play();
      if (!controller.signal.aborted) {
        this.updateState('playing');
      }
    } catch (error) {
      // Cancellation (aborted fetch, or play() interrupted by a pause) is not
      // a failure; whoever cancelled has already settled the state.
      if (controller.signal.aborted || (error instanceof Error && error.name === 'AbortError')) {
        return;
      }

      // Playback never started, so nothing else will release the blob URL.
      if (audioUrl !== null) {
        URL.revokeObjectURL(audioUrl);
      }

      this.updateState('error');
      const err = error instanceof Error ? error : new Error('Failed to speak text');
      this.callbacks.onError?.(err);
      throw err;
    } finally {
      if (this.abortController === controller) {
        this.abortController = null;
      }
    }
  }

  /** Cancels any in-flight request, then stops playback. */
  async stop(): Promise<void> {
    this.abortController?.abort();
    this.abortController = null;
    await super.stop();
  }

  /** Stops everything and forgets the callbacks and the API key. */
  destroy(): void {
    void this.stop();
    this.callbacks = {};
    this.apiKey = null;
  }

  /**
   * Builds the error for a non-OK response. HTTP 429 and messages mentioning
   * "quota" produce an error flagged with `isQuotaError = true`.
   */
  private static async toRequestError(response: Response): Promise<Error> {
    const errorMessage = await OpenAIProvider.readErrorMessage(response);

    if (response.status === 429 || errorMessage.toLowerCase().includes('quota')) {
      return Object.assign(
        new Error(`OpenAI TTS quota exceeded: ${errorMessage}`),
        { isQuotaError: true }
      );
    }
    return new Error(`OpenAI TTS error: ${errorMessage}`);
  }

  /**
   * Extracts `error.message` from a JSON error body. Falls back to
   * 'Unknown error' when the body is not JSON and to the status text when the
   * message is missing, empty or not a string.
   */
  private static async readErrorMessage(response: Response): Promise<string> {
    let payload: unknown;
    try {
      payload = await response.json();
    } catch {
      return 'Unknown error';
    }

    const message = (payload as { error?: { message?: unknown } } | null)?.error?.message;
    return typeof message === 'string' && message ? message : response.statusText;
  }
}
|
|
|
/**
 * Piper TTS provider.
 *
 * Posts the text to the `/api/piper-tts` route and plays the returned audio
 * through the audio element managed by the base class.
 *
 * NOTE(review): `serverUrl` is only checked for presence in this class; the
 * request itself always goes to `/api/piper-tts`. Presumably that route
 * forwards to the configured server - confirm.
 */
class PiperProvider extends AudioProvider {
  readonly name = 'Piper TTS';
  readonly type: TTSProvider = 'piper';

  /** Upper bound for receiving the synthesized audio: 5 minutes. */
  private static readonly REQUEST_TIMEOUT_MS = 5 * 60 * 1000;

  private serverUrl: string | null = null;
  private abortController: AbortController | null = null;
  private timeoutId: ReturnType<typeof setTimeout> | null = null;

  private readonly defaultVoices: TTSVoice[] = [
    // English (US) - all quality levels
    { id: 'en_US-lessac-low', name: 'English (US) - Lessac Low', lang: 'en-US', provider: 'piper' },
    { id: 'en_US-lessac-medium', name: 'English (US) - Lessac Medium', lang: 'en-US', provider: 'piper' },
    { id: 'en_US-lessac-high', name: 'English (US) - Lessac High', lang: 'en-US', provider: 'piper' },
    // English (GB)
    { id: 'en_GB-alba-medium', name: 'English (GB) - Alba Medium', lang: 'en-GB', provider: 'piper' },
    // German
    { id: 'de_DE-thorsten-low', name: 'German - Thorsten Low', lang: 'de-DE', provider: 'piper' },
    { id: 'de_DE-thorsten-medium', name: 'German - Thorsten Medium', lang: 'de-DE', provider: 'piper' },
    // French
    { id: 'fr_FR-siwis-low', name: 'French - Siwis Low', lang: 'fr-FR', provider: 'piper' },
    { id: 'fr_FR-siwis-medium', name: 'French - Siwis Medium', lang: 'fr-FR', provider: 'piper' },
    // Spanish
    { id: 'es_ES-davefx-medium', name: 'Spanish - Davefx Medium', lang: 'es-ES', provider: 'piper' },
    // Italian - riccardo voices not available
    // Russian
    { id: 'ru_RU-ruslan-medium', name: 'Russian - Ruslan Medium', lang: 'ru-RU', provider: 'piper' },
    // Chinese
    { id: 'zh_CN-huayan-medium', name: 'Chinese - Huayan Medium', lang: 'zh-CN', provider: 'piper' },
    // Arabic - hafez voice not available
    // Polish
    { id: 'pl_PL-darkman-medium', name: 'Polish - Darkman Medium', lang: 'pl-PL', provider: 'piper' },
    // Portuguese - edresson voice not available
    // Dutch
    { id: 'nl_NL-mls-medium', name: 'Dutch - MLS Medium', lang: 'nl-NL', provider: 'piper' },
    // Czech
    { id: 'cs_CZ-jirka-medium', name: 'Czech - Jirka Medium', lang: 'cs-CZ', provider: 'piper' },
    // Turkish
    { id: 'tr_TR-dfki-medium', name: 'Turkish - DFKI Medium', lang: 'tr-TR', provider: 'piper' },
    // Japanese - nanami voice not available
    // Korean - kyungha voice not available
  ];

  constructor(serverUrl?: string) {
    super();
    this.serverUrl = serverUrl || null;
  }

  /**
   * Available in a browser when a server URL is configured or `Worker` exists.
   * This does not contact the server.
   */
  async isAvailable(): Promise<boolean> {
    if (typeof window === 'undefined') return false;
    return this.serverUrl !== null || typeof Worker !== 'undefined';
  }

  /** Returns the built-in voice list. */
  async getVoices(): Promise<TTSVoice[]> {
    return this.defaultVoices;
  }

  /** No-op kept for API symmetry. */
  async initialize(): Promise<void> {
    // Server-based synthesis doesn't need initialization
  }

  /**
   * Synthesizes `text` on the server and plays the result.
   *
   * Whitespace-only text is ignored. A request cancelled by `stop()` or by a
   * newer `speak()` resolves silently without playing anything.
   *
   * @throws Error when no server URL is configured, when the server responds
   *   with an error or an empty body, when the request times out, or when
   *   playback fails.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    if (!text.trim()) return;

    void this.stop();

    // Validate before entering 'synthesizing' so a configuration error cannot
    // leave the provider stuck in that state.
    if (!this.serverUrl) {
      throw new Error('Piper TTS server URL not configured');
    }

    // A voice that belongs to another provider has an id the Piper server
    // does not know, so fall back to the default voice instead of sending it.
    const requested = options?.voice;
    const voice = requested?.provider === 'piper' ? requested : this.defaultVoices[0];
    const speed = Math.max(0.25, Math.min(2.0, options?.speed ?? 1.0));

    console.log('PiperProvider: Using voice:', voice.id, 'from options:', options?.voice?.id || 'default');

    // The controller and timer are kept in locals as well as on the instance:
    // a request that has been superseded must only clean up after itself and
    // never touch the bookkeeping of the request that replaced it.
    const controller = new AbortController();
    let timedOut = false;
    const timeoutId = setTimeout(() => {
      console.log('Piper TTS: Request timeout');
      timedOut = true;
      controller.abort();
    }, PiperProvider.REQUEST_TIMEOUT_MS);
    this.abortController = controller;
    this.timeoutId = timeoutId;

    let audioUrl: string | null = null;
    this.updateState('synthesizing');

    try {
      const response = await fetch('/api/piper-tts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, voice: voice.id, speed }),
        signal: controller.signal,
      });

      if (!response.ok) {
        const errorText = await response.text().catch(() => response.statusText);
        throw new Error(`Piper TTS server error: ${response.status} ${errorText}`);
      }

      const audioBlob = await response.blob();

      // The whole body has arrived; the timeout must not fire during playback.
      clearTimeout(timeoutId);
      if (this.timeoutId === timeoutId) {
        this.timeoutId = null;
      }

      if (audioBlob.size === 0) {
        throw new Error('Received empty audio blob from Piper TTS server');
      }

      // Cache the audio blob
      try {
        const { cacheMedia } = await import('../../services/cache/media-cache.js');
        // NOTE(review): only the first 100 characters of the text are part of
        // the key, so longer texts sharing a prefix map to the same entry.
        const cacheKey = `tts:${voice.id}:${speed}:${text.substring(0, 100)}`;
        await cacheMedia(cacheKey, audioBlob, 'audio');
      } catch (cacheError) {
        // Cache failure is non-critical
        console.debug('Failed to cache TTS audio:', cacheError);
      }

      // stop() or a newer speak() may have arrived while caching.
      if (controller.signal.aborted) {
        console.log('Piper TTS: Request cancelled');
        return;
      }

      audioUrl = URL.createObjectURL(audioBlob);
      this.setupAudioElement(audioUrl, options?.volume ?? 1.0);
      await this.audioElement!.play();
      if (!controller.signal.aborted) {
        this.updateState('playing');
      }
    } catch (error) {
      const isCurrent = this.abortController === controller;
      const cancelled = !timedOut &&
        (controller.signal.aborted || (error instanceof Error && error.name === 'AbortError'));

      if (cancelled) {
        console.log('Piper TTS: Request cancelled');
        if (isCurrent) {
          this.updateState('idle');
        }
        return; // Don't throw on cancellation
      }

      // Playback never started, so nothing else will release the blob URL.
      if (audioUrl !== null) {
        URL.revokeObjectURL(audioUrl);
      }

      // A timeout is a failure the caller needs to hear about, unlike a
      // cancellation requested through stop().
      const err = timedOut
        ? new Error('Piper TTS request timed out')
        : error instanceof Error ? error : new Error('Failed to speak text');

      if (isCurrent) {
        this.updateState('error');
        this.callbacks.onError?.(err);
      }
      throw err;
    } finally {
      clearTimeout(timeoutId);
      if (this.timeoutId === timeoutId) {
        this.timeoutId = null;
      }
      if (this.abortController === controller) {
        this.abortController = null;
      }
    }
  }

  /** Aborts any in-flight request, clears its timeout, then stops playback. */
  async stop(): Promise<void> {
    // Abort ongoing fetch request
    if (this.abortController) {
      console.log('Piper TTS: Aborting request');
      this.abortController.abort();
      this.abortController = null;
    }
    if (this.timeoutId) {
      clearTimeout(this.timeoutId);
      this.timeoutId = null;
    }
    await super.stop();
  }

  /** Stops everything and drops all registered callbacks. */
  destroy(): void {
    void this.stop();
    this.callbacks = {};
  }
}
|
|
|
/**
 * TTS Service
 *
 * Owns the active TTS provider and exposes one interface over all of them.
 * Callbacks registered on the service are forwarded to whichever provider is
 * active, including providers created later.
 */
export class TTSService {
  private provider: TTSProviderInterface | null = null;
  private providerType: TTSProvider = 'webspeech';
  private callbacks: TTSEventCallbacks = {};

  /**
   * Replaces the active provider with a new one of the given type.
   *
   * The previous provider is destroyed and dropped before the new one is
   * created. If creation fails, the service is left without a provider (never
   * with a destroyed one) and `getProviderType()` keeps its previous value.
   *
   * @throws Error when the provider cannot be created: Web Speech missing,
   *   password not entered, API key missing or invalid, or unknown type.
   */
  async initialize(providerType: TTSProvider = 'webspeech'): Promise<void> {
    if (this.provider) {
      this.provider.destroy();
      this.provider = null;
    }

    const provider = await this.createProvider(providerType);
    provider.setCallbacks?.(this.callbacks);

    this.provider = provider;
    this.providerType = providerType;
  }

  /**
   * Reports whether speech output is available.
   *
   * Without an active provider this first tries to set one up, in order:
   * OpenAI (only when a stored key exists), Piper, then Web Speech. Returns
   * `false` when none of them can be initialized.
   *
   * NOTE(review): the OpenAI attempt goes through `initialize('openai')`,
   * which shows a blocking password prompt. `PiperProvider.isAvailable()`
   * does not contact the server, so it may succeed even when no Piper server
   * is running - confirm that this ordering is intended.
   */
  async isAvailable(): Promise<boolean> {
    if (!this.provider) {
      if ((await this.hasStoredOpenAIKey()) && (await this.tryInitialize('openai'))) {
        return true;
      }
      if ((await this.piperLooksAvailable()) && (await this.tryInitialize('piper'))) {
        return true;
      }
      if (!(await this.tryInitialize('webspeech'))) {
        return false;
      }
    }

    return this.provider ? await this.provider.isAvailable() : false;
  }

  /** Lists the active provider's voices, initializing the default provider if needed. */
  async getVoices(): Promise<TTSVoice[]> {
    if (!this.provider) {
      await this.initialize();
    }
    return this.provider ? await this.provider.getVoices() : [];
  }

  /**
   * Speaks `text` with the active provider, initializing the default provider
   * if needed. When the OpenAI provider fails with a quota error, the service
   * switches to Web Speech and retries once.
   *
   * @throws whatever the provider throws, other than a recovered quota error.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    if (!this.provider) {
      await this.initialize();
    }

    const provider = this.provider;
    if (!provider) return;

    try {
      await provider.speak(text, options);
    } catch (error) {
      // Auto-fallback from OpenAI to Web Speech on quota error
      if (this.providerType === 'openai' && TTSService.isQuotaError(error)) {
        console.warn('OpenAI TTS quota exceeded, falling back to Web Speech API');
        await this.initialize('webspeech');
        if (this.provider) {
          await this.provider.speak(text, options);
          return;
        }
      }
      throw error;
    }
  }

  /** Pauses the active provider, if any. */
  async pause(): Promise<void> {
    await this.provider?.pause();
  }

  /** Resumes the active provider, if any. */
  async resume(): Promise<void> {
    await this.provider?.resume();
  }

  /** Stops the active provider, if any. */
  async stop(): Promise<void> {
    await this.provider?.stop();
  }

  /** Returns the active provider's state, or 'idle' without a provider. */
  getState(): TTSState {
    return this.provider?.getState() ?? 'idle';
  }

  /** Returns the active provider's progress, or 0 without a provider. */
  async getProgress(): Promise<number> {
    return this.provider ? await this.provider.getProgress() : 0;
  }

  /** Forwards a seek request to the active provider, if any. */
  async setProgress(position: number): Promise<void> {
    if (this.provider) {
      await this.provider.setProgress(position);
    }
  }

  /** Merges `callbacks` into the service's callbacks and forwards the result to the provider. */
  setCallbacks(callbacks: TTSEventCallbacks): void {
    this.callbacks = { ...this.callbacks, ...callbacks };
    if (this.provider?.setCallbacks) {
      this.provider.setCallbacks(this.callbacks);
    }
  }

  /** Returns the type of the most recently initialized provider. */
  getProviderType(): TTSProvider {
    return this.providerType;
  }

  /** Destroys the active provider and drops all callbacks. */
  destroy(): void {
    if (this.provider) {
      this.provider.destroy();
      this.provider = null;
    }
    this.callbacks = {};
  }

  /** Creates and, where needed, initializes a provider of the given type. */
  private async createProvider(providerType: TTSProvider): Promise<TTSProviderInterface> {
    switch (providerType) {
      case 'webspeech':
        return new WebSpeechProvider();

      case 'openai': {
        const { loadEncryptedApiKey } = await import('../security/api-key-storage.js');
        const password = prompt('Enter your password to access OpenAI API key:');
        if (!password) {
          throw new Error('Password required to access OpenAI API key');
        }

        const apiKey = await loadEncryptedApiKey('tts.openai', password);
        if (!apiKey || !apiKey.startsWith('sk-')) {
          throw new Error('Invalid or missing OpenAI API key');
        }
        return new OpenAIProvider(apiKey);
      }

      case 'piper': {
        const piper = new PiperProvider(TTSService.piperServerUrl());
        await piper.initialize();
        return piper;
      }

      default:
        throw new Error(`Unknown TTS provider: ${providerType}`);
    }
  }

  /** Runs `initialize` and reports success instead of throwing. */
  private async tryInitialize(providerType: TTSProvider): Promise<boolean> {
    try {
      await this.initialize(providerType);
      return true;
    } catch (error) {
      console.debug('TTS provider initialization failed:', providerType, error);
      return false;
    }
  }

  /** Reports whether an OpenAI key is stored; any failure counts as "no". */
  private async hasStoredOpenAIKey(): Promise<boolean> {
    try {
      const { hasApiKey } = await import('../security/api-key-storage.js');
      return Boolean(await hasApiKey('tts.openai'));
    } catch {
      return false;
    }
  }

  /** Asks a throwaway Piper provider whether it considers itself available. */
  private async piperLooksAvailable(): Promise<boolean> {
    try {
      return await new PiperProvider(TTSService.piperServerUrl()).isAvailable();
    } catch {
      return false;
    }
  }

  /** Piper server URL from localStorage, defaulting to the local server. */
  private static piperServerUrl(): string {
    return localStorage.getItem('piper_tts_server_url') || 'http://localhost:5000';
  }

  /** True for errors flagged with a truthy `isQuotaError` property. */
  private static isQuotaError(error: unknown): boolean {
    return error instanceof Error && Boolean((error as { isQuotaError?: unknown }).isQuotaError);
  }
}
|
|
|
// Lazily created instance shared by every caller of getTTSService()
let ttsServiceInstance: TTSService | null = null;

/**
 * Returns the shared TTSService, constructing it on the first call.
 */
export function getTTSService(): TTSService {
  return ttsServiceInstance ?? (ttsServiceInstance = new TTSService());
}
|
|
|