You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

661 lines
20 KiB

/**
* TTS Service
* Manages text-to-speech with multiple provider support
*/
import type { TTSProvider, TTSProviderInterface, TTSOptions, TTSState, TTSVoice, TTSEventCallbacks } from './types.js';
/**
 * Base class for audio-based TTS providers.
 *
 * Owns at most one `HTMLAudioElement` together with the object URL that backs
 * it, translates the element's media events into `TTSState` changes and
 * callbacks, and guarantees that the URL is revoked on every teardown path
 * (natural end, playback error, `stop()`, or replacement by a new element).
 */
abstract class AudioProvider implements TTSProviderInterface {
  protected state: TTSState = 'idle';
  protected callbacks: TTSEventCallbacks = {};
  protected audioElement: HTMLAudioElement | null = null;
  /** Object URL currently loaded into `audioElement`; revoked by `releaseAudioElement()`. */
  private activeAudioUrl: string | null = null;

  abstract readonly name: string;
  abstract readonly type: TTSProvider;

  /** Stores the new state and notifies `onStateChange`, but only on an actual change. */
  protected updateState(newState: TTSState): void {
    if (this.state !== newState) {
      this.state = newState;
      this.callbacks.onStateChange?.(newState);
    }
  }

  /**
   * Creates the audio element for `audioUrl` and wires its events to the
   * provider state. Any element that is still active is released first.
   *
   * @param audioUrl Object URL of the synthesized audio; revoked automatically
   *   once the element is released.
   * @param volume Playback volume; clamped to [0, 1] because assigning a value
   *   outside that range to `HTMLMediaElement.volume` throws.
   */
  protected setupAudioElement(audioUrl: string, volume: number = 1.0): void {
    this.releaseAudioElement();

    const element = new Audio(audioUrl);
    element.volume = Number.isFinite(volume) ? Math.min(1, Math.max(0, volume)) : 1.0;
    element.onplay = () => this.updateState('playing');
    element.onpause = () => this.updateState('paused');
    element.onended = () => {
      this.updateState('idle');
      this.releaseAudioElement();
      this.callbacks.onEnd?.();
    };
    element.onerror = () => {
      this.updateState('error');
      this.releaseAudioElement();
      this.callbacks.onError?.(new Error('Audio playback failed'));
    };
    element.ontimeupdate = () => {
      if (element.duration) {
        this.callbacks.onProgress?.(element.currentTime / element.duration);
      }
    };

    this.audioElement = element;
    this.activeAudioUrl = audioUrl;
  }

  /**
   * Detaches all handlers from the active element, forgets it and revokes its
   * object URL. Detaching matters because media events are dispatched
   * asynchronously: without it, an event queued by an old element could still
   * change the provider state after the element was discarded.
   */
  private releaseAudioElement(): void {
    const element = this.audioElement;
    if (element) {
      element.onplay = null;
      element.onpause = null;
      element.onended = null;
      element.onerror = null;
      element.ontimeupdate = null;
      this.audioElement = null;
    }
    if (this.activeAudioUrl !== null) {
      URL.revokeObjectURL(this.activeAudioUrl);
      this.activeAudioUrl = null;
    }
  }

  /** Pauses playback; a no-op unless audio is currently playing. */
  async pause(): Promise<void> {
    if (this.audioElement && this.state === 'playing') {
      this.audioElement.pause();
    }
  }

  /** Resumes paused playback; rejects if the element's `play()` rejects. */
  async resume(): Promise<void> {
    if (this.audioElement && this.state === 'paused') {
      await this.audioElement.play();
    }
  }

  /**
   * Stops playback, discards the audio element and returns to `idle`.
   * The whole body runs synchronously, so callers that do not await the
   * returned promise still observe the stopped state immediately.
   */
  async stop(): Promise<void> {
    const element = this.audioElement;
    if (element) {
      // Release before pausing: the `pause` event fires after this method
      // returns and must not move the state from `idle` to `paused`.
      this.releaseAudioElement();
      element.pause();
      element.currentTime = 0;
    }
    this.updateState('idle');
  }

  getState(): TTSState {
    return this.state;
  }

  /** Returns playback progress as `currentTime / duration`, or 0 when no duration is known. */
  async getProgress(): Promise<number> {
    if (this.audioElement?.duration) {
      return this.audioElement.currentTime / this.audioElement.duration;
    }
    return 0;
  }

  /**
   * Seeks to a fractional position of the audio.
   *
   * @param position Fraction of the total duration; clamped to [0, 1].
   *   Non-finite values are ignored, as is the call when the duration is
   *   unknown or not finite (assigning a non-finite `currentTime` throws).
   */
  async setProgress(position: number): Promise<void> {
    const element = this.audioElement;
    if (!element || !Number.isFinite(position)) return;
    const duration = element.duration;
    if (!duration || !Number.isFinite(duration)) return;
    element.currentTime = Math.min(1, Math.max(0, position)) * duration;
  }

  /** Merges `callbacks` into the already registered ones. */
  setCallbacks(callbacks: TTSEventCallbacks): void {
    this.callbacks = { ...this.callbacks, ...callbacks };
  }

  abstract isAvailable(): Promise<boolean>;
  abstract getVoices(): Promise<TTSVoice[]>;
  abstract speak(text: string, options?: TTSOptions): Promise<void>;
  abstract destroy(): void;
}
/**
 * Web Speech API TTS Provider.
 *
 * Speaks through `window.speechSynthesis`. Only one utterance is tracked at a
 * time; events from utterances that were stopped or replaced are ignored so
 * they cannot overwrite the state of the current one.
 */
class WebSpeechProvider implements TTSProviderInterface {
  readonly name = 'Web Speech API';
  readonly type: TTSProvider = 'webspeech';
  private synth: SpeechSynthesis;
  /** The utterance whose events are currently allowed to drive the state. */
  private utterance: SpeechSynthesisUtterance | null = null;
  private state: TTSState = 'idle';
  private callbacks: TTSEventCallbacks = {};

  /** @throws Error when `window.speechSynthesis` does not exist. */
  constructor() {
    if (typeof window === 'undefined' || !('speechSynthesis' in window)) {
      throw new Error('Web Speech API not available');
    }
    this.synth = window.speechSynthesis;
    this.synth.onvoiceschanged = () => this.loadVoices();
    this.loadVoices();
  }

  private loadVoices(): void {
    // Intentionally empty: voices are read on demand via synth.getVoices().
  }

  async isAvailable(): Promise<boolean> {
    return typeof window !== 'undefined' && 'speechSynthesis' in window;
  }

  /**
   * Lists the browser voices. Gender is guessed from the voice name;
   * 'female' is tested first because it contains the substring 'male'.
   */
  async getVoices(): Promise<TTSVoice[]> {
    const voices = this.synth.getVoices();
    return voices.map(voice => ({
      id: voice.voiceURI,
      name: voice.name,
      lang: voice.lang,
      gender: voice.name.toLowerCase().includes('female') ? 'female' :
        voice.name.toLowerCase().includes('male') ? 'male' : 'neutral',
      provider: 'webspeech'
    }));
  }

  /**
   * Picks a voice for `lang`: a Google voice if present, else one whose name
   * contains 'neural', else the first voice of that language. Falls back to
   * the first voice of any language, or null when the browser has none.
   */
  private getBestVoice(lang: string = 'en'): SpeechSynthesisVoice | null {
    const voices = this.synth.getVoices();
    const langPrefix = lang.split('-')[0];
    const langVoices = voices.filter(v => v.lang.startsWith(langPrefix));
    if (langVoices.length === 0) return voices[0] || null;
    return langVoices.find(v =>
      v.name.toLowerCase().includes('google') ||
      v.voiceURI.toLowerCase().includes('google')
    ) || langVoices.find(v => v.name.toLowerCase().includes('neural')) || langVoices[0];
  }

  /**
   * Stops whatever is being spoken and starts speaking `text`.
   * Blank text only stops the current speech.
   *
   * @param text Text to speak.
   * @param options Optional voice, speed (rate), pitch and volume. A voice is
   *   honoured only if it belongs to this provider; otherwise the best
   *   available voice is chosen.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    this.cancelCurrent();
    if (!text.trim()) return;

    const utterance = new SpeechSynthesisUtterance(text);

    const requested = options?.voice;
    if (requested?.provider === 'webspeech') {
      const voice = this.synth.getVoices().find(v => v.voiceURI === requested.id);
      if (voice) utterance.voice = voice;
    } else {
      const bestVoice = this.getBestVoice();
      if (bestVoice) utterance.voice = bestVoice;
    }
    utterance.rate = options?.speed ?? 1.0;
    utterance.pitch = options?.pitch ?? 1.0;
    utterance.volume = options?.volume ?? 1.0;
    if (utterance.voice) {
      utterance.lang = utterance.voice.lang;
    }

    // Every handler first checks that its utterance is still the current one:
    // cancel() makes the browser fire `error`/`end` on the old utterance
    // asynchronously, possibly after a newer utterance has already started.
    utterance.onstart = () => {
      if (this.utterance === utterance) this.updateState('playing');
    };
    utterance.onend = () => {
      if (this.utterance !== utterance) return;
      this.updateState('idle');
      this.utterance = null;
      this.callbacks.onEnd?.();
    };
    utterance.onerror = (event) => {
      if (this.utterance !== utterance) return;
      this.utterance = null;
      // 'canceled' and 'interrupted' are reported when cancel() was called on
      // the synthesizer; that is a stop, not a synthesis failure.
      if (event.error === 'canceled' || event.error === 'interrupted') {
        this.updateState('idle');
        return;
      }
      this.updateState('error');
      this.callbacks.onError?.(new Error(`Speech synthesis error: ${event.error}`));
    };

    this.utterance = utterance;
    this.synth.speak(utterance);
    this.updateState('playing');
  }

  async pause(): Promise<void> {
    if (this.state === 'playing' && this.synth.speaking) {
      this.synth.pause();
      this.updateState('paused');
    }
  }

  async resume(): Promise<void> {
    if (this.state === 'paused' && this.synth.paused) {
      this.synth.resume();
      this.updateState('playing');
    }
  }

  /** Cancels any spoken, paused or queued utterance and returns to `idle`. */
  async stop(): Promise<void> {
    this.cancelCurrent();
  }

  getState(): TTSState {
    return this.state;
  }

  async getProgress(): Promise<number> {
    return 0; // Web Speech API doesn't provide progress
  }

  async setProgress(position: number): Promise<void> {
    // Not supported - would need to stop and restart
  }

  /** Stops speech (still notifying `onStateChange`) and then drops all callbacks. */
  destroy(): void {
    this.cancelCurrent();
    this.callbacks = {};
  }

  /** Merges `callbacks` into the already registered ones. */
  setCallbacks(callbacks: TTSEventCallbacks): void {
    this.callbacks = { ...this.callbacks, ...callbacks };
  }

  /**
   * Synchronous core of `stop()`. The current utterance is forgotten before
   * `cancel()` is called so that the events it triggers are treated as stale.
   */
  private cancelCurrent(): void {
    this.utterance = null;
    if (this.synth.speaking || this.synth.paused || this.synth.pending) {
      this.synth.cancel();
    }
    this.updateState('idle');
  }

  private updateState(newState: TTSState): void {
    if (this.state !== newState) {
      this.state = newState;
      this.callbacks.onStateChange?.(newState);
    }
  }
}
/**
 * OpenAI TTS Provider.
 *
 * Synthesizes speech through the OpenAI `/v1/audio/speech` endpoint and plays
 * the returned audio. An in-flight request is cancelled by `stop()` and by
 * any newer `speak()` call, so audio never starts after it was stopped.
 */
class OpenAIProvider extends AudioProvider {
  readonly name = 'OpenAI TTS';
  readonly type: TTSProvider = 'openai';
  private apiKey: string | null = null;
  /** Controller of the request currently in flight, if any. */
  private abortController: AbortController | null = null;

  constructor(apiKey?: string) {
    super();
    this.apiKey = apiKey || null;
  }

  setApiKey(apiKey: string): void {
    this.apiKey = apiKey;
  }

  /** True when an API key is set and the code runs where `window` exists. */
  async isAvailable(): Promise<boolean> {
    return this.apiKey !== null && typeof window !== 'undefined';
  }

  async getVoices(): Promise<TTSVoice[]> {
    return [
      { id: 'alloy', name: 'Alloy', lang: 'en', provider: 'openai' },
      { id: 'echo', name: 'Echo', lang: 'en', provider: 'openai' },
      { id: 'fable', name: 'Fable', lang: 'en', provider: 'openai' },
      { id: 'onyx', name: 'Onyx', lang: 'en', provider: 'openai' },
      { id: 'nova', name: 'Nova', lang: 'en', provider: 'openai' },
      { id: 'shimmer', name: 'Shimmer', lang: 'en', provider: 'openai' }
    ];
  }

  /**
   * Stops current playback, requests audio for `text` and plays it.
   * Blank text only stops the current playback.
   *
   * @param text Text to synthesize.
   * @param options Optional voice, speed (clamped to 0.25–4.0) and volume.
   *   A voice that belongs to a different provider is ignored in favour of
   *   the first OpenAI voice, since its id would not be valid for this API.
   * @throws Error when no API key is set, when the API responds with an
   *   error (quota errors carry `isQuotaError = true`), or when playback
   *   cannot start. A request cancelled by `stop()` resolves without throwing.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    if (!this.apiKey) {
      throw new Error('OpenAI API key not set');
    }
    // stop() does all of its work synchronously, so it need not be awaited.
    void this.stop();
    if (!text.trim()) return;

    // Register the controller before the first await so that a stop() issued
    // at any later point reliably cancels this request.
    const controller = new AbortController();
    this.abortController = controller;

    const speed = Math.max(0.25, Math.min(4.0, options?.speed ?? 1.0));
    try {
      this.updateState('synthesizing');
      const requested = options?.voice;
      const voice = requested?.provider === 'openai' ? requested : (await this.getVoices())[0];

      const response = await fetch('https://api.openai.com/v1/audio/speech', {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${this.apiKey}`,
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({
          model: 'tts-1',
          input: text,
          voice: voice.id,
          speed
        }),
        signal: controller.signal
      });
      if (!response.ok) {
        const payload = await response.json().catch(() => ({ error: { message: 'Unknown error' } }));
        const rawMessage: unknown = payload?.error?.message;
        // The error body is untrusted JSON; only use the message if it is a string.
        const errorMessage = typeof rawMessage === 'string' && rawMessage ? rawMessage : response.statusText;
        if (response.status === 429 || errorMessage.toLowerCase().includes('quota')) {
          throw Object.assign(new Error(`OpenAI TTS quota exceeded: ${errorMessage}`), { isQuotaError: true });
        }
        throw new Error(`OpenAI TTS error: ${errorMessage}`);
      }

      const audioBlob = await response.blob();
      if (controller.signal.aborted) return;

      const audioUrl = URL.createObjectURL(audioBlob);
      this.setupAudioElement(audioUrl, options?.volume ?? 1.0);
      const element = this.audioElement;
      if (!element) {
        throw new Error('Audio element was not created');
      }
      await element.play();
      if (!controller.signal.aborted) {
        this.updateState('playing');
      }
    } catch (error) {
      // Cancelled by stop() or a newer speak(): whoever cancelled already set
      // the state, so leave it untouched and do not report an error.
      if (controller.signal.aborted) return;
      this.updateState('error');
      const err = error instanceof Error ? error : new Error('Failed to speak text');
      this.callbacks.onError?.(err);
      throw err;
    } finally {
      // Only clear the field if a newer request has not replaced it.
      if (this.abortController === controller) {
        this.abortController = null;
      }
    }
  }

  /** Cancels an in-flight synthesis request, then stops playback. */
  async stop(): Promise<void> {
    if (this.abortController) {
      this.abortController.abort();
      this.abortController = null;
    }
    await super.stop();
  }

  /** Stops everything and forgets callbacks and the API key. */
  destroy(): void {
    void this.stop();
    this.callbacks = {};
    this.apiKey = null;
  }
}
/**
 * Piper TTS Provider.
 *
 * Sends text to the `/api/piper-tts` endpoint and plays the returned audio.
 * Each request owns its own AbortController and timeout, so a request that
 * was cancelled or superseded can never disturb the request that replaced it.
 */
class PiperProvider extends AudioProvider {
  readonly name = 'Piper TTS';
  readonly type: TTSProvider = 'piper';
  private serverUrl: string | null = null;
  /** Controller of the request currently in flight, if any. */
  private abortController: AbortController | null = null;
  /** Timeout handle belonging to `abortController`'s request. */
  private timeoutId: ReturnType<typeof setTimeout> | null = null;
  private readonly defaultVoices: TTSVoice[] = [
    // English (US) - all quality levels
    { id: 'en_US-lessac-low', name: 'English (US) - Lessac Low', lang: 'en-US', provider: 'piper' },
    { id: 'en_US-lessac-medium', name: 'English (US) - Lessac Medium', lang: 'en-US', provider: 'piper' },
    { id: 'en_US-lessac-high', name: 'English (US) - Lessac High', lang: 'en-US', provider: 'piper' },
    // English (GB)
    { id: 'en_GB-alba-medium', name: 'English (GB) - Alba Medium', lang: 'en-GB', provider: 'piper' },
    // German
    { id: 'de_DE-thorsten-low', name: 'German - Thorsten Low', lang: 'de-DE', provider: 'piper' },
    { id: 'de_DE-thorsten-medium', name: 'German - Thorsten Medium', lang: 'de-DE', provider: 'piper' },
    // French
    { id: 'fr_FR-siwis-low', name: 'French - Siwis Low', lang: 'fr-FR', provider: 'piper' },
    { id: 'fr_FR-siwis-medium', name: 'French - Siwis Medium', lang: 'fr-FR', provider: 'piper' },
    // Spanish
    { id: 'es_ES-davefx-medium', name: 'Spanish - Davefx Medium', lang: 'es-ES', provider: 'piper' },
    // Italian - riccardo voices not available
    // Russian
    { id: 'ru_RU-ruslan-medium', name: 'Russian - Ruslan Medium', lang: 'ru-RU', provider: 'piper' },
    // Chinese
    { id: 'zh_CN-huayan-medium', name: 'Chinese - Huayan Medium', lang: 'zh-CN', provider: 'piper' },
    // Arabic - hafez voice not available
    // Polish
    { id: 'pl_PL-darkman-medium', name: 'Polish - Darkman Medium', lang: 'pl-PL', provider: 'piper' },
    // Portuguese - edresson voice not available
    // Dutch
    { id: 'nl_NL-mls-medium', name: 'Dutch - MLS Medium', lang: 'nl-NL', provider: 'piper' },
    // Czech
    { id: 'cs_CZ-jirka-medium', name: 'Czech - Jirka Medium', lang: 'cs-CZ', provider: 'piper' },
    // Turkish
    { id: 'tr_TR-dfki-medium', name: 'Turkish - DFKI Medium', lang: 'tr-TR', provider: 'piper' },
    // Japanese - nanami voice not available
    // Korean - kyungha voice not available
  ];

  constructor(serverUrl?: string) {
    super();
    this.serverUrl = serverUrl || null;
  }

  /** True in a window context when a server URL is set or Web Workers exist. */
  async isAvailable(): Promise<boolean> {
    if (typeof window === 'undefined') return false;
    return this.serverUrl !== null || typeof Worker !== 'undefined';
  }

  async getVoices(): Promise<TTSVoice[]> {
    return this.defaultVoices;
  }

  async initialize(): Promise<void> {
    // Server-based synthesis doesn't need initialization
  }

  /**
   * Stops current playback, requests audio for `text` and plays it.
   * Blank text only stops the current playback.
   *
   * @param text Text to synthesize.
   * @param options Optional voice, speed (clamped to 0.25–2.0) and volume.
   *   A voice that belongs to a different provider is ignored in favour of
   *   the first default Piper voice.
   * @throws Error when no server URL is configured, the server responds with
   *   an error or an empty body, the request exceeds the 5 minute timeout, or
   *   playback cannot start. A request cancelled by `stop()` resolves without
   *   throwing.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    // stop() does all of its work synchronously, so it need not be awaited.
    void this.stop();
    if (!text.trim()) return;
    // Validate before entering 'synthesizing' so a configuration error
    // cannot leave the provider stuck in that state.
    if (!this.serverUrl) {
      throw new Error('Piper TTS server URL not configured');
    }

    const requested = options?.voice;
    const usesRequestedVoice = requested?.provider === 'piper';
    const voice = requested && usesRequestedVoice ? requested : this.defaultVoices[0];
    const speed = Math.max(0.25, Math.min(2.0, options?.speed ?? 1.0));
    console.log('PiperProvider: Using voice:', voice.id, 'from options:', usesRequestedVoice ? voice.id : 'default');

    // Request-local handles: cleanup below compares against these instead of
    // blindly clearing the instance fields, which may already belong to a
    // newer request.
    const controller = new AbortController();
    let timedOut = false;
    const timeoutId = setTimeout(() => {
      console.log('Piper TTS: Request timeout');
      timedOut = true;
      controller.abort();
    }, 300000); // 5 minutes
    this.abortController = controller;
    this.timeoutId = timeoutId;
    this.updateState('synthesizing');

    try {
      // NOTE(review): the request goes to a fixed relative endpoint and does
      // not use this.serverUrl — presumably the endpoint proxies to the
      // configured server; confirm.
      const response = await fetch('/api/piper-tts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, voice: voice.id, speed }),
        signal: controller.signal,
      });
      // The timeout only guards the wait for the response.
      clearTimeout(timeoutId);

      if (!response.ok) {
        const errorText = await response.text().catch(() => response.statusText);
        throw new Error(`Piper TTS server error: ${response.status} ${errorText}`);
      }
      const audioBlob = await response.blob();
      if (audioBlob.size === 0) {
        throw new Error('Received empty audio blob from Piper TTS server');
      }

      // Cache the audio blob
      try {
        const { cacheMedia } = await import('../../services/cache/media-cache.js');
        // Create a cache key from text + voice + speed for TTS.
        // NOTE(review): only the first 100 characters of the text enter the
        // key, so longer texts sharing that prefix map to the same key.
        const cacheKey = `tts:${voice.id}:${speed}:${text.substring(0, 100)}`;
        await cacheMedia(cacheKey, audioBlob, 'audio');
      } catch (cacheError) {
        // Cache failure is non-critical
        console.debug('Failed to cache TTS audio:', cacheError);
      }

      // stop() may have been called while the audio was being cached.
      if (controller.signal.aborted) return;

      const audioUrl = URL.createObjectURL(audioBlob);
      this.setupAudioElement(audioUrl, options?.volume ?? 1.0);
      const element = this.audioElement;
      if (!element) {
        throw new Error('Audio element was not created');
      }
      await element.play();
      if (!controller.signal.aborted) {
        this.updateState('playing');
      }
    } catch (error) {
      if (controller.signal.aborted && !timedOut) {
        // Cancelled by stop() or a newer speak(): whoever cancelled already
        // set the state, so leave it untouched.
        console.log('Piper TTS: Request cancelled');
        return; // Don't throw on cancellation
      }
      this.updateState('error');
      const err = timedOut
        ? new Error('Piper TTS request timed out')
        : error instanceof Error ? error : new Error('Failed to speak text');
      this.callbacks.onError?.(err);
      throw err;
    } finally {
      clearTimeout(timeoutId);
      // Only clear the fields if a newer request has not replaced them.
      if (this.abortController === controller) {
        this.abortController = null;
        this.timeoutId = null;
      }
    }
  }

  /** Cancels an in-flight request and its timeout, then stops playback. */
  async stop(): Promise<void> {
    // Abort ongoing fetch request
    if (this.abortController) {
      console.log('Piper TTS: Aborting request');
      this.abortController.abort();
      this.abortController = null;
    }
    if (this.timeoutId) {
      clearTimeout(this.timeoutId);
      this.timeoutId = null;
    }
    await super.stop();
  }

  /** Stops everything and forgets all callbacks. */
  destroy(): void {
    void this.stop();
    this.callbacks = {};
  }
}
/**
 * TTS Service.
 *
 * Manages the active TTS provider and exposes one unified interface to it.
 * Callbacks registered on the service are forwarded to whichever provider is
 * active, including providers created later.
 */
export class TTSService {
  private provider: TTSProviderInterface | null = null;
  private providerType: TTSProvider = 'webspeech';
  private callbacks: TTSEventCallbacks = {};

  /**
   * Switches to the given provider.
   *
   * The new provider is built first and the previous one is destroyed only
   * after that succeeded. If building fails, the service keeps its previous
   * provider and `getProviderType()` keeps reporting it, instead of being
   * left with a destroyed provider under the wrong type.
   *
   * @param providerType Provider to activate; defaults to 'webspeech'.
   * @throws Error when the provider cannot be created (unknown type, missing
   *   password or API key, Web Speech API unavailable).
   */
  async initialize(providerType: TTSProvider = 'webspeech'): Promise<void> {
    const next = await this.createProvider(providerType);
    next.setCallbacks?.(this.callbacks);

    this.provider?.destroy();
    this.provider = next;
    this.providerType = providerType;
  }

  /** Builds a provider of the requested type without touching service state. */
  private async createProvider(providerType: TTSProvider): Promise<TTSProviderInterface> {
    switch (providerType) {
      case 'webspeech':
        return new WebSpeechProvider();
      case 'openai': {
        const { loadEncryptedApiKey } = await import('../security/api-key-storage.js');
        // The stored key is encrypted; the user is asked for the password.
        const password = prompt('Enter your password to access OpenAI API key:');
        if (!password) {
          throw new Error('Password required to access OpenAI API key');
        }
        const apiKey = await loadEncryptedApiKey('tts.openai', password);
        if (!apiKey || !apiKey.startsWith('sk-')) {
          throw new Error('Invalid or missing OpenAI API key');
        }
        return new OpenAIProvider(apiKey);
      }
      case 'piper': {
        const piper = new PiperProvider(this.getPiperServerUrl());
        await piper.initialize();
        return piper;
      }
      default:
        throw new Error(`Unknown TTS provider: ${String(providerType)}`);
    }
  }

  /** Reads the configured Piper server URL, defaulting to a local server. */
  private getPiperServerUrl(): string {
    return localStorage.getItem('piper_tts_server_url') || 'http://localhost:5000';
  }

  /**
   * Reports whether speech output is possible.
   *
   * When no provider is active yet, tries to set one up in this order:
   * OpenAI (if a key is stored), Piper, Web Speech. Failures of one candidate
   * fall through to the next.
   *
   * @returns false — rather than throwing — when no provider can be set up.
   */
  async isAvailable(): Promise<boolean> {
    if (!this.provider) {
      // Try to auto-initialize with best available provider
      try {
        const { hasApiKey } = await import('../security/api-key-storage.js');
        if (await hasApiKey('tts.openai')) {
          await this.initialize('openai');
          return true;
        }
      } catch {
        // Key storage unavailable or OpenAI setup failed: try the next provider.
      }

      try {
        // NOTE(review): a default URL is always supplied here, so this probe
        // appears to succeed in any window context even when no Piper server
        // is running — confirm that this is intended.
        const probe = new PiperProvider(this.getPiperServerUrl());
        if (await probe.isAvailable()) {
          await this.initialize('piper');
          return true;
        }
      } catch {
        // Piper setup failed: try the next provider.
      }

      try {
        await this.initialize('webspeech');
      } catch {
        // The Web Speech provider cannot be constructed where the API does
        // not exist; that means "not available", not an error.
        return false;
      }
    }
    return this.provider ? await this.provider.isAvailable() : false;
  }

  /** Lists the active provider's voices, initializing the default provider if needed. */
  async getVoices(): Promise<TTSVoice[]> {
    if (!this.provider) {
      await this.initialize();
    }
    return this.provider ? await this.provider.getVoices() : [];
  }

  /**
   * Speaks `text` with the active provider, initializing the default
   * provider if needed. When OpenAI reports an exhausted quota, the service
   * switches to the Web Speech API and speaks the text with it instead.
   *
   * @throws Whatever the provider throws, except for the handled quota case.
   */
  async speak(text: string, options?: TTSOptions): Promise<void> {
    if (!this.provider) {
      await this.initialize();
    }
    const active = this.provider;
    if (!active) return;

    try {
      await active.speak(text, options);
    } catch (error) {
      // Auto-fallback from OpenAI to Web Speech on quota error
      const isQuotaError = error instanceof Error &&
        Boolean((error as Error & { isQuotaError?: unknown }).isQuotaError);
      if (isQuotaError && this.providerType === 'openai') {
        console.warn('OpenAI TTS quota exceeded, falling back to Web Speech API');
        await this.initialize('webspeech');
        if (this.provider) {
          await this.provider.speak(text, options);
          return;
        }
      }
      throw error;
    }
  }

  async pause(): Promise<void> {
    await this.provider?.pause();
  }

  async resume(): Promise<void> {
    await this.provider?.resume();
  }

  async stop(): Promise<void> {
    await this.provider?.stop();
  }

  /** State of the active provider, or 'idle' when there is none. */
  getState(): TTSState {
    return this.provider?.getState() ?? 'idle';
  }

  /** Progress of the active provider, or 0 when there is none. */
  async getProgress(): Promise<number> {
    return this.provider ? await this.provider.getProgress() : 0;
  }

  async setProgress(position: number): Promise<void> {
    if (this.provider) {
      await this.provider.setProgress(position);
    }
  }

  /** Merges `callbacks` into the registered ones and forwards the result to the active provider. */
  setCallbacks(callbacks: TTSEventCallbacks): void {
    this.callbacks = { ...this.callbacks, ...callbacks };
    if (this.provider?.setCallbacks) {
      this.provider.setCallbacks(this.callbacks);
    }
  }

  /** Type of the provider that was most recently activated successfully. */
  getProviderType(): TTSProvider {
    return this.providerType;
  }

  /** Destroys the active provider and forgets all callbacks. */
  destroy(): void {
    if (this.provider) {
      this.provider.destroy();
      this.provider = null;
    }
    this.callbacks = {};
  }
}
// Shared service instance, created lazily on first request
let ttsServiceInstance: TTSService | null = null;

/**
 * Returns the module-wide TTSService, constructing it the first time it is
 * asked for. Every later call returns that same instance.
 */
export function getTTSService(): TTSService {
  if (ttsServiceInstance !== null) {
    return ttsServiceInstance;
  }
  const created = new TTSService();
  ttsServiceInstance = created;
  return created;
}