You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1051 lines
42 KiB
1051 lines
42 KiB
/** Body of a POST to the Piper TTS proxy endpoint. */
interface TTSRequest {
  // Text to synthesize; required, and filtered/sentence-joined before synthesis.
  text: string;
  // Optional Piper voice name; when omitted the handler auto-detects one from the text's language.
  voice?: string;
  // Optional speed multiplier; the Wyoming request omits it when undefined or exactly 1.0 — presumably 1.0 is normal speed (TODO confirm against Piper docs).
  speed?: number;
}
|
|
|
const PIPER_TTS_DEBUG = process.env.PIPER_TTS_DEBUG === '1'; |
|
|
|
function piperDebug(...args: unknown[]) { |
|
if (PIPER_TTS_DEBUG) console.log(...args); |
|
} |
|
|
|
const WYOMING_DOWN_COOLDOWN_MS = 60_000; |
|
let lastWyomingDownLog = 0; |
|
|
|
function isWyomingUnreachableMessage(msg: string): boolean { |
|
return /ECONNREFUSED|connection refused|ENOTFOUND|ETIMEDOUT/i.test(msg); |
|
} |
|
|
|
function logWyomingUnreachableThrottled(host: string, port: number, detail?: string) { |
|
const now = Date.now(); |
|
if (now - lastWyomingDownLog < WYOMING_DOWN_COOLDOWN_MS) return; |
|
lastWyomingDownLog = now; |
|
console.warn( |
|
`[Piper TTS] Wyoming/Piper not reachable at ${host}:${port}${detail ? ` — ${detail}` : ''}. ` + |
|
`Start the Wyoming Piper service or set PIPER_TTS_HOST / PIPER_TTS_PORT. ` + |
|
`Suppressing similar messages for ${WYOMING_DOWN_COOLDOWN_MS / 1000}s. Use PIPER_TTS_DEBUG=1 for verbose logs.` |
|
); |
|
} |
|
|
|
/**
 * Proxy endpoint for Piper TTS using the Wyoming protocol (TCP).
 * Wyoming protocol: newline-delimited JSON messages, then raw binary audio.
 *
 * HTTP handler (plain `Request` / `Response`); bridged to Wyoming Piper over TCP.
 * Returns 400 for missing/filtered-empty text, 500 for other failures, and
 * otherwise a streamed `audio/wav` response.
 */
export async function handlePiperTtsPost(request: Request): Promise<Response> {
  piperDebug('Piper TTS API: Request received');
  try {
    // NOTE(review): the JSON body is trusted to match TTSRequest — no runtime schema validation.
    const body: TTSRequest = await request.json();
    const { text, voice, speed } = body;

    piperDebug('Piper TTS API: Processing request', {
      textLength: text?.length,
      voice,
      speed,
      voiceType: typeof voice,
      voiceValue: voice
    });

    // Reject missing or whitespace-only text.
    if (!text?.trim()) {
      console.error('Piper TTS API: Missing text field');
      return errorResponse(400, 'Missing required field: text');
    }

    // Filter and prepare text (strips URLs, bech32/hex identifiers, markup, emoji).
    const filteredText = filterCryptographicContent(text);
    if (!filteredText.trim()) {
      console.warn('Piper TTS API: Text is empty after filtering');
      return errorResponse(400, 'Text contains only cryptographic addresses/IDs that cannot be read aloud');
    }

    // Split, drop empty fragments, and re-join into a single synthesize request.
    const sentences = splitIntoSentences(filteredText);
    const fullText = sentences.filter(s => s.trim().length > 0).join(' ');
    piperDebug(`Piper TTS API: Processing ${sentences.length} sentences, total length: ${fullText.length}`);

    // Use provided voice, or auto-detect language and select voice if not provided
    let selectedVoice = voice;
    if (!selectedVoice || selectedVoice.trim() === '') {
      const detectedLang = detectLanguage(fullText);
      selectedVoice = getVoiceForLanguage(detectedLang);
      piperDebug(`Piper TTS API: No voice provided, auto-detected language: ${detectedLang}, selected voice: ${selectedVoice}`);
    } else {
      piperDebug(`Piper TTS API: Using provided voice: ${selectedVoice}`);
    }

    // Stream audio response with cancellation support.
    const abortController = new AbortController();
    // Populated by synthesizeWithWyoming so cancel() can tear down the TCP socket.
    let wyomingCleanup: (() => void) | null = null;

    const stream = new ReadableStream({
      async start(controller) {
        const tcpConfig = getTcpConfig();
        try {
          // All audio is collected before anything is enqueued: the WAV header
          // needs the total data size, so the header is written only at the end.
          const audioChunks: Uint8Array[] = [];
          let audioFormat: { rate: number; width: number; channels: number } | null = null;
          let totalBytes = 0;

          piperDebug('Piper TTS API: Connecting to Wyoming server at', tcpConfig.hostname, 'port', tcpConfig.port);

          await synthesizeWithWyoming(
            tcpConfig,
            fullText,
            selectedVoice,
            speed,
            abortController.signal,
            (cleanup) => {
              wyomingCleanup = cleanup;
            },
            // Chunk callback: the first invocation may also carry the PCM format.
            (chunk: Uint8Array, format?: { rate: number; width: number; channels: number }) => {
              if (abortController.signal.aborted) return;

              if (format && !audioFormat) {
                audioFormat = format;
                piperDebug('Piper TTS API: Received audio format:', format);
              }
              if (chunk.length > 0) {
                audioChunks.push(chunk);
                totalBytes += chunk.length;
              }
            }
          );

          if (abortController.signal.aborted) {
            piperDebug('Piper TTS API: Synthesis aborted');
            controller.close();
            return;
          }

          if (!audioFormat || totalBytes === 0) {
            throw new Error('No audio data received from Wyoming server');
          }

          piperDebug('Piper TTS API: Collected audio, total size:', totalBytes, 'bytes');

          // Cast is needed because audioFormat is assigned inside a callback,
          // which defeats TypeScript's null-narrowing after the guard above.
          const format = audioFormat as { rate: number; width: number; channels: number };
          const wavHeader = createWavHeader(format.rate, format.width, format.channels, totalBytes);
          controller.enqueue(wavHeader);

          for (const chunk of audioChunks) {
            if (abortController.signal.aborted) break;
            controller.enqueue(chunk);
          }

          controller.close();
        } catch (error) {
          if (abortController.signal.aborted) {
            piperDebug('Piper TTS API: Operation cancelled');
            controller.close();
          } else {
            const msg = error instanceof Error ? error.message : String(error);
            // Connection-level failures are throttled to avoid log spam when the
            // Piper service is down; everything else is logged in full.
            if (isWyomingUnreachableMessage(msg)) {
              logWyomingUnreachableThrottled(tcpConfig.hostname, tcpConfig.port, msg);
              if (PIPER_TTS_DEBUG) console.error('Piper TTS API: Streaming error:', error);
            } else {
              console.error('Piper TTS API: Streaming error:', error);
            }
            controller.error(error);
          }
        }
      },
      // Invoked when the HTTP client disconnects mid-stream.
      cancel() {
        piperDebug('Piper TTS API: Stream cancelled by client');
        abortController.abort();
        if (wyomingCleanup) {
          wyomingCleanup();
        }
      }
    });

    return new Response(stream, {
      headers: {
        'Content-Type': 'audio/wav',
        // NOTE(review): setting Transfer-Encoding manually is usually handled by
        // the runtime for streamed bodies — confirm the server framework expects this.
        'Transfer-Encoding': 'chunked',
        'Access-Control-Allow-Origin': '*',
      },
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    console.error('Piper TTS API error:', message);
    return errorResponse(500, message);
  }
};
|
|
|
/** |
|
* Synthesize speech using Wyoming protocol |
|
* Protocol flow (standard): |
|
* 1. Send: {"type":"synthesize","data":{"text":"..."}}\n |
|
* 2. Receive format: {"rate":22050,"width":2,"channels":1}\n |
|
* 3. Receive raw binary audio (no delimiters) |
|
* 4. Optionally receive: {"type":"done"}\n or connection closes |
|
* |
|
* Some implementations may send audio-chunk messages: |
|
* - {"type":"audio-chunk","payload_length":N}\n followed by N bytes of binary audio |
|
* - These may arrive before or after the format message |
|
* - We handle both standard and audio-chunk variants for compatibility |
|
*/ |
|
async function synthesizeWithWyoming( |
|
config: { hostname: string; port: number }, |
|
text: string, |
|
voice: string | undefined, |
|
speed: number | undefined, |
|
abortSignal: AbortSignal, |
|
onCleanup: (cleanup: () => void) => void, |
|
onChunk: (chunk: Uint8Array, format?: { rate: number; width: number; channels: number }) => void |
|
): Promise<void> { |
|
const net = await import('net'); |
|
|
|
return new Promise<void>((resolve, reject) => { |
|
let socket: import('net').Socket | null = null; |
|
let buffer = Buffer.alloc(0); |
|
let audioFormat: { rate: number; width: number; channels: number } | null = null; |
|
let hasReceivedAudio = false; |
|
let isResolved = false; |
|
let lastDataTime = Date.now(); |
|
let completionTimer: NodeJS.Timeout | null = null; |
|
const preFormatAudioChunks: Uint8Array[] = []; // Buffer audio chunks received before format |
|
let hasProcessedAudioChunks = false; // Track if we've processed audio-chunk messages |
|
|
|
piperDebug('Wyoming: Creating TCP connection to', config.hostname, 'port', config.port); |
|
|
|
const cleanup = () => { |
|
if (socket && !socket.destroyed) { |
|
piperDebug('Wyoming: Cleaning up TCP connection'); |
|
socket.destroy(); |
|
} |
|
}; |
|
|
|
// Register cleanup function |
|
onCleanup(cleanup); |
|
|
|
// Check if already aborted |
|
if (abortSignal.aborted) { |
|
piperDebug('Wyoming: Abort signal already set, not connecting'); |
|
reject(new Error('Operation cancelled')); |
|
return; |
|
} |
|
|
|
// Listen for abort signal |
|
const abortHandler = () => { |
|
piperDebug('Wyoming: Abort signal received, cleaning up'); |
|
if (completionTimer) { |
|
clearTimeout(completionTimer); |
|
completionTimer = null; |
|
} |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
reject(new Error('Operation cancelled')); |
|
} |
|
}; |
|
abortSignal.addEventListener('abort', abortHandler); |
|
|
|
const timeout = setTimeout(() => { |
|
cleanup(); |
|
if (!isResolved) { |
|
isResolved = true; |
|
console.error('Wyoming: Timeout after 5 minutes'); |
|
reject(new Error('Wyoming protocol timeout')); |
|
} |
|
}, 300000); // 5 minutes |
|
|
|
try { |
|
socket = net.createConnection(config.port, config.hostname, () => { |
|
piperDebug('Wyoming: TCP connected successfully'); |
|
// Send synthesize request |
|
// Wyoming protocol expects voice as an object with 'name' property, not a plain string |
|
const message = { |
|
type: 'synthesize', |
|
data: { |
|
text, |
|
...(voice ? { voice: { name: voice } } : {}), |
|
...(speed !== undefined && speed !== 1.0 ? { speed } : {}), |
|
} |
|
}; |
|
const messageStr = JSON.stringify(message) + '\n'; |
|
piperDebug( |
|
'Wyoming: Sending synthesize message, text length:', |
|
text.length, |
|
'voice:', |
|
voice ? `{name: "${voice}"}` : 'none (will use default)' |
|
); |
|
piperDebug('Wyoming: Full message:', messageStr.trim()); |
|
try { |
|
socket!.write(messageStr); |
|
piperDebug('Wyoming: Synthesize message sent'); |
|
} catch (writeError) { |
|
console.error('Wyoming: Failed to write message:', writeError); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
reject(new Error(`Failed to send message: ${writeError instanceof Error ? writeError.message : String(writeError)}`)); |
|
} |
|
} |
|
}); |
|
} catch (error) { |
|
console.error('Wyoming: Failed to create connection:', error); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
reject(new Error(`Failed to create connection: ${error instanceof Error ? error.message : String(error)}`)); |
|
} |
|
return; |
|
} |
|
|
|
socket.on('data', (data: Buffer) => { |
|
// Check if aborted |
|
if (abortSignal.aborted) { |
|
console.log('Wyoming: Aborted, ignoring data'); |
|
return; |
|
} |
|
|
|
lastDataTime = Date.now(); |
|
|
|
// Clear completion timer since we're receiving data |
|
if (completionTimer) { |
|
clearTimeout(completionTimer); |
|
completionTimer = null; |
|
} |
|
|
|
console.log('Wyoming: Received data, size:', data.length, 'bytes, audioFormat:', audioFormat ? 'received' : 'not received'); |
|
buffer = Buffer.concat([buffer, data]); |
|
|
|
// Process buffer |
|
while (buffer.length > 0) { |
|
// Check if aborted during processing |
|
if (abortSignal.aborted) { |
|
console.log('Wyoming: Aborted during buffer processing'); |
|
break; |
|
} |
|
|
|
// After format received, check for "done" message, audio-chunk messages, or process as raw audio |
|
if (audioFormat) { |
|
// Check if buffer starts with JSON (for done/error/audio-chunk messages) |
|
if (buffer.length > 0 && buffer[0] === 0x7b) { // '{' byte |
|
const newlineIndex = buffer.indexOf('\n'); |
|
if (newlineIndex !== -1) { |
|
try { |
|
const line = buffer.subarray(0, newlineIndex).toString('utf8').trim(); |
|
const message = JSON.parse(line); |
|
|
|
if (message.type === 'done') { |
|
console.log('Wyoming: Received done message'); |
|
if (completionTimer) { |
|
clearTimeout(completionTimer); |
|
completionTimer = null; |
|
} |
|
buffer = buffer.subarray(newlineIndex + 1); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
resolve(); |
|
} |
|
return; |
|
} |
|
|
|
if (message.type === 'error') { |
|
console.error('Wyoming: Received error message:', message.message); |
|
buffer = buffer.subarray(newlineIndex + 1); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
reject(new Error(message.message || 'Wyoming protocol error')); |
|
} |
|
return; |
|
} |
|
|
|
if (message.type === 'audio-stop') { |
|
console.log('Wyoming: Received audio-stop message'); |
|
buffer = buffer.subarray(newlineIndex + 1); |
|
if (completionTimer) { |
|
clearTimeout(completionTimer); |
|
completionTimer = null; |
|
} |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
resolve(); |
|
} |
|
return; |
|
} |
|
|
|
// Handle audio-chunk messages after format |
|
if (message.type === 'audio-chunk' && typeof message.payload_length === 'number') { |
|
const payloadLength = message.payload_length; |
|
const messageEnd = newlineIndex + 1; |
|
// If data_length is specified, there's additional JSON data before the payload |
|
const dataLength = typeof message.data_length === 'number' ? message.data_length : 0; |
|
const payloadStart = messageEnd + dataLength; |
|
const payloadEnd = payloadStart + payloadLength; |
|
|
|
if (buffer.length >= payloadEnd) { |
|
const audioPayload = new Uint8Array(buffer.subarray(payloadStart, payloadEnd)); |
|
onChunk(audioPayload); |
|
hasReceivedAudio = true; |
|
hasProcessedAudioChunks = true; |
|
buffer = buffer.subarray(payloadEnd); |
|
continue; // Continue processing loop |
|
} else { |
|
// Don't have full payload yet - wait for more data |
|
break; |
|
} |
|
} |
|
} catch (error) { |
|
// Not valid JSON - treat as raw audio |
|
// Fall through to raw audio processing |
|
} |
|
} else { |
|
// No newline yet - might be incomplete JSON, wait for more data |
|
break; |
|
} |
|
} |
|
|
|
// No JSON message found - process all buffer as raw audio |
|
if (buffer.length > 0) { |
|
onChunk(new Uint8Array(buffer)); |
|
hasReceivedAudio = true; |
|
buffer = Buffer.alloc(0); |
|
} |
|
|
|
// If we've received audio and buffer is empty, set a completion timer |
|
// This handles cases where the server doesn't send "done" or close connection |
|
if (hasReceivedAudio && buffer.length === 0 && !completionTimer) { |
|
completionTimer = setTimeout(() => { |
|
if (!isResolved && hasReceivedAudio && !abortSignal.aborted) { |
|
console.log('Wyoming: No data received for 500ms after audio, assuming completion'); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
isResolved = true; |
|
resolve(); |
|
} |
|
}, 500); // 500ms timeout after last data |
|
} |
|
|
|
// Break and wait for more data (could be more audio or "done" message) |
|
break; |
|
} |
|
|
|
// Before format: scan buffer for JSON format message |
|
// Look for '{' followed by newline-delimited JSON |
|
let formatFound = false; |
|
let searchStart = 0; |
|
|
|
while (searchStart < buffer.length && !formatFound) { |
|
const braceIndex = buffer.indexOf(0x7b, searchStart); // '{' byte |
|
if (braceIndex === -1) { |
|
// No more '{' found - this is all binary data, buffer it |
|
break; |
|
} |
|
|
|
// Look for newline after this '{' |
|
const newlineIndex = buffer.indexOf('\n', braceIndex); |
|
if (newlineIndex === -1) { |
|
// No newline yet - wait for more data |
|
break; |
|
} |
|
|
|
// Try to parse as JSON |
|
const lineBytes = buffer.subarray(braceIndex, newlineIndex); |
|
const line = lineBytes.toString('utf8').trim(); |
|
|
|
if (line.endsWith('}')) { |
|
try { |
|
const message = JSON.parse(line); |
|
console.log('Wyoming: Received message:', JSON.stringify(message)); |
|
|
|
// Check for audio-start message (contains format info) |
|
if (message.type === 'audio-start' && (message.rate !== undefined || message.channels !== undefined)) { |
|
audioFormat = { |
|
rate: message.rate, |
|
width: message.width || 2, |
|
channels: message.channels, |
|
}; |
|
console.log('Wyoming: Audio format from audio-start:', audioFormat); |
|
|
|
// Send format notification |
|
onChunk(new Uint8Array(0), audioFormat); |
|
|
|
// Process any buffered audio chunks |
|
if (preFormatAudioChunks.length > 0) { |
|
console.log('Wyoming: Processing', preFormatAudioChunks.length, 'buffered audio chunks after audio-start'); |
|
for (const chunk of preFormatAudioChunks) { |
|
onChunk(chunk); |
|
hasReceivedAudio = true; |
|
} |
|
preFormatAudioChunks.length = 0; |
|
hasProcessedAudioChunks = true; |
|
} |
|
|
|
buffer = buffer.subarray(newlineIndex + 1); |
|
searchStart = 0; |
|
continue; |
|
} |
|
|
|
// Check for format message (can be a standalone format object or embedded in other messages) |
|
if (message.rate !== undefined || message.channels !== undefined) { |
|
audioFormat = { |
|
rate: message.rate, |
|
width: message.width || 2, |
|
channels: message.channels, |
|
}; |
|
console.log('Wyoming: Audio format:', audioFormat); |
|
|
|
// Remove everything up to and including the format message |
|
const dataAfterFormat = buffer.subarray(newlineIndex + 1); |
|
|
|
// Send format notification first |
|
onChunk(new Uint8Array(0), audioFormat); |
|
|
|
// Process any buffered audio chunks received before format |
|
if (preFormatAudioChunks.length > 0) { |
|
console.log('Wyoming: Processing', preFormatAudioChunks.length, 'buffered audio chunks'); |
|
for (const chunk of preFormatAudioChunks) { |
|
onChunk(chunk); |
|
hasReceivedAudio = true; |
|
} |
|
preFormatAudioChunks.length = 0; // Clear the buffer |
|
hasProcessedAudioChunks = true; |
|
} |
|
|
|
// Process any raw data before format as audio (protocol violation, but handle it) |
|
// BUT: Skip this if we've already processed audio-chunk messages, as that data |
|
// is likely protocol overhead or corrupted, not actual audio |
|
if (braceIndex > 0 && !hasProcessedAudioChunks) { |
|
const preFormatData = buffer.subarray(0, braceIndex); |
|
// Only process if it's not empty and looks like audio (not JSON) |
|
// Also check that it's a reasonable size (not just a few bytes of protocol overhead) |
|
if (preFormatData.length > 0 && preFormatData[0] !== 0x7b && preFormatData.length > 100) { |
|
console.warn('Wyoming: Processing', braceIndex, 'bytes of raw data received before format message as audio'); |
|
onChunk(new Uint8Array(preFormatData)); |
|
hasReceivedAudio = true; |
|
} else if (preFormatData.length > 0 && preFormatData.length <= 100) { |
|
console.warn('Wyoming: Skipping', preFormatData.length, 'bytes of data before format (likely protocol overhead)'); |
|
} |
|
} else if (braceIndex > 0 && hasProcessedAudioChunks) { |
|
console.warn('Wyoming: Skipping', braceIndex, 'bytes of data before format (audio-chunk messages already processed)'); |
|
} |
|
|
|
// Process data after format as audio |
|
if (dataAfterFormat.length > 0) { |
|
onChunk(new Uint8Array(dataAfterFormat)); |
|
hasReceivedAudio = true; |
|
} |
|
|
|
buffer = Buffer.alloc(0); |
|
formatFound = true; |
|
continue; // Continue processing loop |
|
} |
|
|
|
// Check for done/error messages |
|
if (message.type === 'done') { |
|
console.log('Wyoming: Received done message'); |
|
buffer = buffer.subarray(newlineIndex + 1); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
if (hasReceivedAudio) { |
|
resolve(); |
|
} else { |
|
reject(new Error('No audio data received')); |
|
} |
|
} |
|
return; |
|
} |
|
|
|
if (message.type === 'error') { |
|
console.error('Wyoming: Received error message:', message.message); |
|
buffer = buffer.subarray(newlineIndex + 1); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
reject(new Error(message.message || 'Wyoming protocol error')); |
|
} |
|
return; |
|
} |
|
|
|
// Handle audio-stop message |
|
if (message.type === 'audio-stop') { |
|
console.log('Wyoming: Received audio-stop message'); |
|
buffer = buffer.subarray(newlineIndex + 1); |
|
|
|
// If we have buffered audio chunks but no format, use default format |
|
if (preFormatAudioChunks.length > 0 && !audioFormat) { |
|
console.warn('Wyoming: Format message never received, using default format for', preFormatAudioChunks.length, 'buffered chunks'); |
|
// Default Piper TTS format: 22050 Hz, 16-bit (width=2), mono (channels=1) |
|
audioFormat = { |
|
rate: 22050, |
|
width: 2, |
|
channels: 1, |
|
}; |
|
console.log('Wyoming: Using default audio format:', audioFormat); |
|
|
|
// Send format notification |
|
onChunk(new Uint8Array(0), audioFormat); |
|
|
|
// Process buffered chunks |
|
for (const chunk of preFormatAudioChunks) { |
|
onChunk(chunk); |
|
hasReceivedAudio = true; |
|
} |
|
preFormatAudioChunks.length = 0; |
|
hasProcessedAudioChunks = true; |
|
} |
|
|
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
if (hasReceivedAudio) { |
|
resolve(); |
|
} else { |
|
reject(new Error('No audio data received')); |
|
} |
|
} |
|
return; |
|
} |
|
|
|
// Handle audio-chunk messages |
|
if (message.type === 'audio-chunk' && typeof message.payload_length === 'number') { |
|
const payloadLength = message.payload_length; |
|
const messageEnd = newlineIndex + 1; |
|
// If data_length is specified, there's additional JSON data before the payload |
|
const dataLength = typeof message.data_length === 'number' ? message.data_length : 0; |
|
const payloadStart = messageEnd + dataLength; |
|
const payloadEnd = payloadStart + payloadLength; |
|
|
|
console.log('Wyoming: Processing audio-chunk, payload_length:', payloadLength, 'data_length:', dataLength, 'buffer length:', buffer.length, 'payloadStart:', payloadStart, 'payloadEnd:', payloadEnd); |
|
|
|
// Check if we have the full payload |
|
if (buffer.length >= payloadEnd) { |
|
// If there's data_length, try to parse the format from that data |
|
if (dataLength > 0 && !audioFormat) { |
|
const dataBytes = buffer.subarray(messageEnd, payloadStart); |
|
try { |
|
const dataStr = dataBytes.toString('utf8'); |
|
const formatData = JSON.parse(dataStr); |
|
if (formatData.rate !== undefined || formatData.channels !== undefined) { |
|
audioFormat = { |
|
rate: formatData.rate, |
|
width: formatData.width || 2, |
|
channels: formatData.channels, |
|
}; |
|
console.log('Wyoming: Found format in data section:', audioFormat); |
|
onChunk(new Uint8Array(0), audioFormat); |
|
} |
|
} catch (e) { |
|
console.warn('Wyoming: Failed to parse data section as JSON:', e); |
|
} |
|
} |
|
|
|
// Extract the audio payload (after the data section) |
|
const audioPayload = new Uint8Array(buffer.subarray(payloadStart, payloadEnd)); |
|
console.log('Wyoming: Extracted audio payload:', audioPayload.length, 'bytes, first 8 bytes:', Array.from(audioPayload.slice(0, 8)).map(b => '0x' + b.toString(16).padStart(2, '0')).join(' ')); |
|
|
|
// Check if format is embedded in the audio-chunk message itself |
|
if (!audioFormat && (message.rate !== undefined || message.channels !== undefined)) { |
|
audioFormat = { |
|
rate: message.rate || 22050, |
|
width: message.width || 2, |
|
channels: message.channels || 1, |
|
}; |
|
console.log('Wyoming: Found format in audio-chunk message:', audioFormat); |
|
onChunk(new Uint8Array(0), audioFormat); |
|
} |
|
|
|
// If we have format, process it as audio; otherwise buffer it |
|
if (audioFormat) { |
|
onChunk(audioPayload); |
|
hasReceivedAudio = true; |
|
hasProcessedAudioChunks = true; |
|
} else { |
|
// Buffer audio chunks until we get format |
|
preFormatAudioChunks.push(audioPayload); |
|
console.log('Wyoming: Buffering audio-chunk payload of', payloadLength, 'bytes (format not yet received)'); |
|
hasProcessedAudioChunks = true; // Mark that we've seen audio-chunk messages |
|
} |
|
|
|
// Remove the message and payload from buffer |
|
buffer = buffer.subarray(payloadEnd); |
|
searchStart = 0; // Reset search to start of buffer |
|
continue; |
|
} else { |
|
// Don't have full payload yet - wait for more data |
|
console.log('Wyoming: Waiting for more data, need', payloadEnd, 'have', buffer.length); |
|
break; |
|
} |
|
} |
|
|
|
// Other JSON message - skip it and continue searching |
|
searchStart = newlineIndex + 1; |
|
} catch (error) { |
|
// Not valid JSON - continue searching |
|
searchStart = braceIndex + 1; |
|
} |
|
} else { |
|
// Incomplete JSON - continue searching |
|
searchStart = braceIndex + 1; |
|
} |
|
} |
|
|
|
// If we found format, continue processing; otherwise wait for more data |
|
if (!formatFound) { |
|
break; |
|
} |
|
} |
|
}); |
|
|
|
socket.on('error', (error: Error) => { |
|
if (isWyomingUnreachableMessage(error.message)) { |
|
logWyomingUnreachableThrottled(config.hostname, config.port, error.message); |
|
if (PIPER_TTS_DEBUG) console.error('Wyoming: TCP error:', error.message); |
|
} else { |
|
console.error('Wyoming: TCP error:', error.message); |
|
} |
|
abortSignal.removeEventListener('abort', abortHandler); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
if (!isResolved) { |
|
isResolved = true; |
|
reject(new Error(`TCP error: ${error.message}`)); |
|
} |
|
}); |
|
|
|
socket.on('close', () => { |
|
piperDebug( |
|
'Wyoming: Connection closed, hasReceivedAudio:', |
|
hasReceivedAudio, |
|
'buffer length:', |
|
buffer.length, |
|
'buffered chunks:', |
|
preFormatAudioChunks.length |
|
); |
|
if (completionTimer) { |
|
clearTimeout(completionTimer); |
|
completionTimer = null; |
|
} |
|
abortSignal.removeEventListener('abort', abortHandler); |
|
cleanup(); |
|
clearTimeout(timeout); |
|
|
|
// If we have buffered audio chunks but no format, use default format |
|
if (!abortSignal.aborted && preFormatAudioChunks.length > 0 && !audioFormat) { |
|
console.warn('Wyoming: Format message never received before connection close, using default format for', preFormatAudioChunks.length, 'buffered chunks'); |
|
// Default Piper TTS format: 22050 Hz, 16-bit (width=2), mono (channels=1) |
|
audioFormat = { |
|
rate: 22050, |
|
width: 2, |
|
channels: 1, |
|
}; |
|
console.log('Wyoming: Using default audio format:', audioFormat); |
|
|
|
// Send format notification |
|
onChunk(new Uint8Array(0), audioFormat); |
|
|
|
// Process buffered chunks |
|
for (const chunk of preFormatAudioChunks) { |
|
onChunk(chunk); |
|
hasReceivedAudio = true; |
|
} |
|
preFormatAudioChunks.length = 0; |
|
hasProcessedAudioChunks = true; |
|
} |
|
|
|
// Only process remaining buffer if not aborted |
|
if (!abortSignal.aborted && buffer.length > 0 && audioFormat) { |
|
console.log('Wyoming: Streaming remaining buffer:', buffer.length, 'bytes'); |
|
onChunk(new Uint8Array(buffer)); |
|
hasReceivedAudio = true; |
|
} |
|
|
|
if (!isResolved) { |
|
isResolved = true; |
|
if (abortSignal.aborted) { |
|
piperDebug('Wyoming: Connection closed after abort'); |
|
reject(new Error('Operation cancelled')); |
|
} else if (hasReceivedAudio) { |
|
piperDebug('Wyoming: Resolving - audio received'); |
|
resolve(); |
|
} else { |
|
piperDebug('Wyoming: Rejecting - no audio received'); |
|
reject(new Error('Connection closed without audio data')); |
|
} |
|
} |
|
}); |
|
}); |
|
} |
|
|
|
function getTcpConfig(): { hostname: string; port: number } { |
|
// Allow override via environment variable |
|
const piperHost = process.env.PIPER_TTS_HOST || process.env.PIPER_HOST; |
|
const piperPort = process.env.PIPER_TTS_PORT || process.env.PIPER_PORT; |
|
|
|
if (piperHost && piperPort) { |
|
return { |
|
hostname: piperHost, |
|
port: parseInt(piperPort, 10), |
|
}; |
|
} |
|
|
|
// Default: use Docker service name in production, localhost in development |
|
const isDevelopment = process.env.NODE_ENV === 'development'; |
|
return { |
|
hostname: isDevelopment ? 'localhost' : 'piper-tts', |
|
port: 10200, |
|
}; |
|
} |
|
|
|
function createWavHeader(sampleRate: number, bytesPerSample: number, channels: number, dataSize: number): Uint8Array { |
|
const header = new ArrayBuffer(44); |
|
const view = new DataView(header); |
|
|
|
// RIFF header |
|
view.setUint8(0, 0x52); // 'R' |
|
view.setUint8(1, 0x49); // 'I' |
|
view.setUint8(2, 0x46); // 'F' |
|
view.setUint8(3, 0x46); // 'F' |
|
view.setUint32(4, 36 + dataSize, true); // File size - 8 |
|
|
|
// WAVE header |
|
view.setUint8(8, 0x57); // 'W' |
|
view.setUint8(9, 0x41); // 'A' |
|
view.setUint8(10, 0x56); // 'V' |
|
view.setUint8(11, 0x45); // 'E' |
|
|
|
// fmt chunk |
|
view.setUint8(12, 0x66); // 'f' |
|
view.setUint8(13, 0x6D); // 'm' |
|
view.setUint8(14, 0x74); // 't' |
|
view.setUint8(15, 0x20); // ' ' |
|
view.setUint32(16, 16, true); // fmt chunk size |
|
view.setUint16(20, 1, true); // Audio format (1 = PCM) |
|
view.setUint16(22, channels, true); // Number of channels |
|
view.setUint32(24, sampleRate, true); // Sample rate |
|
view.setUint32(28, sampleRate * channels * bytesPerSample, true); // Byte rate |
|
view.setUint16(32, channels * bytesPerSample, true); // Block align |
|
view.setUint16(34, bytesPerSample * 8, true); // Bits per sample |
|
|
|
// data chunk |
|
view.setUint8(36, 0x64); // 'd' |
|
view.setUint8(37, 0x61); // 'a' |
|
view.setUint8(38, 0x74); // 't' |
|
view.setUint8(39, 0x61); // 'a' |
|
view.setUint32(40, dataSize, true); // Data size |
|
|
|
return new Uint8Array(header); |
|
} |
|
|
|
function filterCryptographicContent(text: string): string { |
|
let filtered = text; |
|
|
|
// Remove URLs |
|
filtered = filtered.replace(/https?:\/\/[^\s]+/gi, ''); |
|
filtered = filtered.replace(/www\.[^\s]+/gi, ''); |
|
|
|
// Remove Nostr URIs and bech32 addresses |
|
filtered = filtered.replace(/nostr:[^\s]+/gi, ''); |
|
filtered = filtered.replace(/\b(npub|note|nevent|naddr|nprofile|nsec|ncryptsec)1[a-z0-9]{20,}\b/gi, ''); |
|
|
|
// Remove hex strings |
|
filtered = filtered.replace(/\b[0-9a-f]{64}\b/gi, ''); |
|
filtered = filtered.replace(/\b[0-9a-f]{32,63}\b/gi, ''); |
|
|
|
// Remove emojis |
|
filtered = filtered.replace(/[\u{1F300}-\u{1F9FF}]/gu, ''); |
|
filtered = filtered.replace(/[\u{1F600}-\u{1F64F}]/gu, ''); |
|
filtered = filtered.replace(/[\u{2600}-\u{26FF}]/gu, ''); |
|
filtered = filtered.replace(/[\u{2700}-\u{27BF}]/gu, ''); |
|
|
|
// Remove markdown and asciidoc markup |
|
|
|
// Code blocks (markdown and asciidoc) |
|
filtered = filtered.replace(/```[\s\S]*?```/g, ''); |
|
filtered = filtered.replace(/`[^`]+`/g, ''); |
|
filtered = filtered.replace(/----[\s\S]*?----/g, ''); // AsciiDoc code blocks |
|
filtered = filtered.replace(/\[source[^\]]*\][\s\S]*?----/g, ''); // AsciiDoc source blocks |
|
|
|
// Headers (markdown and asciidoc) |
|
filtered = filtered.replace(/^#+\s+/gm, ''); // Markdown headers at start of line |
|
filtered = filtered.replace(/\s+#+\s+/g, ' '); // Markdown headers in middle of text |
|
filtered = filtered.replace(/^=+\s*$/gm, ''); // AsciiDoc headers (single line) |
|
filtered = filtered.replace(/^=+\s+/gm, ''); // AsciiDoc headers at start of line |
|
filtered = filtered.replace(/\s+=+\s+/g, ' '); // AsciiDoc headers in middle of text |
|
|
|
// Links (markdown and asciidoc) |
|
filtered = filtered.replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1'); // Markdown links |
|
filtered = filtered.replace(/\[\[([^\]]+)\]\]/g, '$1'); // AsciiDoc links |
|
filtered = filtered.replace(/link:([^\[]+)\[([^\]]+)\]/g, '$2'); // AsciiDoc link: syntax |
|
|
|
// Images (markdown and asciidoc) |
|
filtered = filtered.replace(/!\[([^\]]*)\]\([^\)]+\)/g, ''); // Markdown images |
|
filtered = filtered.replace(/image::?[^\[]+\[([^\]]*)\]/g, '$1'); // AsciiDoc images |
|
|
|
// Emphasis and formatting |
|
filtered = filtered.replace(/\*\*([^*]+)\*\*/g, '$1'); // Bold markdown |
|
filtered = filtered.replace(/\*([^*]+)\*/g, '$1'); // Italic markdown |
|
filtered = filtered.replace(/__([^_]+)__/g, '$1'); // Bold markdown (underscore) |
|
filtered = filtered.replace(/_([^_]+)_/g, '$1'); // Italic markdown (underscore) |
|
filtered = filtered.replace(/\*\*([^*]+)\*\*/g, '$1'); // Bold asciidoc |
|
filtered = filtered.replace(/\*([^*]+)\*/g, '$1'); // Italic asciidoc |
|
filtered = filtered.replace(/\+\+([^+]+)\+\+/g, '$1'); // Monospace asciidoc |
|
filtered = filtered.replace(/~~([^~]+)~~/g, '$1'); // Strikethrough markdown |
|
|
|
// Lists (markdown and asciidoc) |
|
filtered = filtered.replace(/^[\*\-\+]\s+/gm, ''); // Markdown unordered lists |
|
filtered = filtered.replace(/^\d+\.\s+/gm, ''); // Markdown ordered lists |
|
filtered = filtered.replace(/^\.\s+/gm, ''); // AsciiDoc unordered lists |
|
filtered = filtered.replace(/^\d+\.\s+/gm, ''); // AsciiDoc ordered lists |
|
|
|
// Blockquotes |
|
filtered = filtered.replace(/^>\s+/gm, ''); // Markdown blockquotes |
|
filtered = filtered.replace(/^\[quote[^\]]*\][\s\S]*?\[quote\]/g, ''); // AsciiDoc quotes |
|
|
|
// Horizontal rules |
|
filtered = filtered.replace(/^[-*_]{3,}\s*$/gm, ''); // Markdown horizontal rules |
|
filtered = filtered.replace(/^'''+\s*$/gm, ''); // AsciiDoc horizontal rules |
|
|
|
// Tables (markdown and asciidoc) |
|
filtered = filtered.replace(/\|/g, ' '); // Remove table separators |
|
filtered = filtered.replace(/^\|.+\|\s*$/gm, ''); // Remove table rows |
|
filtered = filtered.replace(/^\[cols?=[^\]]*\][\s\S]*?\|===\s*$/gm, ''); // AsciiDoc tables |
|
|
|
// Other asciidoc syntax |
|
filtered = filtered.replace(/\[\[([^\]]+)\]\]/g, ''); // AsciiDoc anchors |
|
filtered = filtered.replace(/\[NOTE\]/gi, ''); |
|
filtered = filtered.replace(/\[TIP\]/gi, ''); |
|
filtered = filtered.replace(/\[WARNING\]/gi, ''); |
|
filtered = filtered.replace(/\[IMPORTANT\]/gi, ''); |
|
filtered = filtered.replace(/\[CAUTION\]/gi, ''); |
|
filtered = filtered.replace(/\[source[^\]]*\]/gi, ''); |
|
filtered = filtered.replace(/\[caption[^\]]*\]/gi, ''); |
|
|
|
// Clean up whitespace |
|
filtered = filtered.replace(/\s+/g, ' ').trim(); |
|
|
|
return filtered; |
|
} |
|
|
|
function splitIntoSentences(text: string): string[] { |
|
const cleaned = text |
|
.replace(/^#+\s+/gm, '') |
|
.replace(/\n+/g, ' ') |
|
.trim(); |
|
|
|
const sentences: string[] = []; |
|
const regex = /([.!?]+)\s+/g; |
|
let lastIndex = 0; |
|
let match; |
|
|
|
while ((match = regex.exec(cleaned)) !== null) { |
|
const sentence = cleaned.substring(lastIndex, match.index + match[1].length).trim(); |
|
if (sentence.length > 0) { |
|
sentences.push(sentence); |
|
} |
|
lastIndex = match.index + match[0].length; |
|
} |
|
|
|
const remaining = cleaned.substring(lastIndex).trim(); |
|
if (remaining.length > 0) { |
|
sentences.push(remaining); |
|
} |
|
|
|
return sentences.length > 0 ? sentences : [cleaned]; |
|
} |
|
|
|
function errorResponse(status: number, message: string): Response { |
|
return new Response(JSON.stringify({ error: message }), { |
|
status, |
|
headers: { 'Content-Type': 'application/json' }, |
|
}); |
|
} |
|
|
|
/** |
|
* Simple language detection based on character patterns |
|
* Returns language code (e.g., 'en', 'de', 'fr', 'es', etc.) |
|
*/ |
|
function detectLanguage(text: string): string { |
|
if (!text || text.length === 0) return 'en'; |
|
|
|
// Count character patterns to detect language |
|
const sample = text.substring(0, Math.min(500, text.length)); |
|
|
|
// German: ä, ö, ü, ß |
|
const germanChars = (sample.match(/[äöüßÄÖÜ]/g) || []).length; |
|
// French: é, è, ê, ç, à, etc. |
|
const frenchChars = (sample.match(/[éèêëàâäçôùûüÉÈÊËÀÂÄÇÔÙÛÜ]/g) || []).length; |
|
// Spanish: ñ, á, é, í, ó, ú, ¿, ¡ |
|
const spanishChars = (sample.match(/[ñáéíóúüÑÁÉÍÓÚÜ¿¡]/g) || []).length; |
|
// Italian: à, è, é, ì, ò, ù |
|
const italianChars = (sample.match(/[àèéìòùÀÈÉÌÒÙ]/g) || []).length; |
|
// Russian/Cyrillic |
|
const cyrillicChars = (sample.match(/[а-яёА-ЯЁ]/g) || []).length; |
|
// CJK scripts: Hangul / kana → English Piper (no ko/ja models); Han → Chinese when dominant. |
|
const hangulChars = (sample.match(/[\uac00-\ud7af]/g) || []).length; |
|
const kanaChars = (sample.match(/[\u3040-\u309f\u30a0-\u30ff]/g) || []).length; |
|
const hanChars = (sample.match(/[\u4e00-\u9fff]/g) || []).length; |
|
// Arabic |
|
const arabicChars = (sample.match(/[\u0600-\u06ff]/g) || []).length; |
|
|
|
// Calculate ratios |
|
const total = sample.length; |
|
const germanRatio = germanChars / total; |
|
const frenchRatio = frenchChars / total; |
|
const spanishRatio = spanishChars / total; |
|
const italianRatio = italianChars / total; |
|
const cyrillicRatio = cyrillicChars / total; |
|
const hangulRatio = hangulChars / total; |
|
const kanaRatio = kanaChars / total; |
|
const hanRatio = hanChars / total; |
|
const arabicRatio = arabicChars / total; |
|
|
|
// Detect based on highest ratio |
|
if (cyrillicRatio > 0.1) return 'ru'; |
|
if (hangulRatio > 0.06 || kanaRatio > 0.02) return 'en'; |
|
if (hanRatio > 0.1) return 'zh'; |
|
if (arabicRatio > 0.1) return 'ar'; |
|
if (germanRatio > 0.02) return 'de'; |
|
if (frenchRatio > 0.02) return 'fr'; |
|
if (spanishRatio > 0.02) return 'es'; |
|
if (italianRatio > 0.02) return 'it'; |
|
|
|
// Default to English |
|
return 'en'; |
|
} |
|
|
|
/** |
|
* Map language code to Piper voice name |
|
* Returns voice name (always returns a value, defaults to English) |
|
* Voice names follow pattern: {lang}_{locale}-{voice}-{quality} |
|
* |
|
* Note: These are common voice names. You may need to adjust based on |
|
* which voices are actually available in your piper-data directory. |
|
* To see available voices, check the piper-data folder or Wyoming server logs. |
|
*/ |
|
function getVoiceForLanguage(lang: string): string { |
|
// Voice map keys / ids: keep in sync with `src/lib/trinity-languages.ts` (`TRINITY_PIPER_VOICE`, `EXTRA_READ_ALOUD_PIPER_VOICE`). |
|
const voiceMap: Record<string, string> = { |
|
'en': 'en_US-lessac-medium', // Default English voice |
|
'en-gb': 'en_GB-alan-medium', // British English (rhasspy/piper-voices; install via scripts/download-piper-extra-voices.sh) |
|
'de': 'de_DE-thorsten-medium', // German |
|
'fr': 'fr_FR-siwis-medium', // French |
|
'es': 'es_ES-davefx-medium', // Spanish |
|
'it': 'it_IT-paola-medium', // Italian (rhasspy/piper-voices; install via scripts/download-piper-extra-voices.sh) |
|
'ru': 'ru_RU-ruslan-medium', // Russian |
|
'zh': 'zh_CN-huayan-medium', // Chinese |
|
'ar': 'ar_JO-kareem-medium', // Arabic (rhasspy/piper-voices; install via scripts/download-piper-extra-voices.sh) |
|
'pl': 'pl_PL-darkman-medium', // Polish |
|
'pt': 'pt_BR-cadu-medium', // Portuguese (BR; rhasspy/piper-voices; same script) |
|
'nl': 'nl_NL-mls-medium', // Dutch |
|
'cs': 'cs_CZ-jirka-medium', // Czech |
|
'tr': 'tr_TR-dfki-medium', // Turkish |
|
}; |
|
|
|
return voiceMap[lang] || voiceMap['en']; // Fall back to English |
|
}
|
|
|