16 changed files with 2358 additions and 283 deletions
@ -0,0 +1,179 @@
@@ -0,0 +1,179 @@
|
||||
# Deployment Guide |
||||
|
||||
This guide explains how to deploy aitherboard with Piper TTS on a remote server with Apache. |
||||
|
||||
## Architecture |
||||
|
||||
``` |
||||
Internet → Apache (443) → aitherboard container (9876) |
||||
↓ |
||||
piper-tts container (10200, internal) |
||||
``` |
||||
|
||||
- **Apache** proxies HTTP/WebSocket requests to the aitherboard container |
||||
- **aitherboard** container connects to **piper-tts** container via Docker internal networking |
||||
- **piper-tts** is not exposed to the host - only accessible from within Docker network |
||||
|
||||
## Docker Compose Setup |
||||
|
||||
The `docker-compose.yml` is ready to use as-is. It: |
||||
|
||||
1. Creates a Docker network (`aitherboard-network`) for container communication |
||||
2. Exposes aitherboard on port `9876` (for Apache to proxy to) |
||||
3. Keeps piper-tts internal (port `10200` only accessible from Docker network) |
||||
4. Uses service names (`piper-tts`) for internal communication |
||||
|
||||
## Apache Configuration |
||||
|
||||
Your Apache configuration should proxy to `localhost:9876` (where aitherboard runs): |
||||
|
||||
```apache |
||||
ProxyPreserveHost On |
||||
ProxyRequests Off |
||||
|
||||
# WebSocket upgrade handling - CRITICAL for Nostr apps |
||||
RewriteEngine On |
||||
RewriteCond %{HTTP:Upgrade} websocket [NC] |
||||
RewriteCond %{HTTP:Connection} upgrade [NC] |
||||
RewriteRule ^/?(.*) "ws://127.0.0.1:9876/$1" [P,L] |
||||
|
||||
# Regular HTTP proxy for static files and API calls (catch-all - MUST come LAST) |
||||
ProxyPass / http://127.0.0.1:9876/ |
||||
ProxyPassReverse / http://127.0.0.1:9876/ |
||||
|
||||
# Headers for WebSocket compatibility |
||||
ProxyAddHeaders On |
||||
Header always set X-Forwarded-Proto "https" |
||||
Header always set X-Forwarded-Port "443" |
||||
``` |
||||
|
||||
**Important**: Apache only needs to proxy to aitherboard. It does NOT need to route to piper-tts - that's handled internally by Docker. |
||||
|
||||
## Deployment Steps |
||||
|
||||
1. **Clone and prepare the repository:** |
||||
```bash |
||||
git clone <your-repo> |
||||
cd aitherboard |
||||
``` |
||||
|
||||
2. **Set up Wyoming Piper:** |
||||
```bash |
||||
./setup-wyoming-piper.sh |
||||
``` |
||||
|
||||
3. **Download voices (optional, but recommended):** |
||||
```bash |
||||
./download-voices.sh |
||||
``` |
||||
|
||||
4. **Start the containers:** |
||||
```bash |
||||
docker-compose up -d --build |
||||
``` |
||||
|
||||
5. **Verify containers are running:** |
||||
```bash |
||||
docker-compose ps |
||||
``` |
||||
|
||||
6. **Check logs if needed:** |
||||
```bash |
||||
docker-compose logs aitherboard |
||||
docker-compose logs piper-tts |
||||
``` |
||||
|
||||
## Container Communication |
||||
|
||||
The aitherboard container connects to piper-tts using: |
||||
|
||||
- **Hostname**: `piper-tts` (Docker service name) |
||||
- **Port**: `10200` (internal Docker network) |
||||
|
||||
This is configured via environment variables in `docker-compose.yml`: |
||||
- `PIPER_TTS_HOST=piper-tts` |
||||
- `PIPER_TTS_PORT=10200` |
||||
|
||||
You can override these if needed, but the defaults work for Docker Compose. |
||||
|
||||
## Network Flow |
||||
|
||||
1. **User request** → Apache (port 443) |
||||
2. **Apache** → aitherboard container (localhost:9876) |
||||
3. **aitherboard** → piper-tts container (piper-tts:10200 via Docker network) |
||||
4. **piper-tts** → returns audio to aitherboard |
||||
5. **aitherboard** → returns audio to Apache |
||||
6. **Apache** → returns audio to user |
||||
|
||||
## Troubleshooting |
||||
|
||||
### Piper TTS not working |
||||
|
||||
1. **Check if containers are on the same network:** |
||||
```bash |
||||
docker network inspect aitherboard_aitherboard-network |
||||
``` |
||||
|
||||
2. **Test connection from aitherboard to piper-tts:** |
||||
```bash |
||||
docker exec aitherboard ping piper-tts |
||||
``` |
||||
|
||||
3. **Check piper-tts logs:** |
||||
```bash |
||||
docker-compose logs piper-tts |
||||
``` |
||||
|
||||
4. **Verify voices are available:** |
||||
```bash |
||||
docker exec piper-tts ls -la /data/voices/ |
||||
``` |
||||
|
||||
### Apache can't connect to aitherboard |
||||
|
||||
1. **Check if aitherboard is listening:** |
||||
```bash |
||||
curl http://localhost:9876/healthz.json |
||||
``` |
||||
|
||||
2. **Check aitherboard logs:** |
||||
```bash |
||||
docker-compose logs aitherboard |
||||
``` |
||||
|
||||
3. **Verify port mapping:** |
||||
```bash |
||||
docker-compose ps |
||||
# Should show: 0.0.0.0:9876->9876/tcp |
||||
``` |
||||
|
||||
## Environment Variables |
||||
|
||||
You can customize the setup via environment variables in `docker-compose.yml`: |
||||
|
||||
- `PIPER_TTS_HOST`: Override Piper hostname (default: `piper-tts`) |
||||
- `PIPER_TTS_PORT`: Override Piper port (default: `10200`) |
||||
- `NODE_ENV`: Set to `production` (already set) |
||||
|
||||
## Security Notes |
||||
|
||||
- **piper-tts** is NOT exposed to the internet - only accessible from Docker network |
||||
- Only **aitherboard** port `9876` is exposed to the host |
||||
- Apache handles SSL/TLS termination |
||||
- All internal communication happens over Docker's bridge network |
||||
|
||||
## Updating |
||||
|
||||
To update the containers: |
||||
|
||||
```bash |
||||
docker-compose pull # If using pre-built images |
||||
docker-compose up -d --build # Rebuild and restart |
||||
``` |
||||
|
||||
To update voices: |
||||
|
||||
```bash |
||||
./download-voices.sh |
||||
docker-compose restart piper-tts # Restart to pick up new voices |
||||
``` |
||||
@ -0,0 +1,193 @@
@@ -0,0 +1,193 @@
|
||||
#!/bin/bash
# Script to download Piper TTS voices and place them in the correct structure
# Voices are downloaded from Hugging Face: https://huggingface.co/rhasspy/piper-voices

# Don't exit on error - we want to continue downloading other voices even if one fails
set +e

# Destination for all downloaded voices; this directory is mounted into the
# Docker container at /data (see the final message printed by this script).
PIPER_DATA_DIR="./piper-data"
# "resolve/main" serves raw file contents from the Hugging Face repository.
VOICES_BASE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main"

# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}Piper TTS Voice Downloader${NC}"
echo "================================"
echo ""

# Create piper-data directory if it doesn't exist
mkdir -p "$PIPER_DATA_DIR"
||||
# Download one Piper voice (model + config) from Hugging Face.
#
# Arguments:
#   $1 lang    - language code used in the repo path (e.g. "en")
#   $2 locale  - locale directory (e.g. "en_US")
#   $3 voice   - voice name (e.g. "lessac")
#   $4 quality - quality level (low | medium | high)
#
# Returns 0 on success (or when the voice is already present), 1 on any
# download failure. Partial downloads are removed so a retry starts clean.
download_voice() {
    local lang=$1
    local locale=$2
    local voice=$3
    local quality=$4

    local voice_name="${locale}-${voice}-${quality}"
    local voice_dir="${PIPER_DATA_DIR}/voices/${locale}/${voice}/${quality}"
    local onnx_file="${voice_dir}/${voice_name}.onnx"
    local json_file="${voice_dir}/${voice_name}.onnx.json"

    # Create directory structure
    mkdir -p "$voice_dir"

    # Skip voices that are already fully downloaded (both files present).
    if [ -f "$onnx_file" ] && [ -f "$json_file" ]; then
        echo -e "${YELLOW}Voice ${voice_name} already exists, skipping...${NC}"
        return 0
    fi

    echo "Downloading ${voice_name}..."

    # fetch_one <url> <dest> <label>
    # Download a single file; print diagnostics and return 1 on failure.
    # curl's -w format appends the HTTP status on its own line after any
    # stderr output (merged via 2>&1), so the last line of $output is the
    # status code and everything before it is error text. `sed '$d'` drops
    # the last line portably (GNU `head -n-1` is unavailable on BSD/macOS).
    # Declaration and assignment are split so the assignment does not mask
    # curl's exit status (SC2155).
    fetch_one() {
        local url=$1 dest=$2 label=$3
        local output http_code err_text
        output=$(curl -L -f -w "\n%{http_code}" -o "$dest" "$url" 2>&1)
        http_code=$(printf '%s\n' "$output" | tail -n1)
        err_text=$(printf '%s\n' "$output" | sed '$d')

        if [ "$http_code" = "200" ] && [ -f "$dest" ] && [ -s "$dest" ]; then
            return 0
        fi

        echo "  ✗ Failed to download ${label}"
        echo "    URL: ${url}"
        echo "    HTTP Code: ${http_code:-unknown}"
        if [ -n "$err_text" ]; then
            echo "    Error: $(printf '%s\n' "$err_text" | head -n1)"
        fi
        echo "    This quality level may not be available for this voice."
        return 1
    }

    local base_url="${VOICES_BASE_URL}/${lang}/${locale}/${voice}/${quality}"

    # Config (.onnx.json) first: it is tiny, so a missing quality level
    # fails fast before the large model download is attempted.
    if fetch_one "${base_url}/${voice_name}.onnx.json" "$json_file" "${voice_name}.onnx.json"; then
        echo "  ✓ Downloaded ${voice_name}.onnx.json"
    else
        rm -f "$json_file"
        return 1
    fi

    # Model (.onnx) second; on failure remove both files so the
    # already-exists check above never sees a half-downloaded voice.
    if ! fetch_one "${base_url}/${voice_name}.onnx" "$onnx_file" "${voice_name}.onnx"; then
        rm -f "$onnx_file" "$json_file"
        return 1
    fi

    # Report the model size; `stat -c` is GNU-specific, hence the fallbacks.
    local file_size file_size_mb
    file_size=$(stat -c%s "$onnx_file" 2>/dev/null || echo "0")
    file_size_mb=$(echo "scale=2; $file_size / 1024 / 1024" | bc 2>/dev/null || echo "?")
    echo "  ✓ Downloaded ${voice_name}.onnx (${file_size_mb} MB)"

    echo -e "${GREEN}  ✓ Successfully downloaded ${voice_name}${NC}"
    return 0
}
||||
|
||||
# Default download list (mirrors the voices used by the language detection
# function). Each entry is a whitespace-separated tuple:
#   language_code locale voice quality
VOICES=(
    # English (US) - all quality levels
    "en en_US lessac low"
    "en en_US lessac medium"
    "en en_US lessac high"
    # English (GB)
    "en en_GB alba medium"

    # German
    "de de_DE thorsten medium"
    "de de_DE thorsten low"

    # French
    "fr fr_FR siwis medium"
    "fr fr_FR siwis low"

    # Spanish
    "es es_ES davefx medium"
    # Note: es_ES-davefx-low doesn't exist

    # Italian - riccardo doesn't exist, removing
    # "it it_IT riccardo medium" - not available
    # "it it_IT riccardo low" - not available

    # Russian
    "ru ru_RU ruslan medium"
    # Note: ru_RU-ruslan-low doesn't exist

    # Chinese
    "zh zh_CN huayan medium"

    # Arabic - hafez doesn't exist, removing
    # "ar ar_SA hafez medium" - not available

    # Polish
    "pl pl_PL darkman medium"

    # Portuguese - edresson doesn't exist, removing
    # "pt pt_BR edresson medium" - not available

    # Dutch
    "nl nl_NL mls medium"

    # Czech
    "cs cs_CZ jirka medium"

    # Turkish
    "tr tr_TR dfki medium"

    # Japanese - nanami doesn't exist, removing
    # "ja ja_JP nanami medium" - not available

    # Korean - kyungha doesn't exist, removing
    # "ko ko_KR kyungha medium" - not available
)

# Any command-line arguments replace the default list entirely.
[ $# -gt 0 ] && VOICES=("$@")

echo "Downloading ${#VOICES[@]} voice(s)..."
echo ""

SUCCESS=0
FAILED=0

for spec in "${VOICES[@]}"; do
    # Split the tuple into its four fields.
    read -r spec_lang spec_locale spec_voice spec_quality <<< "$spec"

    # Reject specifications that are missing any field.
    if [ -z "$spec_lang" ] || [ -z "$spec_locale" ] || [ -z "$spec_voice" ] || [ -z "$spec_quality" ]; then
        echo -e "${YELLOW}⚠ Skipping invalid voice specification: '${spec}'${NC}"
        FAILED=$((FAILED + 1))
        echo ""
        continue
    fi

    if download_voice "$spec_lang" "$spec_locale" "$spec_voice" "$spec_quality"; then
        SUCCESS=$((SUCCESS + 1))
    else
        FAILED=$((FAILED + 1))
    fi
    echo ""
done

echo "================================"
echo -e "${GREEN}Download complete!${NC}"
echo "Successfully downloaded: $SUCCESS"
if [ "$FAILED" -gt 0 ]; then
    echo -e "${YELLOW}Failed: $FAILED${NC}"
fi
echo ""
echo "Voices are now in: $PIPER_DATA_DIR"
echo "This directory is mounted into the Docker container at /data"
echo ""
echo "To use these voices, restart your Docker containers:"
echo "  docker-compose down"
echo "  docker-compose up --build"
||||
@ -0,0 +1,19 @@
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
# Setup script to clone and prepare Wyoming Piper from official source

set -e

readonly TARGET_DIR="./wyoming-piper"

# Fresh checkout when the directory is absent; otherwise fast-forward it.
# `git -C` runs the pull inside the repo without changing our own cwd.
if [ ! -d "$TARGET_DIR" ]; then
    echo "Cloning Wyoming Piper from official repository..."
    git clone https://github.com/rhasspy/wyoming-piper.git "$TARGET_DIR"
else
    echo "Wyoming Piper directory already exists. Updating..."
    git -C "$TARGET_DIR" pull
fi

echo "Wyoming Piper setup complete!"
echo "You can now run: docker-compose up --build"
||||
@ -0,0 +1,995 @@
@@ -0,0 +1,995 @@
|
||||
import type { RequestHandler } from './$types'; |
||||
|
||||
/**
 * JSON body accepted by the POST endpoint.
 */
interface TTSRequest {
  /** Text to synthesize. Rejected with 400 when empty after trimming. */
  text: string;
  /** Piper voice name; when omitted/blank the endpoint auto-detects the language and picks a voice. */
  voice?: string;
  /** Speed multiplier; presumably 1.0 is the server default — it is not forwarded when equal to 1.0. */
  speed?: number;
}
||||
|
||||
/** |
||||
* Proxy endpoint for Piper TTS using Wyoming protocol (TCP) |
||||
* Wyoming protocol: JSON messages newline-delimited, then raw binary audio |
||||
*/ |
||||
export const POST: RequestHandler = async ({ request }) => { |
||||
console.log('Piper TTS API: Request received'); |
||||
try { |
||||
const body: TTSRequest = await request.json(); |
||||
const { text, voice, speed } = body; |
||||
|
||||
console.log('Piper TTS API: Processing request', { textLength: text?.length, voice, speed, voiceType: typeof voice, voiceValue: voice }); |
||||
|
||||
if (!text?.trim()) { |
||||
console.error('Piper TTS API: Missing text field'); |
||||
return errorResponse(400, 'Missing required field: text'); |
||||
} |
||||
|
||||
// Filter and prepare text
|
||||
const filteredText = filterCryptographicContent(text); |
||||
if (!filteredText.trim()) { |
||||
console.warn('Piper TTS API: Text is empty after filtering'); |
||||
return errorResponse(400, 'Text contains only cryptographic addresses/IDs that cannot be read aloud'); |
||||
} |
||||
|
||||
const sentences = splitIntoSentences(filteredText); |
||||
const fullText = sentences.filter(s => s.trim().length > 0).join(' '); |
||||
console.log(`Piper TTS API: Processing ${sentences.length} sentences, total length: ${fullText.length}`); |
||||
|
||||
// Use provided voice, or auto-detect language and select voice if not provided
|
||||
let selectedVoice = voice; |
||||
if (!selectedVoice || selectedVoice.trim() === '') { |
||||
const detectedLang = detectLanguage(fullText); |
||||
selectedVoice = getVoiceForLanguage(detectedLang); |
||||
console.log(`Piper TTS API: No voice provided, auto-detected language: ${detectedLang}, selected voice: ${selectedVoice}`); |
||||
} else { |
||||
console.log(`Piper TTS API: Using provided voice: ${selectedVoice}`); |
||||
} |
||||
|
||||
// Stream audio response with cancellation support
|
||||
const abortController = new AbortController(); |
||||
let wyomingCleanup: (() => void) | null = null; |
||||
|
||||
const stream = new ReadableStream({ |
||||
async start(controller) { |
||||
try { |
||||
const audioChunks: Uint8Array[] = []; |
||||
let audioFormat: { rate: number; width: number; channels: number } | null = null; |
||||
let totalBytes = 0; |
||||
|
||||
const tcpConfig = getTcpConfig(); |
||||
console.log('Piper TTS API: Connecting to Wyoming server at', tcpConfig.hostname, 'port', tcpConfig.port); |
||||
|
||||
await synthesizeWithWyoming( |
||||
tcpConfig, |
||||
fullText, |
||||
selectedVoice, |
||||
speed, |
||||
abortController.signal, |
||||
(cleanup) => { |
||||
wyomingCleanup = cleanup; |
||||
}, |
||||
(chunk: Uint8Array, format?: { rate: number; width: number; channels: number }) => { |
||||
if (abortController.signal.aborted) return; |
||||
|
||||
if (format && !audioFormat) { |
||||
audioFormat = format; |
||||
console.log('Piper TTS API: Received audio format:', format); |
||||
} |
||||
if (chunk.length > 0) { |
||||
audioChunks.push(chunk); |
||||
totalBytes += chunk.length; |
||||
} |
||||
} |
||||
); |
||||
|
||||
if (abortController.signal.aborted) { |
||||
console.log('Piper TTS API: Synthesis aborted'); |
||||
controller.close(); |
||||
return; |
||||
} |
||||
|
||||
if (!audioFormat || totalBytes === 0) { |
||||
throw new Error('No audio data received from Wyoming server'); |
||||
} |
||||
|
||||
console.log('Piper TTS API: Collected audio, total size:', totalBytes, 'bytes'); |
||||
|
||||
const format = audioFormat as { rate: number; width: number; channels: number }; |
||||
const wavHeader = createWavHeader(format.rate, format.width, format.channels, totalBytes); |
||||
controller.enqueue(wavHeader); |
||||
|
||||
for (const chunk of audioChunks) { |
||||
if (abortController.signal.aborted) break; |
||||
controller.enqueue(chunk); |
||||
} |
||||
|
||||
controller.close(); |
||||
} catch (error) { |
||||
if (abortController.signal.aborted) { |
||||
console.log('Piper TTS API: Operation cancelled'); |
||||
controller.close(); |
||||
} else { |
||||
console.error('Piper TTS API: Streaming error:', error); |
||||
controller.error(error); |
||||
} |
||||
} |
||||
}, |
||||
cancel() { |
||||
console.log('Piper TTS API: Stream cancelled by client'); |
||||
abortController.abort(); |
||||
if (wyomingCleanup) { |
||||
wyomingCleanup(); |
||||
} |
||||
} |
||||
}); |
||||
|
||||
return new Response(stream, { |
||||
headers: { |
||||
'Content-Type': 'audio/wav', |
||||
'Transfer-Encoding': 'chunked', |
||||
'Access-Control-Allow-Origin': '*', |
||||
}, |
||||
}); |
||||
} catch (error) { |
||||
const message = error instanceof Error ? error.message : 'Unknown error'; |
||||
console.error('Piper TTS API error:', message); |
||||
return errorResponse(500, message); |
||||
} |
||||
}; |
||||
|
||||
/** |
||||
* Synthesize speech using Wyoming protocol |
||||
* Protocol flow (standard): |
||||
* 1. Send: {"type":"synthesize","data":{"text":"..."}}\n |
||||
* 2. Receive format: {"rate":22050,"width":2,"channels":1}\n |
||||
* 3. Receive raw binary audio (no delimiters) |
||||
* 4. Optionally receive: {"type":"done"}\n or connection closes |
||||
*
|
||||
* Some implementations may send audio-chunk messages: |
||||
* - {"type":"audio-chunk","payload_length":N}\n followed by N bytes of binary audio |
||||
* - These may arrive before or after the format message |
||||
* - We handle both standard and audio-chunk variants for compatibility |
||||
*/ |
||||
async function synthesizeWithWyoming( |
||||
config: { hostname: string; port: number }, |
||||
text: string, |
||||
voice: string | undefined, |
||||
speed: number | undefined, |
||||
abortSignal: AbortSignal, |
||||
onCleanup: (cleanup: () => void) => void, |
||||
onChunk: (chunk: Uint8Array, format?: { rate: number; width: number; channels: number }) => void |
||||
): Promise<void> { |
||||
const net = await import('net'); |
||||
|
||||
return new Promise<void>((resolve, reject) => { |
||||
let socket: import('net').Socket | null = null; |
||||
let buffer = Buffer.alloc(0); |
||||
let audioFormat: { rate: number; width: number; channels: number } | null = null; |
||||
let hasReceivedAudio = false; |
||||
let isResolved = false; |
||||
let lastDataTime = Date.now(); |
||||
let completionTimer: NodeJS.Timeout | null = null; |
||||
const preFormatAudioChunks: Uint8Array[] = []; // Buffer audio chunks received before format
|
||||
let hasProcessedAudioChunks = false; // Track if we've processed audio-chunk messages
|
||||
|
||||
console.log('Wyoming: Creating TCP connection to', config.hostname, 'port', config.port); |
||||
|
||||
const cleanup = () => { |
||||
if (socket && !socket.destroyed) { |
||||
console.log('Wyoming: Cleaning up TCP connection'); |
||||
socket.destroy(); |
||||
} |
||||
}; |
||||
|
||||
// Register cleanup function
|
||||
onCleanup(cleanup); |
||||
|
||||
// Check if already aborted
|
||||
if (abortSignal.aborted) { |
||||
console.log('Wyoming: Abort signal already set, not connecting'); |
||||
reject(new Error('Operation cancelled')); |
||||
return; |
||||
} |
||||
|
||||
// Listen for abort signal
|
||||
const abortHandler = () => { |
||||
console.log('Wyoming: Abort signal received, cleaning up'); |
||||
if (completionTimer) { |
||||
clearTimeout(completionTimer); |
||||
completionTimer = null; |
||||
} |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
reject(new Error('Operation cancelled')); |
||||
} |
||||
}; |
||||
abortSignal.addEventListener('abort', abortHandler); |
||||
|
||||
const timeout = setTimeout(() => { |
||||
cleanup(); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
console.error('Wyoming: Timeout after 5 minutes'); |
||||
reject(new Error('Wyoming protocol timeout')); |
||||
} |
||||
}, 300000); // 5 minutes
|
||||
|
||||
try { |
||||
socket = net.createConnection(config.port, config.hostname, () => { |
||||
console.log('Wyoming: TCP connected successfully'); |
||||
// Send synthesize request
|
||||
// Wyoming protocol expects voice as an object with 'name' property, not a plain string
|
||||
const message = { |
||||
type: 'synthesize', |
||||
data: { |
||||
text, |
||||
...(voice ? { voice: { name: voice } } : {}), |
||||
...(speed !== undefined && speed !== 1.0 ? { speed } : {}), |
||||
} |
||||
}; |
||||
const messageStr = JSON.stringify(message) + '\n'; |
||||
console.log('Wyoming: Sending synthesize message, text length:', text.length, 'voice:', voice ? `{name: "${voice}"}` : 'none (will use default)'); |
||||
console.log('Wyoming: Full message:', messageStr.trim()); |
||||
try { |
||||
socket!.write(messageStr); |
||||
console.log('Wyoming: Synthesize message sent'); |
||||
} catch (writeError) { |
||||
console.error('Wyoming: Failed to write message:', writeError); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
reject(new Error(`Failed to send message: ${writeError instanceof Error ? writeError.message : String(writeError)}`)); |
||||
} |
||||
} |
||||
}); |
||||
} catch (error) { |
||||
console.error('Wyoming: Failed to create connection:', error); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
reject(new Error(`Failed to create connection: ${error instanceof Error ? error.message : String(error)}`)); |
||||
} |
||||
return; |
||||
} |
||||
|
||||
socket.on('data', (data: Buffer) => { |
||||
// Check if aborted
|
||||
if (abortSignal.aborted) { |
||||
console.log('Wyoming: Aborted, ignoring data'); |
||||
return; |
||||
} |
||||
|
||||
lastDataTime = Date.now(); |
||||
|
||||
// Clear completion timer since we're receiving data
|
||||
if (completionTimer) { |
||||
clearTimeout(completionTimer); |
||||
completionTimer = null; |
||||
} |
||||
|
||||
console.log('Wyoming: Received data, size:', data.length, 'bytes, audioFormat:', audioFormat ? 'received' : 'not received'); |
||||
buffer = Buffer.concat([buffer, data]); |
||||
|
||||
// Process buffer
|
||||
while (buffer.length > 0) { |
||||
// Check if aborted during processing
|
||||
if (abortSignal.aborted) { |
||||
console.log('Wyoming: Aborted during buffer processing'); |
||||
break; |
||||
} |
||||
|
||||
// After format received, check for "done" message, audio-chunk messages, or process as raw audio
|
||||
if (audioFormat) { |
||||
// Check if buffer starts with JSON (for done/error/audio-chunk messages)
|
||||
if (buffer.length > 0 && buffer[0] === 0x7b) { // '{' byte
|
||||
const newlineIndex = buffer.indexOf('\n'); |
||||
if (newlineIndex !== -1) { |
||||
try { |
||||
const line = buffer.subarray(0, newlineIndex).toString('utf8').trim(); |
||||
const message = JSON.parse(line); |
||||
|
||||
if (message.type === 'done') { |
||||
console.log('Wyoming: Received done message'); |
||||
if (completionTimer) { |
||||
clearTimeout(completionTimer); |
||||
completionTimer = null; |
||||
} |
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
resolve(); |
||||
} |
||||
return; |
||||
} |
||||
|
||||
if (message.type === 'error') { |
||||
console.error('Wyoming: Received error message:', message.message); |
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
reject(new Error(message.message || 'Wyoming protocol error')); |
||||
} |
||||
return; |
||||
} |
||||
|
||||
if (message.type === 'audio-stop') { |
||||
console.log('Wyoming: Received audio-stop message'); |
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
if (completionTimer) { |
||||
clearTimeout(completionTimer); |
||||
completionTimer = null; |
||||
} |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
resolve(); |
||||
} |
||||
return; |
||||
} |
||||
|
||||
// Handle audio-chunk messages after format
|
||||
if (message.type === 'audio-chunk' && typeof message.payload_length === 'number') { |
||||
const payloadLength = message.payload_length; |
||||
const messageEnd = newlineIndex + 1; |
||||
// If data_length is specified, there's additional JSON data before the payload
|
||||
const dataLength = typeof message.data_length === 'number' ? message.data_length : 0; |
||||
const payloadStart = messageEnd + dataLength; |
||||
const payloadEnd = payloadStart + payloadLength; |
||||
|
||||
if (buffer.length >= payloadEnd) { |
||||
const audioPayload = new Uint8Array(buffer.subarray(payloadStart, payloadEnd)); |
||||
onChunk(audioPayload); |
||||
hasReceivedAudio = true; |
||||
hasProcessedAudioChunks = true; |
||||
buffer = buffer.subarray(payloadEnd); |
||||
continue; // Continue processing loop
|
||||
} else { |
||||
// Don't have full payload yet - wait for more data
|
||||
break; |
||||
} |
||||
} |
||||
} catch (error) { |
||||
// Not valid JSON - treat as raw audio
|
||||
// Fall through to raw audio processing
|
||||
} |
||||
} else { |
||||
// No newline yet - might be incomplete JSON, wait for more data
|
||||
break; |
||||
} |
||||
} |
||||
|
||||
// No JSON message found - process all buffer as raw audio
|
||||
if (buffer.length > 0) { |
||||
onChunk(new Uint8Array(buffer)); |
||||
hasReceivedAudio = true; |
||||
buffer = Buffer.alloc(0); |
||||
} |
||||
|
||||
// If we've received audio and buffer is empty, set a completion timer
|
||||
// This handles cases where the server doesn't send "done" or close connection
|
||||
if (hasReceivedAudio && buffer.length === 0 && !completionTimer) { |
||||
completionTimer = setTimeout(() => { |
||||
if (!isResolved && hasReceivedAudio && !abortSignal.aborted) { |
||||
console.log('Wyoming: No data received for 500ms after audio, assuming completion'); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
isResolved = true; |
||||
resolve(); |
||||
} |
||||
}, 500); // 500ms timeout after last data
|
||||
} |
||||
|
||||
// Break and wait for more data (could be more audio or "done" message)
|
||||
break; |
||||
} |
||||
|
||||
// Before format: scan buffer for JSON format message
|
||||
// Look for '{' followed by newline-delimited JSON
|
||||
let formatFound = false; |
||||
let searchStart = 0; |
||||
|
||||
while (searchStart < buffer.length && !formatFound) { |
||||
const braceIndex = buffer.indexOf(0x7b, searchStart); // '{' byte
|
||||
if (braceIndex === -1) { |
||||
// No more '{' found - this is all binary data, buffer it
|
||||
break; |
||||
} |
||||
|
||||
// Look for newline after this '{'
|
||||
const newlineIndex = buffer.indexOf('\n', braceIndex); |
||||
if (newlineIndex === -1) { |
||||
// No newline yet - wait for more data
|
||||
break; |
||||
} |
||||
|
||||
// Try to parse as JSON
|
||||
const lineBytes = buffer.subarray(braceIndex, newlineIndex); |
||||
const line = lineBytes.toString('utf8').trim(); |
||||
|
||||
if (line.endsWith('}')) { |
||||
try { |
||||
const message = JSON.parse(line); |
||||
console.log('Wyoming: Received message:', JSON.stringify(message)); |
||||
|
||||
// Check for audio-start message (contains format info)
|
||||
if (message.type === 'audio-start' && (message.rate !== undefined || message.channels !== undefined)) { |
||||
audioFormat = { |
||||
rate: message.rate, |
||||
width: message.width || 2, |
||||
channels: message.channels, |
||||
}; |
||||
console.log('Wyoming: Audio format from audio-start:', audioFormat); |
||||
|
||||
// Send format notification
|
||||
onChunk(new Uint8Array(0), audioFormat); |
||||
|
||||
// Process any buffered audio chunks
|
||||
if (preFormatAudioChunks.length > 0) { |
||||
console.log('Wyoming: Processing', preFormatAudioChunks.length, 'buffered audio chunks after audio-start'); |
||||
for (const chunk of preFormatAudioChunks) { |
||||
onChunk(chunk); |
||||
hasReceivedAudio = true; |
||||
} |
||||
preFormatAudioChunks.length = 0; |
||||
hasProcessedAudioChunks = true; |
||||
} |
||||
|
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
searchStart = 0; |
||||
continue; |
||||
} |
||||
|
||||
// Check for format message (can be a standalone format object or embedded in other messages)
|
||||
if (message.rate !== undefined || message.channels !== undefined) { |
||||
audioFormat = { |
||||
rate: message.rate, |
||||
width: message.width || 2, |
||||
channels: message.channels, |
||||
}; |
||||
console.log('Wyoming: Audio format:', audioFormat); |
||||
|
||||
// Remove everything up to and including the format message
|
||||
const dataAfterFormat = buffer.subarray(newlineIndex + 1); |
||||
|
||||
// Send format notification first
|
||||
onChunk(new Uint8Array(0), audioFormat); |
||||
|
||||
// Process any buffered audio chunks received before format
|
||||
if (preFormatAudioChunks.length > 0) { |
||||
console.log('Wyoming: Processing', preFormatAudioChunks.length, 'buffered audio chunks'); |
||||
for (const chunk of preFormatAudioChunks) { |
||||
onChunk(chunk); |
||||
hasReceivedAudio = true; |
||||
} |
||||
preFormatAudioChunks.length = 0; // Clear the buffer
|
||||
hasProcessedAudioChunks = true; |
||||
} |
||||
|
||||
// Process any raw data before format as audio (protocol violation, but handle it)
|
||||
// BUT: Skip this if we've already processed audio-chunk messages, as that data
|
||||
// is likely protocol overhead or corrupted, not actual audio
|
||||
if (braceIndex > 0 && !hasProcessedAudioChunks) { |
||||
const preFormatData = buffer.subarray(0, braceIndex); |
||||
// Only process if it's not empty and looks like audio (not JSON)
|
||||
// Also check that it's a reasonable size (not just a few bytes of protocol overhead)
|
||||
if (preFormatData.length > 0 && preFormatData[0] !== 0x7b && preFormatData.length > 100) { |
||||
console.warn('Wyoming: Processing', braceIndex, 'bytes of raw data received before format message as audio'); |
||||
onChunk(new Uint8Array(preFormatData)); |
||||
hasReceivedAudio = true; |
||||
} else if (preFormatData.length > 0 && preFormatData.length <= 100) { |
||||
console.warn('Wyoming: Skipping', preFormatData.length, 'bytes of data before format (likely protocol overhead)'); |
||||
} |
||||
} else if (braceIndex > 0 && hasProcessedAudioChunks) { |
||||
console.warn('Wyoming: Skipping', braceIndex, 'bytes of data before format (audio-chunk messages already processed)'); |
||||
} |
||||
|
||||
// Process data after format as audio
|
||||
if (dataAfterFormat.length > 0) { |
||||
onChunk(new Uint8Array(dataAfterFormat)); |
||||
hasReceivedAudio = true; |
||||
} |
||||
|
||||
buffer = Buffer.alloc(0); |
||||
formatFound = true; |
||||
continue; // Continue processing loop
|
||||
} |
||||
|
||||
// Check for done/error messages
|
||||
if (message.type === 'done') { |
||||
console.log('Wyoming: Received done message'); |
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
if (hasReceivedAudio) { |
||||
resolve(); |
||||
} else { |
||||
reject(new Error('No audio data received')); |
||||
} |
||||
} |
||||
return; |
||||
} |
||||
|
||||
if (message.type === 'error') { |
||||
console.error('Wyoming: Received error message:', message.message); |
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
reject(new Error(message.message || 'Wyoming protocol error')); |
||||
} |
||||
return; |
||||
} |
||||
|
||||
// Handle audio-stop message
|
||||
if (message.type === 'audio-stop') { |
||||
console.log('Wyoming: Received audio-stop message'); |
||||
buffer = buffer.subarray(newlineIndex + 1); |
||||
|
||||
// If we have buffered audio chunks but no format, use default format
|
||||
if (preFormatAudioChunks.length > 0 && !audioFormat) { |
||||
console.warn('Wyoming: Format message never received, using default format for', preFormatAudioChunks.length, 'buffered chunks'); |
||||
// Default Piper TTS format: 22050 Hz, 16-bit (width=2), mono (channels=1)
|
||||
audioFormat = { |
||||
rate: 22050, |
||||
width: 2, |
||||
channels: 1, |
||||
}; |
||||
console.log('Wyoming: Using default audio format:', audioFormat); |
||||
|
||||
// Send format notification
|
||||
onChunk(new Uint8Array(0), audioFormat); |
||||
|
||||
// Process buffered chunks
|
||||
for (const chunk of preFormatAudioChunks) { |
||||
onChunk(chunk); |
||||
hasReceivedAudio = true; |
||||
} |
||||
preFormatAudioChunks.length = 0; |
||||
hasProcessedAudioChunks = true; |
||||
} |
||||
|
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
if (hasReceivedAudio) { |
||||
resolve(); |
||||
} else { |
||||
reject(new Error('No audio data received')); |
||||
} |
||||
} |
||||
return; |
||||
} |
||||
|
||||
// Handle audio-chunk messages
|
||||
if (message.type === 'audio-chunk' && typeof message.payload_length === 'number') { |
||||
const payloadLength = message.payload_length; |
||||
const messageEnd = newlineIndex + 1; |
||||
// If data_length is specified, there's additional JSON data before the payload
|
||||
const dataLength = typeof message.data_length === 'number' ? message.data_length : 0; |
||||
const payloadStart = messageEnd + dataLength; |
||||
const payloadEnd = payloadStart + payloadLength; |
||||
|
||||
console.log('Wyoming: Processing audio-chunk, payload_length:', payloadLength, 'data_length:', dataLength, 'buffer length:', buffer.length, 'payloadStart:', payloadStart, 'payloadEnd:', payloadEnd); |
||||
|
||||
// Check if we have the full payload
|
||||
if (buffer.length >= payloadEnd) { |
||||
// If there's data_length, try to parse the format from that data
|
||||
if (dataLength > 0 && !audioFormat) { |
||||
const dataBytes = buffer.subarray(messageEnd, payloadStart); |
||||
try { |
||||
const dataStr = dataBytes.toString('utf8'); |
||||
const formatData = JSON.parse(dataStr); |
||||
if (formatData.rate !== undefined || formatData.channels !== undefined) { |
||||
audioFormat = { |
||||
rate: formatData.rate, |
||||
width: formatData.width || 2, |
||||
channels: formatData.channels, |
||||
}; |
||||
console.log('Wyoming: Found format in data section:', audioFormat); |
||||
onChunk(new Uint8Array(0), audioFormat); |
||||
} |
||||
} catch (e) { |
||||
console.warn('Wyoming: Failed to parse data section as JSON:', e); |
||||
} |
||||
} |
||||
|
||||
// Extract the audio payload (after the data section)
|
||||
const audioPayload = new Uint8Array(buffer.subarray(payloadStart, payloadEnd)); |
||||
console.log('Wyoming: Extracted audio payload:', audioPayload.length, 'bytes, first 8 bytes:', Array.from(audioPayload.slice(0, 8)).map(b => '0x' + b.toString(16).padStart(2, '0')).join(' ')); |
||||
|
||||
// Check if format is embedded in the audio-chunk message itself
|
||||
if (!audioFormat && (message.rate !== undefined || message.channels !== undefined)) { |
||||
audioFormat = { |
||||
rate: message.rate || 22050, |
||||
width: message.width || 2, |
||||
channels: message.channels || 1, |
||||
}; |
||||
console.log('Wyoming: Found format in audio-chunk message:', audioFormat); |
||||
onChunk(new Uint8Array(0), audioFormat); |
||||
} |
||||
|
||||
// If we have format, process it as audio; otherwise buffer it
|
||||
if (audioFormat) { |
||||
onChunk(audioPayload); |
||||
hasReceivedAudio = true; |
||||
hasProcessedAudioChunks = true; |
||||
} else { |
||||
// Buffer audio chunks until we get format
|
||||
preFormatAudioChunks.push(audioPayload); |
||||
console.log('Wyoming: Buffering audio-chunk payload of', payloadLength, 'bytes (format not yet received)'); |
||||
hasProcessedAudioChunks = true; // Mark that we've seen audio-chunk messages
|
||||
} |
||||
|
||||
// Remove the message and payload from buffer
|
||||
buffer = buffer.subarray(payloadEnd); |
||||
searchStart = 0; // Reset search to start of buffer
|
||||
continue; |
||||
} else { |
||||
// Don't have full payload yet - wait for more data
|
||||
console.log('Wyoming: Waiting for more data, need', payloadEnd, 'have', buffer.length); |
||||
break; |
||||
} |
||||
} |
||||
|
||||
// Other JSON message - skip it and continue searching
|
||||
searchStart = newlineIndex + 1; |
||||
} catch (error) { |
||||
// Not valid JSON - continue searching
|
||||
searchStart = braceIndex + 1; |
||||
} |
||||
} else { |
||||
// Incomplete JSON - continue searching
|
||||
searchStart = braceIndex + 1; |
||||
} |
||||
} |
||||
|
||||
// If we found format, continue processing; otherwise wait for more data
|
||||
if (!formatFound) { |
||||
break; |
||||
} |
||||
} |
||||
}); |
||||
|
||||
socket.on('error', (error: Error) => { |
||||
console.error('Wyoming: TCP error:', error.message); |
||||
abortSignal.removeEventListener('abort', abortHandler); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
reject(new Error(`TCP error: ${error.message}`)); |
||||
} |
||||
}); |
||||
|
||||
socket.on('close', () => { |
||||
console.log('Wyoming: Connection closed, hasReceivedAudio:', hasReceivedAudio, 'buffer length:', buffer.length, 'buffered chunks:', preFormatAudioChunks.length); |
||||
if (completionTimer) { |
||||
clearTimeout(completionTimer); |
||||
completionTimer = null; |
||||
} |
||||
abortSignal.removeEventListener('abort', abortHandler); |
||||
cleanup(); |
||||
clearTimeout(timeout); |
||||
|
||||
// If we have buffered audio chunks but no format, use default format
|
||||
if (!abortSignal.aborted && preFormatAudioChunks.length > 0 && !audioFormat) { |
||||
console.warn('Wyoming: Format message never received before connection close, using default format for', preFormatAudioChunks.length, 'buffered chunks'); |
||||
// Default Piper TTS format: 22050 Hz, 16-bit (width=2), mono (channels=1)
|
||||
audioFormat = { |
||||
rate: 22050, |
||||
width: 2, |
||||
channels: 1, |
||||
}; |
||||
console.log('Wyoming: Using default audio format:', audioFormat); |
||||
|
||||
// Send format notification
|
||||
onChunk(new Uint8Array(0), audioFormat); |
||||
|
||||
// Process buffered chunks
|
||||
for (const chunk of preFormatAudioChunks) { |
||||
onChunk(chunk); |
||||
hasReceivedAudio = true; |
||||
} |
||||
preFormatAudioChunks.length = 0; |
||||
hasProcessedAudioChunks = true; |
||||
} |
||||
|
||||
// Only process remaining buffer if not aborted
|
||||
if (!abortSignal.aborted && buffer.length > 0 && audioFormat) { |
||||
console.log('Wyoming: Streaming remaining buffer:', buffer.length, 'bytes'); |
||||
onChunk(new Uint8Array(buffer)); |
||||
hasReceivedAudio = true; |
||||
} |
||||
|
||||
if (!isResolved) { |
||||
isResolved = true; |
||||
if (abortSignal.aborted) { |
||||
console.log('Wyoming: Connection closed after abort'); |
||||
reject(new Error('Operation cancelled')); |
||||
} else if (hasReceivedAudio) { |
||||
console.log('Wyoming: Resolving - audio received'); |
||||
resolve(); |
||||
} else { |
||||
console.error('Wyoming: Rejecting - no audio received'); |
||||
reject(new Error('Connection closed without audio data')); |
||||
} |
||||
} |
||||
}); |
||||
}); |
||||
} |
||||
|
||||
function getTcpConfig(): { hostname: string; port: number } { |
||||
// Allow override via environment variable
|
||||
const piperHost = process.env.PIPER_TTS_HOST || process.env.PIPER_HOST; |
||||
const piperPort = process.env.PIPER_TTS_PORT || process.env.PIPER_PORT; |
||||
|
||||
if (piperHost && piperPort) { |
||||
return { |
||||
hostname: piperHost, |
||||
port: parseInt(piperPort, 10), |
||||
}; |
||||
} |
||||
|
||||
// Default: use Docker service name in production, localhost in development
|
||||
const isDevelopment = process.env.NODE_ENV === 'development'; |
||||
return { |
||||
hostname: isDevelopment ? 'localhost' : 'piper-tts', |
||||
port: 10200, |
||||
}; |
||||
} |
||||
|
||||
function createWavHeader(sampleRate: number, bytesPerSample: number, channels: number, dataSize: number): Uint8Array { |
||||
const header = new ArrayBuffer(44); |
||||
const view = new DataView(header); |
||||
|
||||
// RIFF header
|
||||
view.setUint8(0, 0x52); // 'R'
|
||||
view.setUint8(1, 0x49); // 'I'
|
||||
view.setUint8(2, 0x46); // 'F'
|
||||
view.setUint8(3, 0x46); // 'F'
|
||||
view.setUint32(4, 36 + dataSize, true); // File size - 8
|
||||
|
||||
// WAVE header
|
||||
view.setUint8(8, 0x57); // 'W'
|
||||
view.setUint8(9, 0x41); // 'A'
|
||||
view.setUint8(10, 0x56); // 'V'
|
||||
view.setUint8(11, 0x45); // 'E'
|
||||
|
||||
// fmt chunk
|
||||
view.setUint8(12, 0x66); // 'f'
|
||||
view.setUint8(13, 0x6D); // 'm'
|
||||
view.setUint8(14, 0x74); // 't'
|
||||
view.setUint8(15, 0x20); // ' '
|
||||
view.setUint32(16, 16, true); // fmt chunk size
|
||||
view.setUint16(20, 1, true); // Audio format (1 = PCM)
|
||||
view.setUint16(22, channels, true); // Number of channels
|
||||
view.setUint32(24, sampleRate, true); // Sample rate
|
||||
view.setUint32(28, sampleRate * channels * bytesPerSample, true); // Byte rate
|
||||
view.setUint16(32, channels * bytesPerSample, true); // Block align
|
||||
view.setUint16(34, bytesPerSample * 8, true); // Bits per sample
|
||||
|
||||
// data chunk
|
||||
view.setUint8(36, 0x64); // 'd'
|
||||
view.setUint8(37, 0x61); // 'a'
|
||||
view.setUint8(38, 0x74); // 't'
|
||||
view.setUint8(39, 0x61); // 'a'
|
||||
view.setUint32(40, dataSize, true); // Data size
|
||||
|
||||
return new Uint8Array(header); |
||||
} |
||||
|
||||
function filterCryptographicContent(text: string): string { |
||||
let filtered = text; |
||||
|
||||
// Remove URLs
|
||||
filtered = filtered.replace(/https?:\/\/[^\s]+/gi, ''); |
||||
filtered = filtered.replace(/www\.[^\s]+/gi, ''); |
||||
|
||||
// Remove Nostr URIs and bech32 addresses
|
||||
filtered = filtered.replace(/nostr:[^\s]+/gi, ''); |
||||
filtered = filtered.replace(/\b(npub|note|nevent|naddr|nprofile|nsec|ncryptsec)1[a-z0-9]{20,}\b/gi, ''); |
||||
|
||||
// Remove hex strings
|
||||
filtered = filtered.replace(/\b[0-9a-f]{64}\b/gi, ''); |
||||
filtered = filtered.replace(/\b[0-9a-f]{32,63}\b/gi, ''); |
||||
|
||||
// Remove emojis
|
||||
filtered = filtered.replace(/[\u{1F300}-\u{1F9FF}]/gu, ''); |
||||
filtered = filtered.replace(/[\u{1F600}-\u{1F64F}]/gu, ''); |
||||
filtered = filtered.replace(/[\u{2600}-\u{26FF}]/gu, ''); |
||||
filtered = filtered.replace(/[\u{2700}-\u{27BF}]/gu, ''); |
||||
|
||||
// Remove markdown and asciidoc markup
|
||||
|
||||
// Code blocks (markdown and asciidoc)
|
||||
filtered = filtered.replace(/```[\s\S]*?```/g, ''); |
||||
filtered = filtered.replace(/`[^`]+`/g, ''); |
||||
filtered = filtered.replace(/----[\s\S]*?----/g, ''); // AsciiDoc code blocks
|
||||
filtered = filtered.replace(/\[source[^\]]*\][\s\S]*?----/g, ''); // AsciiDoc source blocks
|
||||
|
||||
// Headers (markdown and asciidoc)
|
||||
filtered = filtered.replace(/^#+\s+/gm, ''); // Markdown headers at start of line
|
||||
filtered = filtered.replace(/\s+#+\s+/g, ' '); // Markdown headers in middle of text
|
||||
filtered = filtered.replace(/^=+\s*$/gm, ''); // AsciiDoc headers (single line)
|
||||
filtered = filtered.replace(/^=+\s+/gm, ''); // AsciiDoc headers at start of line
|
||||
filtered = filtered.replace(/\s+=+\s+/g, ' '); // AsciiDoc headers in middle of text
|
||||
|
||||
// Links (markdown and asciidoc)
|
||||
filtered = filtered.replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1'); // Markdown links
|
||||
filtered = filtered.replace(/\[\[([^\]]+)\]\]/g, '$1'); // AsciiDoc links
|
||||
filtered = filtered.replace(/link:([^\[]+)\[([^\]]+)\]/g, '$2'); // AsciiDoc link: syntax
|
||||
|
||||
// Images (markdown and asciidoc)
|
||||
filtered = filtered.replace(/!\[([^\]]*)\]\([^\)]+\)/g, ''); // Markdown images
|
||||
filtered = filtered.replace(/image::?[^\[]+\[([^\]]*)\]/g, '$1'); // AsciiDoc images
|
||||
|
||||
// Emphasis and formatting
|
||||
filtered = filtered.replace(/\*\*([^*]+)\*\*/g, '$1'); // Bold markdown
|
||||
filtered = filtered.replace(/\*([^*]+)\*/g, '$1'); // Italic markdown
|
||||
filtered = filtered.replace(/__([^_]+)__/g, '$1'); // Bold markdown (underscore)
|
||||
filtered = filtered.replace(/_([^_]+)_/g, '$1'); // Italic markdown (underscore)
|
||||
filtered = filtered.replace(/\*\*([^*]+)\*\*/g, '$1'); // Bold asciidoc
|
||||
filtered = filtered.replace(/\*([^*]+)\*/g, '$1'); // Italic asciidoc
|
||||
filtered = filtered.replace(/\+\+([^+]+)\+\+/g, '$1'); // Monospace asciidoc
|
||||
filtered = filtered.replace(/~~([^~]+)~~/g, '$1'); // Strikethrough markdown
|
||||
|
||||
// Lists (markdown and asciidoc)
|
||||
filtered = filtered.replace(/^[\*\-\+]\s+/gm, ''); // Markdown unordered lists
|
||||
filtered = filtered.replace(/^\d+\.\s+/gm, ''); // Markdown ordered lists
|
||||
filtered = filtered.replace(/^\.\s+/gm, ''); // AsciiDoc unordered lists
|
||||
filtered = filtered.replace(/^\d+\.\s+/gm, ''); // AsciiDoc ordered lists
|
||||
|
||||
// Blockquotes
|
||||
filtered = filtered.replace(/^>\s+/gm, ''); // Markdown blockquotes
|
||||
filtered = filtered.replace(/^\[quote[^\]]*\][\s\S]*?\[quote\]/g, ''); // AsciiDoc quotes
|
||||
|
||||
// Horizontal rules
|
||||
filtered = filtered.replace(/^[-*_]{3,}\s*$/gm, ''); // Markdown horizontal rules
|
||||
filtered = filtered.replace(/^'''+\s*$/gm, ''); // AsciiDoc horizontal rules
|
||||
|
||||
// Tables (markdown and asciidoc)
|
||||
filtered = filtered.replace(/\|/g, ' '); // Remove table separators
|
||||
filtered = filtered.replace(/^\|.+\|\s*$/gm, ''); // Remove table rows
|
||||
filtered = filtered.replace(/^\[cols?=[^\]]*\][\s\S]*?\|===\s*$/gm, ''); // AsciiDoc tables
|
||||
|
||||
// Other asciidoc syntax
|
||||
filtered = filtered.replace(/\[\[([^\]]+)\]\]/g, ''); // AsciiDoc anchors
|
||||
filtered = filtered.replace(/\[NOTE\]/gi, ''); |
||||
filtered = filtered.replace(/\[TIP\]/gi, ''); |
||||
filtered = filtered.replace(/\[WARNING\]/gi, ''); |
||||
filtered = filtered.replace(/\[IMPORTANT\]/gi, ''); |
||||
filtered = filtered.replace(/\[CAUTION\]/gi, ''); |
||||
filtered = filtered.replace(/\[source[^\]]*\]/gi, ''); |
||||
filtered = filtered.replace(/\[caption[^\]]*\]/gi, ''); |
||||
|
||||
// Clean up whitespace
|
||||
filtered = filtered.replace(/\s+/g, ' ').trim(); |
||||
|
||||
return filtered; |
||||
} |
||||
|
||||
function splitIntoSentences(text: string): string[] { |
||||
const cleaned = text |
||||
.replace(/^#+\s+/gm, '') |
||||
.replace(/\n+/g, ' ') |
||||
.trim(); |
||||
|
||||
const sentences: string[] = []; |
||||
const regex = /([.!?]+)\s+/g; |
||||
let lastIndex = 0; |
||||
let match; |
||||
|
||||
while ((match = regex.exec(cleaned)) !== null) { |
||||
const sentence = cleaned.substring(lastIndex, match.index + match[1].length).trim(); |
||||
if (sentence.length > 0) { |
||||
sentences.push(sentence); |
||||
} |
||||
lastIndex = match.index + match[0].length; |
||||
} |
||||
|
||||
const remaining = cleaned.substring(lastIndex).trim(); |
||||
if (remaining.length > 0) { |
||||
sentences.push(remaining); |
||||
} |
||||
|
||||
return sentences.length > 0 ? sentences : [cleaned]; |
||||
} |
||||
|
||||
function errorResponse(status: number, message: string): Response { |
||||
return new Response(JSON.stringify({ error: message }), { |
||||
status, |
||||
headers: { 'Content-Type': 'application/json' }, |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Simple language detection based on character patterns |
||||
* Returns language code (e.g., 'en', 'de', 'fr', 'es', etc.) |
||||
*/ |
||||
function detectLanguage(text: string): string { |
||||
if (!text || text.length === 0) return 'en'; |
||||
|
||||
// Count character patterns to detect language
|
||||
const sample = text.substring(0, Math.min(500, text.length)); |
||||
|
||||
// German: ä, ö, ü, ß
|
||||
const germanChars = (sample.match(/[äöüßÄÖÜ]/g) || []).length; |
||||
// French: é, è, ê, ç, à, etc.
|
||||
const frenchChars = (sample.match(/[éèêëàâäçôùûüÉÈÊËÀÂÄÇÔÙÛÜ]/g) || []).length; |
||||
// Spanish: ñ, á, é, í, ó, ú, ¿, ¡
|
||||
const spanishChars = (sample.match(/[ñáéíóúüÑÁÉÍÓÚÜ¿¡]/g) || []).length; |
||||
// Italian: à, è, é, ì, ò, ù
|
||||
const italianChars = (sample.match(/[àèéìòùÀÈÉÌÒÙ]/g) || []).length; |
||||
// Russian/Cyrillic
|
||||
const cyrillicChars = (sample.match(/[а-яёА-ЯЁ]/g) || []).length; |
||||
// Chinese/Japanese/Korean (CJK)
|
||||
const cjkChars = (sample.match(/[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/g) || []).length; |
||||
// Arabic
|
||||
const arabicChars = (sample.match(/[\u0600-\u06ff]/g) || []).length; |
||||
|
||||
// Calculate ratios
|
||||
const total = sample.length; |
||||
const germanRatio = germanChars / total; |
||||
const frenchRatio = frenchChars / total; |
||||
const spanishRatio = spanishChars / total; |
||||
const italianRatio = italianChars / total; |
||||
const cyrillicRatio = cyrillicChars / total; |
||||
const cjkRatio = cjkChars / total; |
||||
const arabicRatio = arabicChars / total; |
||||
|
||||
// Detect based on highest ratio
|
||||
if (cyrillicRatio > 0.1) return 'ru'; |
||||
if (cjkRatio > 0.1) return 'zh'; // Default to Chinese for CJK
|
||||
if (arabicRatio > 0.1) return 'ar'; |
||||
if (germanRatio > 0.02) return 'de'; |
||||
if (frenchRatio > 0.02) return 'fr'; |
||||
if (spanishRatio > 0.02) return 'es'; |
||||
if (italianRatio > 0.02) return 'it'; |
||||
|
||||
// Default to English
|
||||
return 'en'; |
||||
} |
||||
|
||||
/** |
||||
* Map language code to Piper voice name |
||||
* Returns voice name (always returns a value, defaults to English) |
||||
* Voice names follow pattern: {lang}_{locale}-{voice}-{quality} |
||||
*
|
||||
* Note: These are common voice names. You may need to adjust based on |
||||
* which voices are actually available in your piper-data directory. |
||||
* To see available voices, check the piper-data folder or Wyoming server logs. |
||||
*/ |
||||
function getVoiceForLanguage(lang: string): string { |
||||
// Common voice mappings - adjust based on available voices in your piper-data directory
|
||||
const voiceMap: Record<string, string> = { |
||||
'en': 'en_US-lessac-medium', // Default English voice
|
||||
'de': 'de_DE-thorsten-medium', // German
|
||||
'fr': 'fr_FR-siwis-medium', // French
|
||||
'es': 'es_ES-davefx-medium', // Spanish
|
||||
// 'it': 'it_IT-riccardo-medium', // Italian - not available
|
||||
'ru': 'ru_RU-ruslan-medium', // Russian
|
||||
'zh': 'zh_CN-huayan-medium', // Chinese
|
||||
// 'ar': 'ar_SA-hafez-medium', // Arabic - not available
|
||||
'pl': 'pl_PL-darkman-medium', // Polish
|
||||
// 'pt': 'pt_BR-edresson-medium', // Portuguese - not available
|
||||
'nl': 'nl_NL-mls-medium', // Dutch
|
||||
'cs': 'cs_CZ-jirka-medium', // Czech
|
||||
'tr': 'tr_TR-dfki-medium', // Turkish
|
||||
// 'ja': 'ja_JP-nanami-medium', // Japanese - not available
|
||||
// 'ko': 'ko_KR-kyungha-medium', // Korean - not available
|
||||
}; |
||||
|
||||
return voiceMap[lang] || voiceMap['en']; // Fall back to English
|
||||
} |
||||
Loading…
Reference in new issue