Files
opencode-skill/skills/minimax-multimodal-toolkit/scripts/tts/generate_voice.sh
Kunthawat Greethong 7edf5bc4d0 feat: Import 35+ skills, merge duplicates, add openclaw installer
Major updates:
- Added 35+ new skills from awesome-opencode-skills and antigravity repos
- Merged SEO skills into seo-master
- Merged architecture skills into architecture
- Merged security skills into security-auditor and security-coder
- Merged testing skills into testing-master and testing-patterns
- Merged pentesting skills into pentesting
- Renamed website-creator to thai-frontend-dev
- Replaced skill-creator with github version
- Removed Chutes references (use MiniMax API instead)
- Added install-openclaw-skills.sh for cross-platform installation
- Updated .env.example with MiniMax API credentials
2026-03-26 11:37:39 +07:00

935 lines
27 KiB
Bash
Executable File

#!/usr/bin/env bash
# MiniMax Voice CLI — Unified TTS command-line interface (pure bash)
#
# Usage:
# bash scripts/tts/generate_voice.sh tts "Hello world" -o hello.mp3
# bash scripts/tts/generate_voice.sh clone my_voice.mp3 --voice-id my-custom-voice
# bash scripts/tts/generate_voice.sh design "A gentle female voice" --voice-id designed-voice-1
# bash scripts/tts/generate_voice.sh list-voices
# bash scripts/tts/generate_voice.sh validate segments.json
# bash scripts/tts/generate_voice.sh generate segments.json -o output.mp3
# bash scripts/tts/generate_voice.sh merge file1.mp3 file2.mp3 -o combined.mp3
# bash scripts/tts/generate_voice.sh convert input.wav -o output.mp3
# bash scripts/tts/generate_voice.sh check-env
# Strict mode: exit on unhandled errors (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# ============================================================================
# Configuration
# ============================================================================
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Two directory levels above SCRIPT_DIR; load_env looks for a .env file here.
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# ============================================================================
# Common functions
# ============================================================================
#######################################
# Load KEY=VALUE pairs from the first .env file that exists.
# Candidates, in order: "$PROJECT_ROOT/.env", then "$(pwd)/.env". Only the
# first existing file is read. A variable that already has a non-empty value
# in the environment is never overridden.
#
# Accepted line syntax:
#   KEY=value            unquoted; '#' starts a trailing comment
#   KEY="va#lue"         quoted (single or double); content taken verbatim
#   export KEY=value     an optional leading "export" is ignored
# Blank lines, comment lines, lines without '=' and lines whose key is not a
# valid shell variable name are skipped.
#
# Parsing is done with parameter expansion only: piping through xargs fails on
# values containing an apostrophe or quote, which aborts the script under
# `set -e`.
#
# Globals:   PROJECT_ROOT (read); exports the variables it loads
# Returns:   0 always
#######################################
load_env() {
  local env_file line key val quote rest
  local export_re='^export[[:space:]]+(.*)$'
  local name_re='^[A-Za-z_][A-Za-z0-9_]*$'

  for env_file in "$PROJECT_ROOT/.env" "$(pwd)/.env"; do
    [[ -f "$env_file" ]] || continue

    while IFS= read -r line || [[ -n "$line" ]]; do
      line="${line%$'\r'}"                       # tolerate CRLF line endings
      line="${line#"${line%%[![:space:]]*}"}"    # trim leading whitespace
      [[ -z "$line" || "$line" == '#'* || "$line" != *=* ]] && continue

      key="${line%%=*}"
      val="${line#*=}"

      # Normalise the key: trim trailing whitespace, drop an "export" prefix.
      key="${key%"${key##*[![:space:]]}"}"
      if [[ "$key" =~ $export_re ]]; then
        key="${BASH_REMATCH[1]}"
      fi
      # An invalid name would make the ${!key} indirection below a hard error.
      [[ "$key" =~ $name_re ]] || continue

      val="${val#"${val%%[![:space:]]*}"}"       # trim leading whitespace
      quote="${val:0:1}"
      rest="${val:1}"
      if [[ ( "$quote" == '"' || "$quote" == "'" ) && "$rest" == *"$quote"* ]]; then
        # Quoted value: keep everything up to the closing quote, '#' included.
        val="${rest%%"$quote"*}"
      else
        # Unquoted value: strip a trailing comment, then trailing whitespace.
        val="${val%%#*}"
        val="${val%"${val##*[![:space:]]}"}"
      fi

      # Only set if not already in environment
      if [[ -z "${!key:-}" ]]; then
        export "$key=$val"
      fi
    done < "$env_file"
    return 0
  done
  return 0
}
# Guard for commands that talk to the API: returns 0 when MINIMAX_API_KEY is
# non-empty, otherwise prints a hint on stderr and exits the shell with 1.
check_api_key() {
  [[ -n "${MINIMAX_API_KEY:-}" ]] && return 0
  {
    echo "Error: MINIMAX_API_KEY environment variable is not set"
    echo " export MINIMAX_API_KEY='your-key'"
  } >&2
  exit 1
}
#######################################
# Create a directory (and any missing parents) if a path is given.
# Arguments: $1 - directory path; an empty or missing value is a no-op
# Returns:   0 when nothing had to be done or the directory exists afterwards,
#            mkdir's status otherwise
#######################################
ensure_dir() {
  local dir="${1:-}"
  # An explicit `if` keeps the status at 0 for an empty path; the short-circuit
  # form `[[ -n ... ]] && mkdir` returns 1 there and aborts under `set -e`.
  if [[ -n "$dir" ]]; then
    mkdir -p -- "$dir"
  fi
}
# Base URL for every API call: the host comes from MINIMAX_API_HOST when set,
# otherwise https://api.minimaxi.com; the /v1 path segment is always appended.
API_BASE="${MINIMAX_API_HOST:-https://api.minimaxi.com}/v1"
#######################################
# Perform an authenticated JSON request against the API.
# Usage:     api_request METHOD ENDPOINT [JSON_BODY]
# Globals:   API_BASE, MINIMAX_API_KEY (read)
# Arguments: $1 - HTTP method
#            $2 - endpoint path relative to API_BASE (leading '/' optional)
#            $3 - optional JSON request body
# Outputs:   the raw response body on stdout; diagnostics on stderr
# Exits:     1 when curl fails, the HTTP status is >= 400, or the response
#            carries a non-zero .base_resp.status_code
#######################################
api_request() {
  local method="$1" endpoint="$2" body="${3:-}"
  local url="${API_BASE}/${endpoint#/}"

  # -sS: no progress meter, but curl still reports its own errors on stderr
  # (DNS failure, timeout, TLS problems) instead of failing silently.
  # -w appends the HTTP status on its own last line so it can be split off.
  local args=(
    -sS -w "\n%{http_code}"
    -X "$method"
    -H "Authorization: Bearer ${MINIMAX_API_KEY}"
    -H "Accept-Encoding: gzip, deflate"
    --compressed
    --max-time 120
  )
  if [[ -n "$body" ]]; then
    args+=(-H "Content-Type: application/json" -d "$body")
  fi
  args+=("$url")

  local output http_code response curl_rc=0
  output="$(curl "${args[@]}")" || curl_rc=$?
  if [[ "$curl_rc" -ne 0 ]]; then
    echo "Error: curl request failed (exit code $curl_rc)" >&2
    exit 1
  fi

  http_code="${output##*$'\n'}"   # last line: status written by -w
  response="${output%$'\n'*}"     # everything before it: the body

  if [[ "$http_code" =~ ^[0-9]+$ ]] && (( 10#$http_code >= 400 )); then
    echo "Error: API returned HTTP $http_code" >&2
    printf '%s\n' "$response" >&2
    exit 1
  fi

  # Check API-level error. A body that is not JSON leaves status_code empty
  # and is passed through unchanged.
  local status_code
  status_code="$(jq -r '.base_resp.status_code // 0' <<<"$response" 2>/dev/null)" || true
  if [[ -n "$status_code" && "$status_code" != "0" ]]; then
    local status_msg
    status_msg="$(jq -r '.base_resp.status_msg // "Unknown error"' <<<"$response")"
    echo "Error: API error [$status_code]: $status_msg" >&2
    exit 1
  fi

  printf '%s\n' "$response"
}
#######################################
# Upload a local file as multipart/form-data.
# Usage:     api_upload ENDPOINT FILE_PATH PURPOSE
# Globals:   API_BASE, MINIMAX_API_KEY (read)
# Arguments: $1 - endpoint path relative to API_BASE (leading '/' optional)
#            $2 - path of the file to send as the "file" form field
#            $3 - value of the "purpose" form field
# Outputs:   the raw response body on stdout; diagnostics on stderr
# Exits:     1 when curl fails, the HTTP status is >= 400, or the response
#            carries a non-zero .base_resp.status_code
#######################################
api_upload() {
  local endpoint="$1" file_path="$2" purpose="$3"
  local url="${API_BASE}/${endpoint#/}"

  # curl's -F syntax gives ',' and ';' a meaning inside "file=@path", so the
  # path is wrapped in double quotes with '\' and '"' backslash-escaped.
  local quoted_path
  quoted_path=${file_path//\\/\\\\}
  quoted_path=${quoted_path//\"/\\\"}

  local output http_code response curl_rc=0
  # -sS keeps curl's own error text; --form-string sends the purpose literally
  # (a leading '@' or '<' would otherwise be read as a file reference).
  output="$(curl -sS -w "\n%{http_code}" \
    -X POST \
    -H "Authorization: Bearer ${MINIMAX_API_KEY}" \
    -H "Accept-Encoding: gzip, deflate" \
    --compressed \
    -F "file=@\"${quoted_path}\"" \
    --form-string "purpose=${purpose}" \
    --max-time 120 \
    "$url")" || curl_rc=$?
  if [[ "$curl_rc" -ne 0 ]]; then
    echo "Error: curl upload failed (exit code $curl_rc)" >&2
    exit 1
  fi

  http_code="${output##*$'\n'}"   # last line: status written by -w
  response="${output%$'\n'*}"     # everything before it: the body

  if [[ "$http_code" =~ ^[0-9]+$ ]] && (( 10#$http_code >= 400 )); then
    echo "Error: API returned HTTP $http_code" >&2
    printf '%s\n' "$response" >&2
    exit 1
  fi

  # API-level error check; a non-JSON body leaves status_code empty.
  local status_code
  status_code="$(jq -r '.base_resp.status_code // 0' <<<"$response" 2>/dev/null)" || true
  if [[ -n "$status_code" && "$status_code" != "0" ]]; then
    local status_msg
    status_msg="$(jq -r '.base_resp.status_msg // "Unknown error"' <<<"$response")"
    echo "Error: API error [$status_code]: $status_msg" >&2
    exit 1
  fi

  printf '%s\n' "$response"
}
# hex_to_file HEX_STRING OUTPUT_PATH
# Decode a plain hex dump into binary and write it to OUTPUT_PATH, creating
# the parent directory first.
hex_to_file() {
  local hex_payload="$1" dest_path="$2"
  local dest_dir
  dest_dir="$(dirname "$dest_path")"
  ensure_dir "$dest_dir"
  # xxd -r -p: reverse a plain (un-addressed) hex dump read from stdin.
  printf '%s\n' "$hex_payload" | xxd -r -p > "$dest_path"
}
# ============================================================================
# Subcommand: tts
# ============================================================================
#######################################
# Subcommand "tts": synthesize one piece of text.
# Usage: cmd_tts TEXT [-v|--voice-id ID] [-o|--output FILE] [--model NAME]
#                     [--speed N] [--volume N] [--pitch N] [--emotion NAME]
#                     [--format FMT] [--sample-rate HZ] [--language-boost L]
# TEXT may also appear after the options; `--` forces the next argument to be
# taken as text even if it starts with a dash.
# Outputs: progress on stdout; with -o the decoded audio is written to FILE,
#          otherwise only the size of the returned hex payload is reported.
# Exits:   1 on usage errors, or when the API returns no audio.
#######################################
cmd_tts() {
  local text="" voice_id="male-qn-qingse" output="" model="speech-2.8-hd"
  local speed=1.0 volume=1.0 pitch=0 emotion="" audio_format="mp3"
  local sample_rate=32000 language_boost=""
  local long_opt_re='^--[A-Za-z][A-Za-z0-9-]*$'
  local num_re='^-?[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'

  # First positional arg is text
  if [[ $# -gt 0 && "$1" != -* ]]; then
    text="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    # Options that take a value must be followed by one; otherwise "$2" is
    # unbound under `set -u` and `shift 2` fails.
    case "$1" in
      -v|--voice-id|-o|--output|--model|--speed|--volume|--pitch|--emotion|--format|--sample-rate|--language-boost)
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        ;;
    esac
    case "$1" in
      -v|--voice-id) voice_id="$2"; shift 2 ;;
      -o|--output) output="$2"; shift 2 ;;
      --model) model="$2"; shift 2 ;;
      --speed) speed="$2"; shift 2 ;;
      --volume) volume="$2"; shift 2 ;;
      --pitch) pitch="$2"; shift 2 ;;
      --emotion) emotion="$2"; shift 2 ;;
      --format) audio_format="$2"; shift 2 ;;
      --sample-rate) sample_rate="$2"; shift 2 ;;
      --language-boost) language_boost="$2"; shift 2 ;;
      --)
        shift
        if [[ $# -gt 0 ]]; then
          text="$1"; shift
        fi
        ;;
      *)
        # A mistyped long option must not silently replace the text.
        if [[ "$1" =~ $long_opt_re ]]; then
          echo "Error: unknown option '$1'" >&2
          exit 1
        fi
        text="$1"; shift
        ;;
    esac
  done

  if [[ -z "$text" ]]; then
    echo "Error: text is required" >&2
    echo "Usage: $(basename "$0") tts \"Text to speak\" -o output.mp3" >&2
    exit 1
  fi

  # These values are spliced into JSON with --argjson, so they must be
  # plain JSON numbers.
  local name
  for name in speed volume pitch sample_rate; do
    if [[ ! "${!name}" =~ $num_re ]]; then
      echo "Error: --${name//_/-} must be a number (got '${!name}')" >&2
      exit 1
    fi
  done

  # Build voice_setting
  local voice_setting
  voice_setting=$(jq -n \
    --arg vid "$voice_id" \
    --argjson spd "$speed" \
    --argjson vol "$volume" \
    --argjson pit "$pitch" \
    '{voice_id: $vid, speed: $spd, vol: $vol, pitch: $pit}')
  if [[ -n "$emotion" ]]; then
    voice_setting=$(jq --arg e "$emotion" '. + {emotion: $e}' <<<"$voice_setting")
  fi

  # Build payload
  local payload
  payload=$(jq -n \
    --arg model "$model" \
    --arg text "$text" \
    --argjson vs "$voice_setting" \
    --arg fmt "$audio_format" \
    --argjson sr "$sample_rate" \
    '{
      model: $model,
      text: $text,
      voice_setting: $vs,
      audio_setting: {sample_rate: $sr, bitrate: 128000, format: $fmt, channel: 1},
      stream: false,
      subtitle_enable: false,
      output_format: "hex"
    }')
  if [[ -n "$language_boost" ]]; then
    payload=$(jq --arg lb "$language_boost" '. + {language_boost: $lb}' <<<"$payload")
  fi

  echo "Synthesizing: ${text:0:50}..."
  local response
  response="$(api_request POST t2a_v2 "$payload")"

  # Extract hex audio
  local audio_hex
  audio_hex="$(jq -r '.data.audio // .extra_info.audio // empty' <<<"$response")"
  if [[ -z "$audio_hex" ]]; then
    echo "Error: No audio data returned from API" >&2
    exit 1
  fi

  if [[ -n "$output" ]]; then
    hex_to_file "$audio_hex" "$output"
    echo "Done: $output"
  else
    echo "Generated ${#audio_hex} hex chars of audio"
  fi
}
# ============================================================================
# Subcommand: clone
# ============================================================================
#######################################
# Subcommand "clone": upload an audio sample and register it as a voice.
# Usage: cmd_clone AUDIO_FILE --voice-id ID [--preview TEXT]
#                  [--preview-output FILE]
# Steps: upload the sample, call the voice_clone endpoint, and, when
# --preview is given, synthesize TEXT with the new voice via cmd_tts
# (default preview file: "<voice-id>_preview.mp3").
# Exits: 1 on usage errors, a missing audio file, or a missing file_id in the
#        upload response.
#######################################
cmd_clone() {
  local audio_file="" voice_id="" preview_text="" preview_output=""

  # First positional arg is audio file
  if [[ $# -gt 0 && "$1" != -* ]]; then
    audio_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --voice-id|--preview|--preview-output)
        # Without this check "$2" is unbound under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        ;;
    esac
    case "$1" in
      --voice-id) voice_id="$2"; shift 2 ;;
      --preview) preview_text="$2"; shift 2 ;;
      --preview-output) preview_output="$2"; shift 2 ;;
      *)
        if [[ -z "$audio_file" ]]; then
          audio_file="$1"
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$audio_file" ]]; then
    echo "Error: audio file is required" >&2
    echo "Usage: $(basename "$0") clone audio.mp3 --voice-id my-voice" >&2
    exit 1
  fi
  if [[ ! -f "$audio_file" ]]; then
    echo "Error: Audio file not found: $audio_file" >&2
    exit 1
  fi
  if [[ -z "$voice_id" ]]; then
    echo "Error: --voice-id is required" >&2
    exit 1
  fi

  echo "Cloning voice from: $audio_file"
  echo "Voice ID: $voice_id"

  # Step 1: Upload audio
  local upload_response file_id
  upload_response="$(api_upload files/upload "$audio_file" voice_clone)"
  file_id="$(jq -r '.file.file_id // .file_id // empty' <<<"$upload_response")"
  if [[ -z "$file_id" ]]; then
    echo "Error: Upload succeeded but no file_id was returned" >&2
    exit 1
  fi

  # Step 2: Clone voice
  # A numeric file_id is sent as a JSON number, as before. Any other value is
  # sent as a JSON string instead of crashing jq's --argjson.
  # NOTE(review): confirm the API accepts a string file_id.
  local clone_payload
  if [[ "$file_id" =~ ^[0-9]+$ ]]; then
    clone_payload=$(jq -n \
      --arg vid "$voice_id" \
      --argjson fid "$file_id" \
      '{voice_id: $vid, file_id: $fid}')
  else
    clone_payload=$(jq -n \
      --arg vid "$voice_id" \
      --arg fid "$file_id" \
      '{voice_id: $vid, file_id: $fid}')
  fi
  api_request POST voice_clone "$clone_payload" > /dev/null
  echo "Voice cloned successfully: $voice_id"

  # Step 3: Preview if requested
  if [[ -n "$preview_text" ]]; then
    echo "Generating preview..."
    local pout="${preview_output:-${voice_id}_preview.mp3}"
    cmd_tts "$preview_text" -v "$voice_id" -o "$pout"
    echo "Preview saved to: $pout"
  fi
}
# ============================================================================
# Subcommand: design
# ============================================================================
#######################################
# Subcommand "design": create a voice from a textual description.
# Usage: cmd_design DESCRIPTION [--voice-id ID] [--preview TEXT]
#                   [--preview-output FILE]
# Sends DESCRIPTION as the prompt together with a preview text (a built-in
# sentence when --preview is omitted). If the response contains trial audio
# it is decoded to FILE (default "<voice-id>_preview.mp3").
# Exits: 1 on usage errors.
#######################################
cmd_design() {
  local description="" voice_id="" preview_text="" preview_output=""

  if [[ $# -gt 0 && "$1" != -* ]]; then
    description="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --voice-id|--preview|--preview-output)
        # Without this check "$2" is unbound under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        ;;
    esac
    case "$1" in
      --voice-id) voice_id="$2"; shift 2 ;;
      --preview) preview_text="$2"; shift 2 ;;
      --preview-output) preview_output="$2"; shift 2 ;;
      *)
        if [[ -z "$description" ]]; then
          description="$1"
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$description" ]]; then
    echo "Error: description is required" >&2
    # Quotes inside $( ) must not be backslash-escaped: \"$0\" would hand
    # literal quote characters to basename and print a stray '"'.
    echo "Usage: $(basename "$0") design \"A warm female voice\" --voice-id narrator" >&2
    exit 1
  fi

  local ptext="${preview_text:-This is a preview of the designed voice.}"
  echo "Designing voice from: \"$description\""
  if [[ -n "$voice_id" ]]; then
    echo "Voice ID: $voice_id"
  fi

  local payload
  payload=$(jq -n \
    --arg prompt "$description" \
    --arg pt "$ptext" \
    '{prompt: $prompt, preview_text: $pt}')
  if [[ -n "$voice_id" ]]; then
    payload=$(jq --arg vid "$voice_id" '. + {voice_id: $vid}' <<<"$payload")
  fi

  local response
  response="$(api_request POST voice_design "$payload")"

  # Prefer the caller's id; otherwise use the one returned by the API.
  local actual_voice_id="$voice_id"
  if [[ -z "$actual_voice_id" ]]; then
    actual_voice_id="$(jq -r '.voice_id // "unknown"' <<<"$response")"
  fi
  echo "Voice designed: $actual_voice_id"

  local trial_audio
  trial_audio="$(jq -r '.trial_audio // empty' <<<"$response")"
  if [[ -n "$trial_audio" ]]; then
    local pout="${preview_output:-${actual_voice_id}_preview.mp3}"
    hex_to_file "$trial_audio" "$pout"
    echo "Preview saved to: $pout"
  fi
}
# ============================================================================
# Subcommand: list-voices
# ============================================================================
# Subcommand "list-voices": print up to ten system voices (with a count of the
# rest), then every cloned and designed voice id. A failed fetch is tolerated:
# the matching section just reports that nothing could be shown.
cmd_list_voices() {
  local system_json cloned_json designed_json
  local total label listing
  local found_custom=false

  echo "=== System Voices ==="
  system_json="$(api_request POST voice/list '{"voice_type":"system"}' 2>/dev/null)" || true
  if [[ -z "$system_json" ]]; then
    echo " (Could not fetch system voices)"
  else
    total="$(jq '.voice_list | length' <<<"$system_json")" 2>/dev/null || total=0
    if ! [[ "$total" -gt 0 ]]; then
      echo " (None found)"
    else
      jq -r '.voice_list[:10][] | " \(.voice_id): \(.name // "N/A")"' <<<"$system_json"
      if [[ "$total" -gt 10 ]]; then
        echo " ... and $((total - 10)) more"
      fi
    fi
  fi

  echo ""
  echo "=== Custom Voices ==="
  # Both lists are fetched before anything is printed.
  cloned_json="$(api_request POST voice/list '{"voice_type":"voice_cloning"}' 2>/dev/null)" || true
  designed_json="$(api_request POST voice/list '{"voice_type":"voice_generation"}' 2>/dev/null)" || true

  for label in Cloned Designed; do
    if [[ "$label" == "Cloned" ]]; then
      listing="$cloned_json"
    else
      listing="$designed_json"
    fi
    [[ -n "$listing" ]] || continue
    total="$(jq '.voice_list | length' <<<"$listing")" 2>/dev/null || total=0
    if [[ "$total" -gt 0 ]]; then
      found_custom=true
      echo "$label ($total):"
      jq -r '.voice_list[] | " \(.voice_id)"' <<<"$listing"
    fi
  done

  if [[ "$found_custom" != true ]]; then
    echo " (None found)"
  fi
}
# ============================================================================
# Subcommand: validate
# ============================================================================
#######################################
# Subcommand "validate": check a segments JSON file without calling the API.
# Usage: cmd_validate SEGMENTS_FILE [--model NAME] [-v|--verbose]
#                     [--strict] [--validate-voices]
# The file must hold a JSON array of segments, or an object with a "segments"
# array. Every segment needs a non-empty "text" and "voice_id"; "emotion" is
# optional but must be exactly one of the known emotion names.
# --strict and --validate-voices are accepted but have no effect here.
# Outputs: a report on stdout (plus a per-segment summary with --verbose)
# Returns: 0 when all segments are valid, 1 otherwise
# Exits:   1 when the file is missing, is not valid JSON, or has no segments
#######################################
cmd_validate() {
  local segments_file="" model="speech-2.8-hd" verbose=false

  if [[ $# -gt 0 && "$1" != -* ]]; then
    segments_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model)
        # Without this check "$2" is unbound under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        model="$2"; shift 2
        ;;
      --strict) shift ;;            # accepted for compatibility; no effect
      -v|--verbose) verbose=true; shift ;;
      --validate-voices) shift ;;   # Not implemented in bash version
      *)
        if [[ -z "$segments_file" ]]; then
          segments_file="$1"
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$segments_file" || ! -f "$segments_file" ]]; then
    echo "Error: Segments file not found: ${segments_file:-<none>}" >&2
    exit 1
  fi

  local valid_emotions=(happy sad angry fearful disgusted surprised calm fluent whisper)
  echo "Validating: $segments_file"
  echo "Model: $model"
  echo "Valid emotions: ${valid_emotions[*]}"
  echo ""

  # Parse JSON
  local segments count
  segments="$(jq -r 'if type == "array" then . elif type == "object" and has("segments") then .segments else empty end' "$segments_file" 2>/dev/null)" || {
    echo "Error: Invalid JSON in $segments_file" >&2
    exit 1
  }
  if [[ -z "$segments" || "$segments" == "null" ]]; then
    echo "Error: No segments found in file" >&2
    exit 1
  fi
  count="$(jq 'length' <<<"$segments")"

  # Fields are kept per segment so the verbose summary needs no second parse.
  local texts=() voice_ids=() emotions=()
  local errors=0 i text voice_id emotion known emotion_ok
  for ((i = 0; i < count; i++)); do
    text="$(jq -r ".[$i].text // \"\"" <<<"$segments")"
    voice_id="$(jq -r ".[$i].voice_id // \"\"" <<<"$segments")"
    emotion="$(jq -r ".[$i].emotion // \"\"" <<<"$segments")"
    texts+=("$text")
    voice_ids+=("$voice_id")
    emotions+=("$emotion")

    if [[ -z "$text" ]]; then
      echo " - Segment $i: 'text' is required and must not be empty"
      errors=$((errors + 1))
    fi
    if [[ -z "$voice_id" ]]; then
      echo " - Segment $i: 'voice_id' is required"
      errors=$((errors + 1))
    fi
    if [[ -n "$emotion" ]]; then
      # Exact comparison: a grep word match treats the value as a regex, so
      # inputs like "hap.y" or "happy sad" would be accepted.
      emotion_ok=false
      for known in "${valid_emotions[@]}"; do
        if [[ "$known" == "$emotion" ]]; then
          emotion_ok=true
          break
        fi
      done
      if ! $emotion_ok; then
        echo " - Segment $i: invalid emotion '$emotion'"
        errors=$((errors + 1))
      fi
    fi
  done

  if [[ $errors -ne 0 ]]; then
    echo "Validation failed ($errors errors)"
    return 1
  fi

  echo "Validation passed: $count segments"
  if $verbose; then
    echo ""
    echo "=== Segment Summary ==="
    local elabel
    for ((i = 0; i < count; i++)); do
      # tr instead of ${var^^} so the summary also works on bash 3.2.
      elabel="$(printf '%s' "${emotions[$i]:-AUTO}" | tr '[:lower:]' '[:upper:]')"
      printf " %d: [%-10s] voice=%-20s \"%s\"\n" \
        "$i" "$elabel" "${voice_ids[$i]:0:20}" "${texts[$i]:0:40}"
    done
  fi
  return 0
}
# ============================================================================
# Subcommand: generate (multi-segment pipeline)
# ============================================================================
#######################################
# Subcommand "generate": synthesize every segment of a segments JSON file and
# merge the results into one audio file.
# Usage: cmd_generate SEGMENTS_FILE -o|--output FILE [--model NAME]
#                     [--crossfade MS] [--no-normalize] [--temp-dir DIR]
#                     [--continue-on-error]
# Each segment is requested from the t2a_v2 endpoint and stored as
# DIR/segment_NNNN.mp3 (DIR defaults to "tmp" next to the output file). A
# single resulting segment is copied; several are merged by
# _merge_audio_files.
# Error policy: without --continue-on-error the first failing segment aborts
# the run with exit status 1 and nothing is merged. With it, failed segments
# are skipped and the remaining ones are merged.
# Exits: 1 on usage errors, invalid JSON, no segments, or failed generation.
#######################################
cmd_generate() {
  local segments_file="" output="" model="speech-2.8-hd" crossfade=200
  local no_normalize=false temp_dir="" continue_on_error=false

  if [[ $# -gt 0 && "$1" != -* ]]; then
    segments_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output|--model|--crossfade|--temp-dir)
        # Without this check "$2" is unbound under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        ;;
    esac
    case "$1" in
      -o|--output) output="$2"; shift 2 ;;
      --model) model="$2"; shift 2 ;;
      --crossfade) crossfade="$2"; shift 2 ;;
      --no-normalize) no_normalize=true; shift ;;
      --temp-dir) temp_dir="$2"; shift 2 ;;
      --continue-on-error) continue_on_error=true; shift ;;
      *)
        if [[ -z "$segments_file" ]]; then
          segments_file="$1"
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$segments_file" || ! -f "$segments_file" ]]; then
    echo "Error: Segments file not found: ${segments_file:-<none>}" >&2
    exit 1
  fi
  if [[ -z "$output" ]]; then
    echo "Error: -o/--output is required" >&2
    exit 1
  fi

  # Validate first
  echo "Validating segments file..."
  local segments count
  segments="$(jq -r 'if type == "array" then . elif type == "object" and has("segments") then .segments else empty end' "$segments_file")" || {
    echo "Error: Invalid JSON in $segments_file" >&2
    exit 1
  }
  count="$(jq 'length' <<<"$segments")"
  if [[ -z "$count" || "$count" -eq 0 ]]; then
    echo "Error: No segments found" >&2
    exit 1
  fi
  echo "Found $count valid segments"
  echo ""

  # Setup temp dir: "<absolute output dir>/tmp", or "./tmp" when the output
  # directory does not exist yet.
  if [[ -z "$temp_dir" ]]; then
    local out_dir
    out_dir="$(cd "$(dirname "$output")" 2>/dev/null && pwd || echo ".")"
    temp_dir="$out_dir/tmp"
  fi
  mkdir -p "$temp_dir"
  echo "Temp directory: $temp_dir"

  # Generate each segment
  local succeeded=0 failed=0 i
  local segment_files=()
  local text voice_id emotion speed vol pitch
  local seg_name seg_output voice_setting payload response audio_hex
  for ((i = 0; i < count; i++)); do
    # `// ""` keeps a missing field from becoming the literal string "null".
    text="$(jq -r ".[$i].text // \"\"" <<<"$segments")"
    voice_id="$(jq -r ".[$i].voice_id // \"\"" <<<"$segments")"
    emotion="$(jq -r ".[$i].emotion // \"\"" <<<"$segments")"
    speed="$(jq -r ".[$i].speed // 1.0" <<<"$segments")"
    vol="$(jq -r ".[$i].volume // 1.0" <<<"$segments")"
    pitch="$(jq -r ".[$i].pitch // 0" <<<"$segments")"

    printf " Generating segment %d/%d: %s...\n" "$((i + 1))" "$count" "${text:0:40}"
    printf -v seg_name 'segment_%04d.mp3' "$i"
    seg_output="$temp_dir/$seg_name"

    if [[ -z "$text" || -z "$voice_id" ]]; then
      failed=$((failed + 1))
      echo " ✗ Error: segment $i is missing 'text' or 'voice_id'"
      if ! $continue_on_error; then break; fi
      continue
    fi

    # Build voice_setting; speed/volume/pitch must be JSON numbers.
    if ! voice_setting="$(jq -n \
      --arg vid "$voice_id" \
      --argjson spd "$speed" \
      --argjson vol "$vol" \
      --argjson pit "$pitch" \
      '{voice_id: $vid, speed: $spd, vol: $vol, pitch: $pit}' 2>/dev/null)"; then
      failed=$((failed + 1))
      echo " ✗ Error: segment $i has a non-numeric speed, volume or pitch"
      if ! $continue_on_error; then break; fi
      continue
    fi
    if [[ -n "$emotion" ]]; then
      voice_setting=$(jq --arg e "$emotion" '. + {emotion: $e}' <<<"$voice_setting")
    fi

    payload=$(jq -n \
      --arg model "$model" \
      --arg text "$text" \
      --argjson vs "$voice_setting" \
      '{
        model: $model,
        text: $text,
        voice_setting: $vs,
        audio_setting: {sample_rate: 32000, bitrate: 128000, format: "mp3", channel: 1},
        stream: false,
        output_format: "hex"
      }')

    # stderr is captured as well so the API error text can be shown inline.
    if response="$(api_request POST t2a_v2 "$payload" 2>&1)"; then
      audio_hex="$(jq -r '.data.audio // .extra_info.audio // empty' <<<"$response")"
      if [[ -n "$audio_hex" ]]; then
        hex_to_file "$audio_hex" "$seg_output"
        segment_files+=("$seg_output")
        succeeded=$((succeeded + 1))
        echo " ✓ Saved: $seg_output"
      else
        failed=$((failed + 1))
        echo " ✗ Error: No audio data in response"
        if ! $continue_on_error; then break; fi
      fi
    else
      failed=$((failed + 1))
      echo " ✗ Error: $response"
      if ! $continue_on_error; then break; fi
    fi
  done

  # Merging after an aborted run would write a truncated file and report
  # success, so stop here unless the caller opted into partial output.
  if [[ "$failed" -gt 0 ]] && ! $continue_on_error; then
    echo "Error: segment generation failed; nothing was merged (use --continue-on-error to merge the segments that succeed)" >&2
    echo " Intermediate files in: $temp_dir" >&2
    exit 1
  fi
  if [[ ${#segment_files[@]} -eq 0 ]]; then
    echo "Error: No segments were generated successfully" >&2
    exit 1
  fi

  # Merge segments
  ensure_dir "$(dirname "$output")"
  if [[ ${#segment_files[@]} -eq 1 ]]; then
    cp "${segment_files[0]}" "$output"
  else
    _merge_audio_files "$output" "$crossfade" "$no_normalize" "${segment_files[@]}"
  fi

  echo ""
  echo "Audio saved to: $output"
  echo " Generated: $succeeded/$count segments"
  echo ""
  echo " Intermediate files in: $temp_dir"
  echo " Delete with: rm -rf $temp_dir"
}
# ============================================================================
# Subcommand: merge
# ============================================================================
#######################################
# Subcommand "merge": join two or more audio files into one.
# Usage: cmd_merge FILE1 FILE2 [FILE...] -o|--output FILE [--crossfade MS]
#                  [--no-normalize] [--format FMT]
# --crossfade defaults to 300 ms. --format is accepted for CLI compatibility
# but is not used here: the encoding is decided by _merge_audio_files.
# Exits: 1 when fewer than two inputs are given, the output is missing, an
#        input file does not exist, or an option lacks its value.
#######################################
cmd_merge() {
  local output="" crossfade=300 normalize=true
  local input_files=()
  local f

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output|--format|--crossfade)
        # Without this check "$2" is unbound under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        ;;
    esac
    case "$1" in
      -o|--output) output="$2"; shift 2 ;;
      --format) shift 2 ;;   # parsed and ignored, see header
      --crossfade) crossfade="$2"; shift 2 ;;
      --no-normalize) normalize=false; shift ;;
      *) input_files+=("$1"); shift ;;
    esac
  done

  if [[ ${#input_files[@]} -lt 2 ]]; then
    echo "Error: At least 2 input files required" >&2
    exit 1
  fi
  if [[ -z "$output" ]]; then
    echo "Error: -o/--output is required" >&2
    exit 1
  fi
  for f in "${input_files[@]}"; do
    if [[ ! -f "$f" ]]; then
      echo "Error: File not found: $f" >&2
      exit 1
    fi
  done

  echo "Merging ${#input_files[@]} files..."
  # _merge_audio_files takes the inverted flag as the string "true"/"false".
  local no_norm="false"
  if ! $normalize; then
    no_norm="true"
  fi
  _merge_audio_files "$output" "$crossfade" "$no_norm" "${input_files[@]}"
  echo "Merged audio saved to: $output"
}
#######################################
# Merge audio files with ffmpeg.
# Usage: _merge_audio_files OUTPUT CROSSFADE_MS NO_NORMALIZE FILE1 [FILE2 ...]
# Arguments: $1  - output file path
#            $2  - crossfade length in milliseconds (non-negative integer)
#            $3  - "true" to skip loudness normalization
#            $4+ - input files, in playback order
# Strategy: with a positive crossfade and at least two inputs, the files are
# chained through acrossfade filters. If that ffmpeg run fails, or crossfading
# is not requested, the files are joined with the concat demuxer instead.
# Loudness normalization (loudnorm) is applied unless NO_NORMALIZE is "true".
# Returns: 0 on success; 1 on invalid arguments; ffmpeg's status when the
#          fallback merge fails. Temporary files are removed either way.
#######################################
_merge_audio_files() {
  local output="$1" crossfade_ms="$2" no_normalize="$3"
  shift 3
  local files=("$@")
  local n=${#files[@]}
  local i f

  if [[ "$n" -eq 0 ]]; then
    echo "Error: no input files to merge" >&2
    return 1
  fi
  if [[ ! "$crossfade_ms" =~ ^[0-9]+$ ]]; then
    echo "Error: crossfade must be a non-negative integer (milliseconds), got '$crossfade_ms'" >&2
    return 1
  fi

  ensure_dir "$(dirname "$output")"

  if (( 10#$crossfade_ms > 0 && n >= 2 )); then
    # Milliseconds -> seconds with three decimals, using integer arithmetic
    # only (no dependency on bc).
    local crossfade_sec
    printf -v crossfade_sec '%d.%03d' \
      "$(( 10#$crossfade_ms / 1000 ))" "$(( 10#$crossfade_ms % 1000 ))"

    # Bring every input to the same sample rate/format/layout first.
    local inputs=() filter_parts=()
    for ((i = 0; i < n; i++)); do
      inputs+=(-i "${files[$i]}")
      filter_parts+=("[${i}:a]aresample=32000,aformat=sample_fmts=fltp:channel_layouts=mono[s${i}]")
    done

    # Build acrossfade chain: [s0][s1]->[m1], [m1][s2]->[m2], ...; the last
    # link is labelled [merged].
    local prev="s0" link
    for ((i = 1; i < n; i++)); do
      if (( i == n - 1 )); then
        link="merged"
      else
        link="m${i}"
      fi
      filter_parts+=("[${prev}][s${i}]acrossfade=d=${crossfade_sec}[${link}]")
      prev="$link"
    done

    local final_filter="[merged]aformat=sample_fmts=fltp"
    if [[ "$no_normalize" != "true" ]]; then
      final_filter+=",loudnorm=I=-16:TP=-1.5:LRA=11"
    fi
    final_filter+="[final]"
    filter_parts+=("$final_filter")

    local filter_complex
    filter_complex="$(IFS=';'; echo "${filter_parts[*]}")"

    if ffmpeg -y "${inputs[@]}" \
      -filter_complex "$filter_complex" \
      -map "[final]" \
      -ar 32000 -ac 1 -acodec libmp3lame \
      "$output" 2>/dev/null; then
      return 0
    fi
    echo " Crossfade merge failed, falling back to concat demuxer..." >&2
  fi

  # Fallback: concat demuxer (no crossfade). All scratch files live in one
  # private directory so cleanup is a single rm on every path.
  local work_dir
  work_dir="$(mktemp -d "${TMPDIR:-/tmp}/gv_concat_XXXXXX")" || {
    echo "Error: could not create a temporary directory" >&2
    return 1
  }
  local concat_file="$work_dir/list.txt"
  local tmp_concat="$work_dir/joined.mp3"

  # The list uses single-quoted paths; a quote inside a path is written as
  # '\'' (close quote, escaped quote, reopen).
  local sq="'" esc="'\\''" abs_path
  for f in "${files[@]}"; do
    abs_path="$(cd "$(dirname "$f")" && pwd)/$(basename "$f")"
    abs_path=${abs_path//$sq/$esc}
    printf "file '%s'\n" "$abs_path" >> "$concat_file"
  done

  local rc=0
  if [[ "$no_normalize" != "true" ]]; then
    ffmpeg -y -f concat -safe 0 -i "$concat_file" -c copy "$tmp_concat" 2>/dev/null || rc=$?
    if [[ "$rc" -eq 0 ]]; then
      ffmpeg -y -i "$tmp_concat" -af "loudnorm=I=-16:TP=-1.5:LRA=11" -acodec libmp3lame "$output" 2>/dev/null || rc=$?
    fi
  else
    ffmpeg -y -f concat -safe 0 -i "$concat_file" -c copy "$output" 2>/dev/null || rc=$?
  fi

  rm -rf -- "${work_dir:?}"
  if [[ "$rc" -ne 0 ]]; then
    echo "Error: ffmpeg failed to merge audio files (exit code $rc)" >&2
  fi
  return "$rc"
}
# ============================================================================
# Subcommand: convert
# ============================================================================
#######################################
# Subcommand "convert": re-encode an audio file with ffmpeg.
# Usage: cmd_convert INPUT -o|--output FILE [--format FMT] [--sample-rate HZ]
#                    [--bitrate RATE] [--channels N]
# Codec selection: --format wins when given. Otherwise the format is taken
# from the output file's extension if it is mp3, wav, flac, ogg, aac or m4a,
# and defaults to mp3 for anything else. An unrecognized --format value
# copies the audio stream unchanged.
# Exits: 1 on usage errors, a missing input file, or an ffmpeg failure.
#######################################
cmd_convert() {
  local input_file="" output="" format="" sample_rate="" bitrate="" channels=""

  if [[ $# -gt 0 && "$1" != -* ]]; then
    input_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output|--format|--sample-rate|--bitrate|--channels)
        # Without this check "$2" is unbound under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        ;;
    esac
    case "$1" in
      -o|--output) output="$2"; shift 2 ;;
      --format) format="$2"; shift 2 ;;
      --sample-rate) sample_rate="$2"; shift 2 ;;
      --bitrate) bitrate="$2"; shift 2 ;;
      --channels) channels="$2"; shift 2 ;;
      *)
        if [[ -z "$input_file" ]]; then
          input_file="$1"
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$input_file" || ! -f "$input_file" ]]; then
    echo "Error: Input file not found: ${input_file:-<none>}" >&2
    exit 1
  fi
  if [[ -z "$output" ]]; then
    echo "Error: -o/--output is required" >&2
    exit 1
  fi

  # Without --format, follow the output extension so that "-o out.wav" is not
  # filled with MP3-encoded audio.
  if [[ -z "$format" ]]; then
    local ext
    ext="$(printf '%s' "${output##*.}" | tr '[:upper:]' '[:lower:]')"
    case "$ext" in
      mp3|wav|flac|ogg|aac|m4a) format="$ext" ;;
      *) format="mp3" ;;
    esac
  fi

  ensure_dir "$(dirname "$output")"

  # Determine codec
  local codec
  case "$format" in
    mp3) codec="libmp3lame" ;;
    wav) codec="pcm_s16le" ;;
    flac) codec="flac" ;;
    ogg) codec="libvorbis" ;;
    aac|m4a) codec="aac" ;;
    *) codec="copy" ;;
  esac

  local args=(-y -i "$input_file" -acodec "$codec")
  if [[ -n "$sample_rate" ]]; then args+=(-ar "$sample_rate"); fi
  if [[ -n "$channels" ]]; then args+=(-ac "$channels"); fi
  if [[ -n "$bitrate" ]]; then args+=(-b:a "$bitrate"); fi
  args+=("$output")

  echo "Converting $input_file to $format..."
  # ffmpeg's own output is silenced, so a failure needs an explicit message.
  if ! ffmpeg "${args[@]}" 2>/dev/null; then
    echo "Error: ffmpeg failed to convert $input_file" >&2
    exit 1
  fi
  echo "Converted audio saved to: $output"
}
# ============================================================================
# Subcommand: check-env
# ============================================================================
# Subcommand "check-env": run check_environment.sh from the directory above
# SCRIPT_DIR with the given arguments; exits 1 if that script is not present.
cmd_check_env() {
  local env_checker="$SCRIPT_DIR/../check_environment.sh"
  if [[ ! -f "$env_checker" ]]; then
    echo "check_environment.sh not found" >&2
    exit 1
  fi
  bash "$env_checker" "$@"
}
# ============================================================================
# Main dispatcher
# ============================================================================
# Print the command overview and example invocations on stdout.
usage() {
  # One argument per output line; printf repeats the format for each.
  printf '%s\n' \
    'MiniMax Voice CLI — Unified TTS interface' \
    'Usage:' \
    'generate_voice.sh <command> [options]' \
    'Commands:' \
    'tts Basic text-to-speech' \
    'clone Clone voice from audio sample' \
    'design Design voice from description' \
    'list-voices List available voices' \
    'validate Validate segments.json file' \
    'generate Generate audio from segments.json' \
    'merge Merge multiple audio files' \
    'convert Convert audio format' \
    'check-env Check environment setup' \
    'Examples:' \
    'generate_voice.sh tts "Hello world" -o hello.mp3' \
    'generate_voice.sh tts "你好" -v female-shaonv -o hello_cn.mp3' \
    'generate_voice.sh clone my_voice.mp3 --voice-id my-custom-voice' \
    'generate_voice.sh design "A warm female voice" --voice-id narrator-1' \
    'generate_voice.sh list-voices' \
    'generate_voice.sh validate segments.json --verbose' \
    'generate_voice.sh generate segments.json -o output.mp3' \
    'generate_voice.sh merge part1.mp3 part2.mp3 -o combined.mp3' \
    'generate_voice.sh convert input.wav -o output.mp3' \
    'generate_voice.sh check-env --test-api'
}
# Entry dispatcher: loads .env, then routes the first argument to the matching
# cmd_* function with the remaining arguments. With no arguments it prints the
# usage text and exits 0; an unknown command prints usage on stderr and
# exits 1.
main() {
  load_env

  if (( $# == 0 )); then
    usage
    exit 0
  fi

  local command="$1"
  shift

  case "$command" in
    # Commands that call the API need the key checked first.
    tts|clone|design|list-voices|generate)
      check_api_key
      "cmd_${command//-/_}" "$@"
      ;;
    # Local-only commands.
    validate|merge|convert|check-env)
      "cmd_${command//-/_}" "$@"
      ;;
    -h|--help|help)
      usage
      ;;
    *)
      echo "Unknown command: $command" >&2
      usage >&2
      exit 1
      ;;
  esac
}
# Script entry point: pass every command-line argument to the dispatcher.
main "$@"