#!/usr/bin/env bash
# MiniMax Voice CLI — Unified TTS command-line interface (pure bash)
#
# Usage:
#   bash scripts/tts/generate_voice.sh tts "Hello world" -o hello.mp3
#   bash scripts/tts/generate_voice.sh clone my_voice.mp3 --voice-id my-custom-voice
#   bash scripts/tts/generate_voice.sh design "A gentle female voice" --voice-id designed-voice-1
#   bash scripts/tts/generate_voice.sh list-voices
#   bash scripts/tts/generate_voice.sh validate segments.json
#   bash scripts/tts/generate_voice.sh generate segments.json -o output.mp3
#   bash scripts/tts/generate_voice.sh merge file1.mp3 file2.mp3 -o combined.mp3
#   bash scripts/tts/generate_voice.sh convert input.wav -o output.mp3
#   bash scripts/tts/generate_voice.sh check-env
#
# Environment (may also be supplied through a .env file, see load_env):
#   MINIMAX_API_KEY   required by the API-backed subcommands
#   MINIMAX_API_HOST  optional API host override (default: https://api.minimaxi.com)
#
# External tools used: curl, jq, xxd, ffmpeg.

set -euo pipefail

# ============================================================================
# Configuration
# ============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# ============================================================================
# Common functions
# ============================================================================

# die LINE...
# Print each argument on its own line to stderr and exit with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# need_value OPTION REMAINING_ARG_COUNT
# Abort with a readable message when OPTION is the last argument, instead of
# letting `set -u` fail on an unbound "$2".
need_value() {
  if (( $2 < 2 )); then
    die "Error: option $1 requires a value"
  fi
}

# _trim STRING
# Print STRING without leading/trailing whitespace (no external processes).
_trim() {
  local s="$1"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

# load_env
# Export KEY=VALUE pairs from the first .env file found in $PROJECT_ROOT or the
# current directory. Variables that are already non-empty in the environment
# are left untouched. Always returns 0.
#
# Accepted syntax: blank lines, full-line "# comments", an optional "export "
# prefix, single- or double-quoted values, and " # trailing comments" after
# unquoted values. Lines whose key is not a valid shell identifier are skipped.
load_env() {
  local env_file line key val quote rest
  for env_file in "$PROJECT_ROOT/.env" "$(pwd)/.env"; do
    if [[ ! -f "$env_file" ]]; then
      continue
    fi
    # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
      line="$(_trim "$line")"
      if [[ -z "$line" || "$line" == \#* ]]; then
        continue
      fi
      if [[ "$line" =~ ^export[[:space:]]+(.*)$ ]]; then
        line="${BASH_REMATCH[1]}"
      fi
      if [[ "$line" != *=* ]]; then
        continue
      fi
      key="$(_trim "${line%%=*}")"
      val="$(_trim "${line#*=}")"
      # ${!key} below aborts the script on an invalid name, so filter first.
      if [[ ! "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
        continue
      fi
      quote="${val:0:1}"
      if [[ "$quote" == '"' || "$quote" == "'" ]] && [[ "${val:1}" == *"$quote"* ]]; then
        # Quoted value: keep everything up to the closing quote verbatim.
        rest="${val:1}"
        val="${rest%%"$quote"*}"
      else
        # Unquoted value: a '#' only starts a comment when preceded by whitespace.
        val="${val%%[[:space:]]\#*}"
        val="$(_trim "$val")"
      fi
      # Only set if not already in environment
      if [[ -z "${!key:-}" ]]; then
        export "$key=$val"
      fi
    done < "$env_file"
    return 0
  done
  return 0
}

# check_api_key
# Exit with status 1 unless MINIMAX_API_KEY is set and non-empty.
check_api_key() {
  if [[ -z "${MINIMAX_API_KEY:-}" ]]; then
    die "Error: MINIMAX_API_KEY environment variable is not set" \
        "  export MINIMAX_API_KEY='your-key'"
  fi
}

# ensure_dir DIR
# Create DIR (and parents) when DIR is non-empty.
ensure_dir() {
  local dir="$1"
  if [[ -n "$dir" ]]; then
    mkdir -p -- "$dir"
  fi
}

# _abs_path FILE
# Print FILE with its directory part resolved to an absolute path.
_abs_path() {
  printf '%s/%s' "$(cd "$(dirname "$1")" && pwd)" "$(basename "$1")"
}

# set_api_base
# (Re)compute API_BASE from MINIMAX_API_HOST. Called once at load time and
# again from main after load_env, so a host defined only in .env is honoured.
set_api_base() {
  local host="${MINIMAX_API_HOST:-https://api.minimaxi.com}"
  API_BASE="${host%/}/v1"
}
set_api_base

# _handle_api_output RAW
# RAW is a response body followed by "\n<http status>" (curl -w output).
# Prints the body on success. Exits 1 on HTTP status >= 400 or when the body
# carries a non-zero .base_resp.status_code. A body jq cannot parse is passed
# through unchanged.
_handle_api_output() {
  local raw="$1" http_code response status_code status_msg
  http_code="${raw##*$'\n'}"
  response="${raw%$'\n'*}"

  if [[ "$http_code" =~ ^[0-9]+$ ]] && (( 10#$http_code >= 400 )); then
    die "Error: API returned HTTP $http_code" "$response"
  fi

  # Check API-level error
  status_code="$(printf '%s' "$response" \
    | jq -r '.base_resp.status_code // 0' 2>/dev/null)" || status_code=""
  if [[ -n "$status_code" && "$status_code" != "0" ]]; then
    status_msg="$(printf '%s' "$response" \
      | jq -r '.base_resp.status_msg // "Unknown error"')"
    die "Error: API error [$status_code]: $status_msg"
  fi

  printf '%s\n' "$response"
}

# api_request METHOD ENDPOINT [JSON_BODY]
# Send a request to ${API_BASE}/ENDPOINT with bearer auth and print the raw
# JSON response on stdout. Exits 1 on transport, HTTP or API-level errors.
api_request() {
  local method="$1" endpoint="$2" body="${3:-}"
  local url="${API_BASE}/${endpoint#/}"
  local -a args=(
    -sS -w "\n%{http_code}"
    -X "$method"
    -H "Authorization: Bearer ${MINIMAX_API_KEY}"
    -H "Accept-Encoding: gzip, deflate"
    --compressed
    --max-time 120
  )
  if [[ -n "$body" ]]; then
    args+=(-H "Content-Type: application/json" -d "$body")
  fi
  args+=("$url")

  local raw
  raw="$(curl "${args[@]}")" || die "Error: curl request failed"
  _handle_api_output "$raw"
}

# api_upload ENDPOINT FILE_PATH PURPOSE
# Multipart-upload FILE_PATH to ${API_BASE}/ENDPOINT with the given purpose and
# print the raw JSON response. Error handling matches api_request.
api_upload() {
  local endpoint="$1" file_path="$2" purpose="$3"
  local url="${API_BASE}/${endpoint#/}"

  local raw
  raw="$(curl -sS -w "\n%{http_code}" \
    -X POST \
    -H "Authorization: Bearer ${MINIMAX_API_KEY}" \
    -H "Accept-Encoding: gzip, deflate" \
    --compressed \
    -F "file=@${file_path}" \
    -F "purpose=${purpose}" \
    --max-time 120 \
    "$url")" || die "Error: curl upload failed"
  _handle_api_output "$raw"
}

# hex_to_file HEX_STRING OUTPUT_PATH
# Decode a plain hex string with xxd and write the bytes to OUTPUT_PATH,
# creating the parent directory if needed.
hex_to_file() {
  local hex="$1" output="$2"
  ensure_dir "$(dirname "$output")"
  printf '%s\n' "$hex" | xxd -r -p > "$output"
}

# ============================================================================
# Subcommand: tts
# ============================================================================

# cmd_tts TEXT [-v VOICE] [-o FILE] [--model M] [--speed N] [--volume N]
#         [--pitch N] [--emotion E] [--format F] [--sample-rate N]
#         [--language-boost L]
# Synthesize TEXT through the t2a_v2 endpoint. With -o the decoded audio is
# written to FILE; without it only the size of the returned hex is reported.
cmd_tts() {
  local text="" voice_id="male-qn-qingse" output="" model="speech-2.8-hd"
  local speed=1.0 volume=1.0 pitch=0 emotion="" audio_format="mp3"
  local sample_rate=32000 language_boost=""

  # First positional arg is text
  if [[ $# -gt 0 && "$1" != -* ]]; then
    text="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -v|--voice-id)    need_value "$1" "$#"; voice_id="$2"; shift 2 ;;
      -o|--output)      need_value "$1" "$#"; output="$2"; shift 2 ;;
      --model)          need_value "$1" "$#"; model="$2"; shift 2 ;;
      --speed)          need_value "$1" "$#"; speed="$2"; shift 2 ;;
      --volume)         need_value "$1" "$#"; volume="$2"; shift 2 ;;
      --pitch)          need_value "$1" "$#"; pitch="$2"; shift 2 ;;
      --emotion)        need_value "$1" "$#"; emotion="$2"; shift 2 ;;
      --format)         need_value "$1" "$#"; audio_format="$2"; shift 2 ;;
      --sample-rate)    need_value "$1" "$#"; sample_rate="$2"; shift 2 ;;
      --language-boost) need_value "$1" "$#"; language_boost="$2"; shift 2 ;;
      *)                text="$1"; shift ;;
    esac
  done

  if [[ -z "$text" ]]; then
    die "Error: text is required" \
        "Usage: $(basename "$0") tts \"Text to speak\" -o output.mp3"
  fi

  # Build the whole payload in one jq call; emotion and language_boost are
  # only included when non-empty.
  local payload
  payload="$(jq -n \
    --arg model "$model" \
    --arg text "$text" \
    --arg vid "$voice_id" \
    --argjson spd "$speed" \
    --argjson vol "$volume" \
    --argjson pit "$pitch" \
    --arg emotion "$emotion" \
    --arg fmt "$audio_format" \
    --argjson sr "$sample_rate" \
    --arg lb "$language_boost" \
    '{
      model: $model,
      text: $text,
      voice_setting: ({voice_id: $vid, speed: $spd, vol: $vol, pitch: $pit}
        + (if $emotion != "" then {emotion: $emotion} else {} end)),
      audio_setting: {sample_rate: $sr, bitrate: 128000, format: $fmt, channel: 1},
      stream: false,
      subtitle_enable: false,
      output_format: "hex"
    } + (if $lb != "" then {language_boost: $lb} else {} end)')"

  echo "Synthesizing: ${text:0:50}..."

  local response
  response="$(api_request POST t2a_v2 "$payload")"

  # Extract hex audio
  local audio_hex
  audio_hex="$(printf '%s' "$response" \
    | jq -r '.data.audio // .extra_info.audio // empty')"

  if [[ -z "$audio_hex" ]]; then
    die "Error: No audio data returned from API"
  fi

  if [[ -n "$output" ]]; then
    hex_to_file "$audio_hex" "$output"
    echo "Done: $output"
  else
    echo "Generated ${#audio_hex} hex chars of audio"
  fi
}

# ============================================================================
# Subcommand: clone
# ============================================================================

# cmd_clone AUDIO_FILE --voice-id ID [--preview TEXT] [--preview-output FILE]
# Upload AUDIO_FILE, register it as voice ID via voice_clone, and optionally
# synthesize a preview with the new voice.
cmd_clone() {
  local audio_file="" voice_id="" preview_text="" preview_output=""

  # First positional arg is audio file
  if [[ $# -gt 0 && "$1" != -* ]]; then
    audio_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --voice-id)       need_value "$1" "$#"; voice_id="$2"; shift 2 ;;
      --preview)        need_value "$1" "$#"; preview_text="$2"; shift 2 ;;
      --preview-output) need_value "$1" "$#"; preview_output="$2"; shift 2 ;;
      *)
        if [[ -z "$audio_file" ]]; then audio_file="$1"; fi
        shift
        ;;
    esac
  done

  if [[ -z "$audio_file" ]]; then
    die "Error: audio file is required" \
        "Usage: $(basename "$0") clone audio.mp3 --voice-id my-voice"
  fi
  if [[ ! -f "$audio_file" ]]; then
    die "Error: Audio file not found: $audio_file"
  fi
  if [[ -z "$voice_id" ]]; then
    die "Error: --voice-id is required"
  fi

  echo "Cloning voice from: $audio_file"
  echo "Voice ID: $voice_id"

  # Step 1: Upload audio
  local upload_response file_id
  upload_response="$(api_upload files/upload "$audio_file" voice_clone)"
  file_id="$(printf '%s' "$upload_response" \
    | jq -r '.file.file_id // .file_id // empty')"

  if [[ -z "$file_id" ]]; then
    die "Error: Upload succeeded but no file_id was returned"
  fi

  # Step 2: Clone voice. A numeric file_id is sent as a JSON number (as
  # before); anything else is sent as a string instead of crashing jq.
  local clone_payload
  if [[ "$file_id" =~ ^[0-9]+$ ]]; then
    clone_payload="$(jq -n --arg vid "$voice_id" --argjson fid "$file_id" \
      '{voice_id: $vid, file_id: $fid}')"
  else
    clone_payload="$(jq -n --arg vid "$voice_id" --arg fid "$file_id" \
      '{voice_id: $vid, file_id: $fid}')"
  fi

  api_request POST voice_clone "$clone_payload" > /dev/null
  echo "Voice cloned successfully: $voice_id"

  # Step 3: Preview if requested
  if [[ -n "$preview_text" ]]; then
    echo "Generating preview..."
    local pout="${preview_output:-${voice_id}_preview.mp3}"
    cmd_tts "$preview_text" -v "$voice_id" -o "$pout"
    echo "Preview saved to: $pout"
  fi
}

# ============================================================================
# Subcommand: design
# ============================================================================

# cmd_design DESCRIPTION [--voice-id ID] [--preview TEXT] [--preview-output FILE]
# Create a voice from a text description via voice_design and save the trial
# audio returned by the API, if any.
cmd_design() {
  local description="" voice_id="" preview_text="" preview_output=""

  if [[ $# -gt 0 && "$1" != -* ]]; then
    description="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --voice-id)       need_value "$1" "$#"; voice_id="$2"; shift 2 ;;
      --preview)        need_value "$1" "$#"; preview_text="$2"; shift 2 ;;
      --preview-output) need_value "$1" "$#"; preview_output="$2"; shift 2 ;;
      *)
        if [[ -z "$description" ]]; then description="$1"; fi
        shift
        ;;
    esac
  done

  if [[ -z "$description" ]]; then
    # Quotes inside $(...) must not be escaped, or basename receives them
    # literally and the message shows a stray '"'.
    die "Error: description is required" \
        "Usage: $(basename "$0") design \"A warm female voice\" --voice-id narrator"
  fi

  local ptext="${preview_text:-This is a preview of the designed voice.}"

  echo "Designing voice from: \"$description\""
  if [[ -n "$voice_id" ]]; then
    echo "Voice ID: $voice_id"
  fi

  local payload
  payload="$(jq -n \
    --arg prompt "$description" \
    --arg pt "$ptext" \
    --arg vid "$voice_id" \
    '{prompt: $prompt, preview_text: $pt}
     + (if $vid != "" then {voice_id: $vid} else {} end)')"

  local response
  response="$(api_request POST voice_design "$payload")"

  local actual_voice_id
  actual_voice_id="${voice_id:-$(printf '%s' "$response" | jq -r '.voice_id // "unknown"')}"
  echo "Voice designed: $actual_voice_id"

  local trial_audio
  trial_audio="$(printf '%s' "$response" | jq -r '.trial_audio // empty')"
  if [[ -n "$trial_audio" ]]; then
    local pout="${preview_output:-${actual_voice_id}_preview.mp3}"
    hex_to_file "$trial_audio" "$pout"
    echo "Preview saved to: $pout"
  fi
}

# ============================================================================
# Subcommand: list-voices
# ============================================================================

# _voice_count JSON
# Print the length of .voice_list in JSON, or 0 when it cannot be determined.
_voice_count() {
  local n
  n="$(printf '%s' "$1" | jq '.voice_list | length' 2>/dev/null)" || n=0
  if [[ ! "$n" =~ ^[0-9]+$ ]]; then
    n=0
  fi
  printf '%s' "$n"
}

# cmd_list_voices
# Print the first ten system voices plus all cloned and designed voices.
# Listing is best-effort: a failed request is reported inline, not fatal.
cmd_list_voices() {
  echo "=== System Voices ==="

  local sys_response count
  sys_response="$(api_request POST voice/list '{"voice_type":"system"}' 2>/dev/null)" || true

  if [[ -n "$sys_response" ]]; then
    count="$(_voice_count "$sys_response")"
    if (( count > 0 )); then
      printf '%s' "$sys_response" \
        | jq -r '.voice_list[:10][] | "  \(.voice_id): \(.name // "N/A")"'
      if (( count > 10 )); then
        echo "  ... and $((count - 10)) more"
      fi
    else
      echo "  (None found)"
    fi
  else
    echo "  (Could not fetch system voices)"
  fi

  echo ""
  echo "=== Custom Voices ==="

  local has_custom=false voice_type label response n
  for voice_type in voice_cloning voice_generation; do
    if [[ "$voice_type" == "voice_cloning" ]]; then
      label="Cloned"
    else
      label="Designed"
    fi
    response="$(api_request POST voice/list "{\"voice_type\":\"$voice_type\"}" 2>/dev/null)" || true
    if [[ -z "$response" ]]; then
      continue
    fi
    n="$(_voice_count "$response")"
    if (( n > 0 )); then
      has_custom=true
      echo "$label ($n):"
      printf '%s' "$response" | jq -r '.voice_list[] | "  \(.voice_id)"'
    fi
  done

  if ! $has_custom; then
    echo "  (None found)"
  fi
}

# ============================================================================
# Subcommand: validate
# ============================================================================

# _read_segments FILE
# Print the segment array of FILE: either the top-level array itself or the
# "segments" member of a top-level object. Prints nothing for other shapes.
# Returns jq's status, so invalid JSON yields a non-zero status.
_read_segments() {
  jq -r 'if type == "array" then .
         elif type == "object" and has("segments") then .segments
         else empty end' "$1"
}

# cmd_validate SEGMENTS_FILE [--model M] [--strict] [-v|--verbose]
#              [--validate-voices]
# Check the structure of a segments file and exit 1 if any error is found.
#
# NOTE(review): the per-segment loop below is a reconstruction; the original
# loop body was truncated in the file this was restored from. The checks
# (object type, non-empty text, known emotion, voice_id presence), the meaning
# of --strict (warnings become errors) and of --verbose (print each segment)
# are assumptions to confirm against the upstream script.
cmd_validate() {
  local segments_file="" model="speech-2.8-hd" strict=false verbose=false

  if [[ $# -gt 0 && "$1" != -* ]]; then
    segments_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model)           need_value "$1" "$#"; model="$2"; shift 2 ;;
      --strict)          strict=true; shift ;;
      -v|--verbose)      verbose=true; shift ;;
      --validate-voices) shift ;;  # Not implemented in bash version
      *)
        if [[ -z "$segments_file" ]]; then segments_file="$1"; fi
        shift
        ;;
    esac
  done

  if [[ -z "$segments_file" || ! -f "$segments_file" ]]; then
    die "Error: Segments file not found: ${segments_file:-<none>}"
  fi

  echo "Validating: $segments_file"
  echo "Model: $model"

  local valid_emotions="happy sad angry fearful disgusted surprised calm fluent whisper"
  echo "Valid emotions: $valid_emotions"
  echo ""

  # Parse JSON
  local segments count
  segments="$(_read_segments "$segments_file" 2>/dev/null)" \
    || die "Error: Invalid JSON in $segments_file"

  if [[ -z "$segments" || "$segments" == "null" ]]; then
    die "Error: No segments found in file"
  fi

  count="$(printf '%s' "$segments" | jq 'length')"

  local errors=0 warnings=0
  local i kind text voice emotion
  for ((i = 0; i < count; i++)); do
    kind="$(printf '%s' "$segments" | jq -r --argjson i "$i" '.[$i] | type')"
    if [[ "$kind" != "object" ]]; then
      echo "  [$i] Error: segment must be an object (got $kind)"
      errors=$((errors + 1))
      continue
    fi

    text="$(printf '%s' "$segments" | jq -r --argjson i "$i" '.[$i].text // ""')"
    voice="$(printf '%s' "$segments" | jq -r --argjson i "$i" '.[$i].voice_id // ""')"
    emotion="$(printf '%s' "$segments" | jq -r --argjson i "$i" '.[$i].emotion // ""')"

    if $verbose; then
      echo "  [$i] voice_id=${voice:-<none>} emotion=${emotion:-<none>} text=${text:0:50}"
    fi

    if [[ -z "$text" ]]; then
      echo "  [$i] Error: \"text\" is missing or empty"
      errors=$((errors + 1))
    fi
    if [[ -n "$emotion" && " $valid_emotions " != *" $emotion "* ]]; then
      echo "  [$i] Error: invalid emotion \"$emotion\""
      errors=$((errors + 1))
    fi
    if [[ -z "$voice" ]]; then
      echo "  [$i] Warning: \"voice_id\" is missing"
      warnings=$((warnings + 1))
    fi
  done

  if $strict; then
    errors=$((errors + warnings))
  fi

  echo ""
  if (( errors > 0 )); then
    die "Validation failed: $errors problem(s) in $count segment(s)"
  fi
  echo "Validation passed: $count segment(s), $warnings warning(s)"
}

# ============================================================================
# Subcommand: generate
# ============================================================================

# cmd_generate SEGMENTS_FILE -o OUTPUT [options]
# Synthesize every segment into a temp directory, then merge the results into
# OUTPUT. Intermediate files are kept and their location is printed.
#
# NOTE(review): the option parsing and the head of the per-segment loop are a
# reconstruction; the original text was truncated in the file this was
# restored from. The flag spellings (--temp-dir, --crossfade, --no-normalize,
# --continue-on-error, --model), the crossfade default of 0, the per-segment
# field names and the segment file naming are assumptions to confirm against
# the upstream script. Everything from the response handling onward follows
# the surviving original.
cmd_generate() {
  local segments_file="" output="" model="speech-2.8-hd" temp_dir=""
  local crossfade=0 no_normalize="false" continue_on_error=false

  if [[ $# -gt 0 && "$1" != -* ]]; then
    segments_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output)         need_value "$1" "$#"; output="$2"; shift 2 ;;
      --model)             need_value "$1" "$#"; model="$2"; shift 2 ;;
      --temp-dir)          need_value "$1" "$#"; temp_dir="$2"; shift 2 ;;
      --crossfade)         need_value "$1" "$#"; crossfade="$2"; shift 2 ;;
      --no-normalize)      no_normalize="true"; shift ;;
      --continue-on-error) continue_on_error=true; shift ;;
      *)
        if [[ -z "$segments_file" ]]; then segments_file="$1"; fi
        shift
        ;;
    esac
  done

  if [[ -z "$segments_file" || ! -f "$segments_file" ]]; then
    die "Error: Segments file not found: ${segments_file:-<none>}"
  fi
  if [[ -z "$output" ]]; then
    die "Error: -o/--output is required"
  fi

  # Validate first
  echo "Validating segments file..."
  local segments count
  segments="$(_read_segments "$segments_file")" \
    || die "Error: Invalid JSON in $segments_file"
  count="$(printf '%s' "$segments" | jq 'length')"

  if [[ -z "$count" || "$count" -eq 0 ]]; then
    die "Error: No segments found"
  fi
  echo "Found $count valid segments"
  echo ""

  # Setup temp dir: defaults to "tmp" next to the output file.
  if [[ -z "$temp_dir" ]]; then
    local out_dir
    out_dir="$(dirname "$output")"
    ensure_dir "$out_dir"
    temp_dir="$(cd "$out_dir" && pwd)/tmp"
  fi
  mkdir -p -- "$temp_dir"
  echo "Temp directory: $temp_dir"

  # Generate each segment
  local succeeded=0 failed=0
  local -a segment_files=()
  local i seg_text seg_output payload response audio_hex
  for ((i = 0; i < count; i++)); do
    seg_text="$(printf '%s' "$segments" | jq -r --argjson i "$i" '.[$i].text // ""')"
    printf -v seg_output '%s/segment_%04d.mp3' "$temp_dir" "$i"
    echo "[$((i + 1))/$count] ${seg_text:0:50}..."

    payload="$(printf '%s' "$segments" | jq -c \
      --argjson i "$i" \
      --arg model "$model" \
      '.[$i] as $s
       | {
           model: $model,
           text: ($s.text // ""),
           voice_setting: ({
               voice_id: ($s.voice_id // "male-qn-qingse"),
               speed: ($s.speed // 1),
               vol: ($s.volume // $s.vol // 1),
               pitch: ($s.pitch // 0)
             } + (if ($s.emotion // "") != "" then {emotion: $s.emotion} else {} end)),
           audio_setting: {sample_rate: 32000, bitrate: 128000, format: "mp3", channel: 1},
           stream: false,
           subtitle_enable: false,
           output_format: "hex"
         }')"

    # api_request exits inside the command substitution on failure, which
    # makes the assignment fail and selects the else branch; stderr is merged
    # in so the error text is available for the message.
    if response="$(api_request POST t2a_v2 "$payload" 2>&1)"; then
      audio_hex="$(printf '%s' "$response" \
        | jq -r '.data.audio // .extra_info.audio // empty')"
      if [[ -n "$audio_hex" ]]; then
        hex_to_file "$audio_hex" "$seg_output"
        segment_files+=("$seg_output")
        succeeded=$((succeeded + 1))
        echo "  ✓ Saved: $seg_output"
      else
        failed=$((failed + 1))
        echo "  ✗ Error: No audio data in response"
        if ! $continue_on_error; then break; fi
      fi
    else
      failed=$((failed + 1))
      echo "  ✗ Error: $response"
      if ! $continue_on_error; then break; fi
    fi
  done

  if [[ ${#segment_files[@]} -eq 0 ]]; then
    die "Error: No segments were generated successfully"
  fi

  # Merge segments
  ensure_dir "$(dirname "$output")"
  if [[ ${#segment_files[@]} -eq 1 ]]; then
    cp -- "${segment_files[0]}" "$output"
  else
    _merge_audio_files "$output" "$crossfade" "$no_normalize" "${segment_files[@]}"
  fi

  echo ""
  echo "Audio saved to: $output"
  echo "  Generated: $succeeded/$count segments"
  echo ""
  echo "  Intermediate files in: $temp_dir"
  echo "  Delete with: rm -rf $temp_dir"
}

# ============================================================================
# Subcommand: merge
# ============================================================================

# cmd_merge FILE1 FILE2 [FILE...] -o OUTPUT [--format F] [--crossfade MS]
#           [--no-normalize]
# Merge at least two existing audio files into OUTPUT. --format is accepted
# but not used by the merge itself.
cmd_merge() {
  local output="" format="mp3" crossfade=300 normalize=true
  local -a input_files=()

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output)    need_value "$1" "$#"; output="$2"; shift 2 ;;
      --format)       need_value "$1" "$#"; format="$2"; shift 2 ;;
      --crossfade)    need_value "$1" "$#"; crossfade="$2"; shift 2 ;;
      --no-normalize) normalize=false; shift ;;
      *)              input_files+=("$1"); shift ;;
    esac
  done

  if [[ ${#input_files[@]} -lt 2 ]]; then
    die "Error: At least 2 input files required"
  fi
  if [[ -z "$output" ]]; then
    die "Error: -o/--output is required"
  fi

  local f
  for f in "${input_files[@]}"; do
    if [[ ! -f "$f" ]]; then
      die "Error: File not found: $f"
    fi
  done

  echo "Merging ${#input_files[@]} files..."

  local no_norm="false"
  if ! $normalize; then
    no_norm="true"
  fi

  _merge_audio_files "$output" "$crossfade" "$no_norm" "${input_files[@]}"
  echo "Merged audio saved to: $output"
}

# _ffconcat_quote STRING
# Print STRING single-quoted for an ffmpeg concat list; an embedded single
# quote is written as '\'' (close quote, escaped quote, reopen quote).
_ffconcat_quote() {
  local s="$1" out="" ch i
  for ((i = 0; i < ${#s}; i++)); do
    ch="${s:i:1}"
    if [[ "$ch" == "'" ]]; then
      out+="'\\''"
    else
      out+="$ch"
    fi
  done
  printf "'%s'" "$out"
}

# _merge_audio_files OUTPUT CROSSFADE_MS NO_NORMALIZE FILE1 FILE2 ...
# Merge FILEs into OUTPUT. With a positive integer CROSSFADE_MS an acrossfade
# filter chain is tried first; if that fails, or crossfading is disabled, the
# files are joined with the concat demuxer. Loudness normalization is applied
# unless NO_NORMALIZE is "true". Returns non-zero if ffmpeg fails.
#
# NOTE(review): the crossfade filter construction is a reconstruction; the
# original lines were truncated in the file this was restored from. The curve
# choice (c1/c2=tri) and the placement of loudnorm in the crossfade path are
# assumptions to confirm against the upstream script.
_merge_audio_files() {
  local output="$1" crossfade_ms="$2" no_normalize="$3"
  shift 3
  local -a files=("$@")
  local n=${#files[@]}

  ensure_dir "$(dirname "$output")"

  if [[ "$crossfade_ms" =~ ^[0-9]+$ ]] && (( 10#$crossfade_ms > 0 && n >= 2 )); then
    # Milliseconds to seconds with integer arithmetic (no bc dependency).
    local crossfade_sec
    printf -v crossfade_sec '%d.%03d' \
      "$((10#$crossfade_ms / 1000))" "$((10#$crossfade_ms % 1000))"

    # Chain: [0:a][1:a]acrossfade[a1]; [a1][2:a]acrossfade[a2]; ...
    local -a inputs=()
    local filter="" prev="[0:a]" label i
    for ((i = 0; i < n; i++)); do
      inputs+=(-i "${files[i]}")
      if (( i > 0 )); then
        label="[a$i]"
        filter+="${prev}[${i}:a]acrossfade=d=${crossfade_sec}:c1=tri:c2=tri${label};"
        prev="$label"
      fi
    done
    if [[ "$no_normalize" != "true" ]]; then
      filter+="${prev}loudnorm=I=-16:TP=-1.5:LRA=11[out]"
    else
      filter+="${prev}anull[out]"
    fi

    if ffmpeg -y "${inputs[@]}" -filter_complex "$filter" -map "[out]" \
        -acodec libmp3lame "$output" 2>/dev/null; then
      return 0
    fi
    echo "  Crossfade merge failed, falling back to concat demuxer..." >&2
  fi

  # Fallback: concat demuxer (no crossfade). All scratch files live in one
  # private directory so they are removed on success and on failure.
  local work list joined f rc=0
  work="$(mktemp -d)" || die "Error: could not create temporary directory"
  list="$work/list.txt"
  joined="$work/joined.mp3"

  for f in "${files[@]}"; do
    printf 'file %s\n' "$(_ffconcat_quote "$(_abs_path "$f")")" >> "$list"
  done

  if [[ "$no_normalize" != "true" ]]; then
    {
      ffmpeg -y -f concat -safe 0 -i "$list" -c copy "$joined" 2>/dev/null &&
        ffmpeg -y -i "$joined" -af "loudnorm=I=-16:TP=-1.5:LRA=11" \
          -acodec libmp3lame "$output" 2>/dev/null
    } || rc=$?
  else
    ffmpeg -y -f concat -safe 0 -i "$list" -c copy "$output" 2>/dev/null || rc=$?
  fi

  rm -rf -- "$work"

  if (( rc != 0 )); then
    echo "Error: ffmpeg failed to merge audio files (exit status $rc)" >&2
    return "$rc"
  fi
}

# ============================================================================
# Subcommand: convert
# ============================================================================

# cmd_convert INPUT -o OUTPUT [--format F] [--sample-rate N] [--bitrate B]
#             [--channels N]
# Re-encode INPUT to OUTPUT with ffmpeg. The codec is chosen from --format
# (default mp3); unknown formats use stream copy.
cmd_convert() {
  local input_file="" output="" format="mp3" sample_rate="" bitrate="" channels=""

  if [[ $# -gt 0 && "$1" != -* ]]; then
    input_file="$1"; shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output)   need_value "$1" "$#"; output="$2"; shift 2 ;;
      --format)      need_value "$1" "$#"; format="$2"; shift 2 ;;
      --sample-rate) need_value "$1" "$#"; sample_rate="$2"; shift 2 ;;
      --bitrate)     need_value "$1" "$#"; bitrate="$2"; shift 2 ;;
      --channels)    need_value "$1" "$#"; channels="$2"; shift 2 ;;
      *)
        if [[ -z "$input_file" ]]; then input_file="$1"; fi
        shift
        ;;
    esac
  done

  if [[ -z "$input_file" || ! -f "$input_file" ]]; then
    die "Error: Input file not found: ${input_file:-<none>}"
  fi
  if [[ -z "$output" ]]; then
    die "Error: -o/--output is required"
  fi

  ensure_dir "$(dirname "$output")"

  # Determine codec
  local codec
  case "$format" in
    mp3)     codec="libmp3lame" ;;
    wav)     codec="pcm_s16le" ;;
    flac)    codec="flac" ;;
    ogg)     codec="libvorbis" ;;
    aac|m4a) codec="aac" ;;
    *)       codec="copy" ;;
  esac

  local -a args=(-y -i "$input_file" -acodec "$codec")
  if [[ -n "$sample_rate" ]]; then args+=(-ar "$sample_rate"); fi
  if [[ -n "$channels" ]]; then args+=(-ac "$channels"); fi
  if [[ -n "$bitrate" ]]; then args+=(-b:a "$bitrate"); fi
  args+=("$output")

  echo "Converting $input_file to $format..."
  # ffmpeg output is suppressed, so report a failure explicitly rather than
  # letting `set -e` end the script without a message.
  if ! ffmpeg "${args[@]}" 2>/dev/null; then
    die "Error: ffmpeg failed to convert $input_file"
  fi
  echo "Converted audio saved to: $output"
}

# ============================================================================
# Subcommand: check-env
# ============================================================================

# cmd_check_env [ARGS...]
# Run ../check_environment.sh (relative to this script) with ARGS.
cmd_check_env() {
  local check_script="$SCRIPT_DIR/../check_environment.sh"
  if [[ -f "$check_script" ]]; then
    bash "$check_script" "$@"
  else
    die "check_environment.sh not found"
  fi
}

# ============================================================================
# Main dispatcher
# ============================================================================

# usage
# Print the command overview to stdout.
usage() {
  cat <<'EOF'
MiniMax Voice CLI — Unified TTS interface

Usage: generate_voice.sh <command> [options]

Commands:
  tts           Basic text-to-speech
  clone         Clone voice from audio sample
  design        Design voice from description
  list-voices   List available voices
  validate      Validate segments.json file
  generate      Generate audio from segments.json
  merge         Merge multiple audio files
  convert       Convert audio format
  check-env     Check environment setup

Examples:
  generate_voice.sh tts "Hello world" -o hello.mp3
  generate_voice.sh tts "你好" -v female-shaonv -o hello_cn.mp3
  generate_voice.sh clone my_voice.mp3 --voice-id my-custom-voice
  generate_voice.sh design "A warm female voice" --voice-id narrator-1
  generate_voice.sh list-voices
  generate_voice.sh validate segments.json --verbose
  generate_voice.sh generate segments.json -o output.mp3
  generate_voice.sh merge part1.mp3 part2.mp3 -o combined.mp3
  generate_voice.sh convert input.wav -o output.mp3
  generate_voice.sh check-env --test-api
EOF
}

# main [COMMAND [ARGS...]]
# Load .env, then dispatch to the matching cmd_* function. With no command the
# usage text is printed and the status is 0; an unknown command exits 1.
main() {
  load_env
  # load_env may have just defined MINIMAX_API_HOST; refresh the base URL.
  set_api_base

  if [[ $# -eq 0 ]]; then
    usage
    return 0
  fi

  local command="$1"; shift

  case "$command" in
    tts)            check_api_key; cmd_tts "$@" ;;
    clone)          check_api_key; cmd_clone "$@" ;;
    design)         check_api_key; cmd_design "$@" ;;
    list-voices)    check_api_key; cmd_list_voices "$@" ;;
    validate)       cmd_validate "$@" ;;
    generate)       check_api_key; cmd_generate "$@" ;;
    merge)          cmd_merge "$@" ;;
    convert)        cmd_convert "$@" ;;
    check-env)      cmd_check_env "$@" ;;
    -h|--help|help) usage ;;
    *)
      echo "Unknown command: $command" >&2
      usage >&2
      exit 1
      ;;
  esac
}

main "$@"