feat: Import 35+ skills, merge duplicates, add openclaw installer

Major updates: - Added 35+ new skills from awesome-opencode-skills and antigravity repos - Merged SEO skills into seo-master - Merged architecture skills into architecture - Merged security skills into security-auditor and security-coder - Merged testing skills into testing-master and testing-patterns - Merged pentesting skills into pentesting - Renamed website-creator to thai-frontend-dev - Replaced skill-creator with github version - Removed Chutes references (use MiniMax API instead) - Added install-openclaw-skills.sh for cross-platform installation - Updated .env.example with MiniMax API credentials
2026-03-26 11:37:39 +07:00
parent 48595100a1
commit 7edf5bc4d0
469 changed files with 131580 additions and 417 deletions
--- a/skills/minimax-multimodal-toolkit/scripts/tts/generate_voice.sh
+++ b/skills/minimax-multimodal-toolkit/scripts/tts/generate_voice.sh
@@ -0,0 +1,934 @@
+#!/usr/bin/env bash
+# MiniMax Voice CLI — Unified TTS command-line interface (pure bash)
+#
+# Usage:
+#   bash scripts/tts/generate_voice.sh tts "Hello world" -o hello.mp3
+#   bash scripts/tts/generate_voice.sh clone my_voice.mp3 --voice-id my-custom-voice
+#   bash scripts/tts/generate_voice.sh design "A gentle female voice" --voice-id designed-voice-1
+#   bash scripts/tts/generate_voice.sh list-voices
+#   bash scripts/tts/generate_voice.sh validate segments.json
+#   bash scripts/tts/generate_voice.sh generate segments.json -o output.mp3
+#   bash scripts/tts/generate_voice.sh merge file1.mp3 file2.mp3 -o combined.mp3
+#   bash scripts/tts/generate_voice.sh convert input.wav -o output.mp3
+#   bash scripts/tts/generate_voice.sh check-env
+set -euo pipefail
+
+# ============================================================================
+# Configuration
+# ============================================================================
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# ============================================================================
+# Common functions
+# ============================================================================
+
+load_env() {
+  # Load KEY=VALUE pairs from the first .env file that exists, checking
+  # "$PROJECT_ROOT/.env" and then "$(pwd)/.env". A variable that already has a
+  # non-empty value in the environment is never overwritten. Always returns 0.
+  #
+  # Lines are parsed with parameter expansion only. Trimming through `xargs`
+  # lets xargs apply its own quote/backslash rules: a value holding a lone
+  # apostrophe makes xargs fail (fatal under `set -e`), and surrounding quotes
+  # are consumed before the quote-stripping step can see them.
+  local env_file line key val
+  for env_file in "$PROJECT_ROOT/.env" "$(pwd)/.env"; do
+    [[ -f "$env_file" ]] || continue
+    # `|| [[ -n "$line" ]]` keeps a last line that has no trailing newline.
+    while IFS= read -r line || [[ -n "$line" ]]; do
+      line="${line%%#*}"  # strip comments (everything from the first '#')
+      [[ "$line" == *=* ]] || continue
+      key="${line%%=*}"
+      val="${line#*=}"
+      # Trim leading and trailing whitespace on both sides of the '='.
+      key="${key#"${key%%[![:space:]]*}"}"
+      key="${key%"${key##*[![:space:]]}"}"
+      val="${val#"${val%%[![:space:]]*}"}"
+      val="${val%"${val##*[![:space:]]}"}"
+      # Accept the common `export KEY=value` spelling.
+      if [[ "$key" == export[[:space:]]* ]]; then
+        key="${key#export}"
+        key="${key#"${key%%[![:space:]]*}"}"
+      fi
+      # Skip anything that is not a valid variable name; `${!key}` and
+      # `export` would otherwise abort the script on such a line.
+      [[ "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
+      # Remove one pair of matching surrounding quotes.
+      if [[ ${#val} -ge 2 ]]; then
+        case "$val" in
+          \"*\" | \'*\') val="${val:1:${#val}-2}" ;;
+        esac
+      fi
+      # Only set if not already in environment
+      if [[ -z "${!key:-}" ]]; then
+        export "$key=$val"
+      fi
+    done < "$env_file"
+    return 0
+  done
+  return 0
+}
+
+check_api_key() {
+  if [[ -z "${MINIMAX_API_KEY:-}" ]]; then
+    echo "Error: MINIMAX_API_KEY environment variable is not set" >&2
+    echo "  export MINIMAX_API_KEY='your-key'" >&2
+    exit 1
+  fi
+}
+
+ensure_dir() {
+  local dir="$1"
+  [[ -n "$dir" ]] && mkdir -p "$dir"
+}
+
+API_BASE="${MINIMAX_API_HOST:-https://api.minimaxi.com}/v1"
+
+api_request() {
+  # api_request METHOD ENDPOINT [JSON_BODY]
+  # Send METHOD to "${API_BASE}/ENDPOINT" with the bearer token taken from
+  # MINIMAX_API_KEY and, when JSON_BODY is given, that body as
+  # application/json. Prints the raw response body on stdout.
+  #
+  # Exits 1 with a message on stderr when curl fails, when the HTTP status is
+  # 400 or above, or when the body carries a non-zero base_resp.status_code.
+  # When the caller captures the output with $(...), that exit ends the
+  # substitution's subshell and surfaces as its non-zero status.
+  local method="$1" endpoint="$2" body="${3:-}"
+  local url="${API_BASE}/${endpoint#/}"
+
+  # -w appends the HTTP status on a final line of its own so that it can be
+  # separated from the body below.
+  local args=(
+    -s -w "\n%{http_code}"
+    -X "$method"
+    -H "Authorization: Bearer ${MINIMAX_API_KEY}"
+    -H "Accept-Encoding: gzip, deflate"
+    --compressed
+    --max-time 120
+  )
+  if [[ -n "$body" ]]; then
+    args+=(-H "Content-Type: application/json" -d "$body")
+  fi
+  args+=("$url")
+
+  local output http_code response
+  output="$(curl "${args[@]}" 2>/dev/null)" || {
+    echo "Error: curl request failed" >&2
+    exit 1
+  }
+  http_code="${output##*$'\n'}"  # last line: status written by -w
+  response="${output%$'\n'*}"    # everything before it: the body
+
+  # The redirect silences the arithmetic error for a non-numeric status.
+  if [[ "$http_code" -ge 400 ]] 2>/dev/null; then
+    echo "Error: API returned HTTP $http_code" >&2
+    printf '%s\n' "$response" >&2
+    exit 1
+  fi
+
+  # Check API-level error. The redirect sits on jq itself so that a body that
+  # is not JSON is skipped quietly; `|| true` keeps `set -e` from firing.
+  local status_code
+  status_code="$(printf '%s\n' "$response" | jq -r '.base_resp.status_code // 0' 2>/dev/null)" || true
+  if [[ "$status_code" != "0" && -n "$status_code" ]]; then
+    local status_msg
+    status_msg="$(printf '%s\n' "$response" | jq -r '.base_resp.status_msg // "Unknown error"')"
+    echo "Error: API error [$status_code]: $status_msg" >&2
+    exit 1
+  fi
+
+  printf '%s\n' "$response"
+}
+
+api_upload() {
+  # api_upload ENDPOINT FILE_PATH PURPOSE
+  # POST FILE_PATH as multipart form field "file", together with a "purpose"
+  # field, to "${API_BASE}/ENDPOINT". Prints the raw response body on stdout.
+  #
+  # Exits 1 with a message on stderr when curl fails, when the HTTP status is
+  # 400 or above, or when the body carries a non-zero base_resp.status_code.
+  local endpoint="$1" file_path="$2" purpose="$3"
+  local url="${API_BASE}/${endpoint#/}"
+
+  # curl's -F parser treats ',' and ';' in an unquoted file name as field
+  # separators. Wrap the path in double quotes and backslash-escape the two
+  # characters that are special inside them ('\' and '"').
+  local form_path
+  form_path="${file_path//\\/\\\\}"
+  form_path="${form_path//\"/\\\"}"
+
+  local output http_code response
+  output="$(curl -s -w "\n%{http_code}" \
+    -X POST \
+    -H "Authorization: Bearer ${MINIMAX_API_KEY}" \
+    -H "Accept-Encoding: gzip, deflate" \
+    --compressed \
+    -F "file=@\"${form_path}\"" \
+    -F "purpose=${purpose}" \
+    --max-time 120 \
+    "$url" 2>/dev/null)" || {
+    echo "Error: curl upload failed" >&2
+    exit 1
+  }
+  http_code="${output##*$'\n'}"  # last line: status written by -w
+  response="${output%$'\n'*}"    # everything before it: the body
+
+  # The redirect silences the arithmetic error for a non-numeric status.
+  if [[ "$http_code" -ge 400 ]] 2>/dev/null; then
+    echo "Error: API returned HTTP $http_code" >&2
+    printf '%s\n' "$response" >&2
+    exit 1
+  fi
+
+  # The redirect sits on jq itself so that a body that is not JSON is skipped
+  # quietly; `|| true` keeps `set -e` from firing on that case.
+  local status_code
+  status_code="$(printf '%s\n' "$response" | jq -r '.base_resp.status_code // 0' 2>/dev/null)" || true
+  if [[ "$status_code" != "0" && -n "$status_code" ]]; then
+    local status_msg
+    status_msg="$(printf '%s\n' "$response" | jq -r '.base_resp.status_msg // "Unknown error"')"
+    echo "Error: API error [$status_code]: $status_msg" >&2
+    exit 1
+  fi
+
+  printf '%s\n' "$response"
+}
+
+hex_to_file() {
+  # hex_to_file HEX_STRING OUTPUT_PATH
+  # Decode a plain hex string to binary with `xxd -r -p` and write the result
+  # to OUTPUT_PATH, creating its parent directory first.
+  local hex_payload="$1" dest="$2"
+  ensure_dir "$(dirname "$dest")"
+  printf '%s\n' "$hex_payload" | xxd -r -p > "$dest"
+}
+
+# ============================================================================
+# Subcommand: tts
+# ============================================================================
+cmd_tts() {
+  local text="" voice_id="male-qn-qingse" output="" model="speech-2.8-hd"
+  local speed=1.0 volume=1.0 pitch=0 emotion="" audio_format="mp3"
+  local sample_rate=32000 language_boost=""
+
+  # First positional arg is text
+  if [[ $# -gt 0 && "$1" != -* ]]; then
+    text="$1"; shift
+  fi
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -v|--voice-id) voice_id="$2"; shift 2 ;;
+      -o|--output) output="$2"; shift 2 ;;
+      --model) model="$2"; shift 2 ;;
+      --speed) speed="$2"; shift 2 ;;
+      --volume) volume="$2"; shift 2 ;;
+      --pitch) pitch="$2"; shift 2 ;;
+      --emotion) emotion="$2"; shift 2 ;;
+      --format) audio_format="$2"; shift 2 ;;
+      --sample-rate) sample_rate="$2"; shift 2 ;;
+      --language-boost) language_boost="$2"; shift 2 ;;
+      *) text="$1"; shift ;;
+    esac
+  done
+
+  if [[ -z "$text" ]]; then
+    echo "Error: text is required" >&2
+    echo "Usage: $(basename "$0") tts \"Text to speak\" -o output.mp3" >&2
+    exit 1
+  fi
+
+  # Build voice_setting
+  local voice_setting
+  voice_setting=$(jq -n \
+    --arg vid "$voice_id" \
+    --argjson spd "$speed" \
+    --argjson vol "$volume" \
+    --argjson pit "$pitch" \
+    '{voice_id: $vid, speed: $spd, vol: $vol, pitch: $pit}')
+
+  if [[ -n "$emotion" ]]; then
+    voice_setting=$(echo "$voice_setting" | jq --arg e "$emotion" '. + {emotion: $e}')
+  fi
+
+  # Build payload
+  local payload
+  payload=$(jq -n \
+    --arg model "$model" \
+    --arg text "$text" \
+    --argjson vs "$voice_setting" \
+    --arg fmt "$audio_format" \
+    --argjson sr "$sample_rate" \
+    '{
+      model: $model,
+      text: $text,
+      voice_setting: $vs,
+      audio_setting: {sample_rate: $sr, bitrate: 128000, format: $fmt, channel: 1},
+      stream: false,
+      subtitle_enable: false,
+      output_format: "hex"
+    }')
+
+  if [[ -n "$language_boost" ]]; then
+    payload=$(echo "$payload" | jq --arg lb "$language_boost" '. + {language_boost: $lb}')
+  fi
+
+  echo "Synthesizing: ${text:0:50}..."
+  local response
+  response="$(api_request POST t2a_v2 "$payload")"
+
+  # Extract hex audio
+  local audio_hex
+  audio_hex="$(echo "$response" | jq -r '.data.audio // .extra_info.audio // empty')"
+
+  if [[ -z "$audio_hex" ]]; then
+    echo "Error: No audio data returned from API" >&2
+    exit 1
+  fi
+
+  if [[ -n "$output" ]]; then
+    hex_to_file "$audio_hex" "$output"
+    echo "Done: $output"
+  else
+    echo "Generated ${#audio_hex} hex chars of audio"
+  fi
+}
+
+# ============================================================================
+# Subcommand: clone
+# ============================================================================
+cmd_clone() {
+  local audio_file="" voice_id="" preview_text="" preview_output=""
+
+  # First positional arg is audio file
+  if [[ $# -gt 0 && "$1" != -* ]]; then
+    audio_file="$1"; shift
+  fi
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --voice-id) voice_id="$2"; shift 2 ;;
+      --preview) preview_text="$2"; shift 2 ;;
+      --preview-output) preview_output="$2"; shift 2 ;;
+      *) [[ -z "$audio_file" ]] && audio_file="$1"; shift ;;
+    esac
+  done
+
+  if [[ -z "$audio_file" ]]; then
+    echo "Error: audio file is required" >&2
+    echo "Usage: $(basename "$0") clone audio.mp3 --voice-id my-voice" >&2
+    exit 1
+  fi
+  if [[ ! -f "$audio_file" ]]; then
+    echo "Error: Audio file not found: $audio_file" >&2
+    exit 1
+  fi
+  if [[ -z "$voice_id" ]]; then
+    echo "Error: --voice-id is required" >&2
+    exit 1
+  fi
+
+  echo "Cloning voice from: $audio_file"
+  echo "Voice ID: $voice_id"
+
+  # Step 1: Upload audio
+  local upload_response file_id
+  upload_response="$(api_upload files/upload "$audio_file" voice_clone)"
+  file_id="$(echo "$upload_response" | jq -r '.file.file_id // .file_id // empty')"
+
+  if [[ -z "$file_id" ]]; then
+    echo "Error: Upload succeeded but no file_id was returned" >&2
+    exit 1
+  fi
+
+  # Step 2: Clone voice
+  local clone_payload
+  clone_payload=$(jq -n \
+    --arg vid "$voice_id" \
+    --argjson fid "$file_id" \
+    '{voice_id: $vid, file_id: $fid}')
+
+  api_request POST voice_clone "$clone_payload" > /dev/null
+  echo "Voice cloned successfully: $voice_id"
+
+  # Step 3: Preview if requested
+  if [[ -n "$preview_text" ]]; then
+    echo "Generating preview..."
+    local pout="${preview_output:-${voice_id}_preview.mp3}"
+    cmd_tts "$preview_text" -v "$voice_id" -o "$pout"
+    echo "Preview saved to: $pout"
+  fi
+}
+
+# ============================================================================
+# Subcommand: design
+# ============================================================================
+cmd_design() {
+  # cmd_design DESCRIPTION [--voice-id ID] [--preview TEXT] [--preview-output PATH]
+  # Ask the voice_design endpoint for a voice matching DESCRIPTION. When the
+  # response contains trial audio it is decoded to PATH, or to
+  # "<voice id>_preview.mp3" when PATH is omitted.
+  local description="" voice_id="" preview_text="" preview_output=""
+
+  if [[ $# -gt 0 && "$1" != -* ]]; then
+    description="$1"; shift
+  fi
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --voice-id) voice_id="$2"; shift 2 ;;
+      --preview) preview_text="$2"; shift 2 ;;
+      --preview-output) preview_output="$2"; shift 2 ;;
+      *) [[ -z "$description" ]] && description="$1"; shift ;;
+    esac
+  done
+
+  if [[ -z "$description" ]]; then
+    echo "Error: description is required" >&2
+    # Inside $(...) the quotes must be plain: escaped quotes reach basename
+    # as literal characters and show up as a stray '"' in the message.
+    echo "Usage: $(basename "$0") design \"A warm female voice\" --voice-id narrator" >&2
+    exit 1
+  fi
+
+  # The request always carries a preview text; this is the fallback.
+  local ptext="${preview_text:-This is a preview of the designed voice.}"
+
+  echo "Designing voice from: \"$description\""
+  [[ -n "$voice_id" ]] && echo "Voice ID: $voice_id"
+
+  local payload
+  payload=$(jq -n \
+    --arg prompt "$description" \
+    --arg pt "$ptext" \
+    '{prompt: $prompt, preview_text: $pt}')
+
+  if [[ -n "$voice_id" ]]; then
+    payload=$(echo "$payload" | jq --arg vid "$voice_id" '. + {voice_id: $vid}')
+  fi
+
+  local response
+  response="$(api_request POST voice_design "$payload")"
+
+  # Prefer the id the caller asked for; otherwise take the one in the
+  # response, or "unknown" when the response has none.
+  local actual_voice_id
+  actual_voice_id="${voice_id:-$(echo "$response" | jq -r '.voice_id // "unknown"')}"
+  echo "Voice designed: $actual_voice_id"
+
+  local trial_audio
+  trial_audio="$(echo "$response" | jq -r '.trial_audio // empty')"
+  if [[ -n "$trial_audio" ]]; then
+    local pout="${preview_output:-${actual_voice_id}_preview.mp3}"
+    hex_to_file "$trial_audio" "$pout"
+    echo "Preview saved to: $pout"
+  fi
+}
+
+# ============================================================================
+# Subcommand: list-voices
+# ============================================================================
+cmd_list_voices() {
+  # Print the first ten system voices (plus a count of the rest), followed by
+  # every cloned and designed voice. Each lookup is best-effort: a request
+  # that fails does not abort the command.
+  local system_json cloned_json designed_json
+  local total label listing
+  local shown_custom=false
+
+  echo "=== System Voices ==="
+  system_json="$(api_request POST voice/list '{"voice_type":"system"}' 2>/dev/null)" || true
+
+  if [[ -z "$system_json" ]]; then
+    echo "  (Could not fetch system voices)"
+  else
+    total="$(echo "$system_json" | jq '.voice_list | length')" 2>/dev/null || total=0
+    if [[ "$total" -gt 0 ]]; then
+      echo "$system_json" | jq -r '.voice_list[:10][] | "  \(.voice_id): \(.name // "N/A")"'
+      if [[ "$total" -gt 10 ]]; then
+        echo "  ... and $((total - 10)) more"
+      fi
+    else
+      echo "  (None found)"
+    fi
+  fi
+
+  echo ""
+  echo "=== Custom Voices ==="
+
+  # Both custom lists are requested before anything is printed.
+  cloned_json="$(api_request POST voice/list '{"voice_type":"voice_cloning"}' 2>/dev/null)" || true
+  designed_json="$(api_request POST voice/list '{"voice_type":"voice_generation"}' 2>/dev/null)" || true
+
+  for label in Cloned Designed; do
+    if [[ "$label" == Cloned ]]; then
+      listing="$cloned_json"
+    else
+      listing="$designed_json"
+    fi
+    [[ -n "$listing" ]] || continue
+    total="$(echo "$listing" | jq '.voice_list | length')" 2>/dev/null || total=0
+    if [[ "$total" -gt 0 ]]; then
+      shown_custom=true
+      echo "$label ($total):"
+      echo "$listing" | jq -r '.voice_list[] | "  \(.voice_id)"'
+    fi
+  done
+
+  [[ "$shown_custom" == true ]] || echo "  (None found)"
+}
+
+# ============================================================================
+# Subcommand: validate
+# ============================================================================
+cmd_validate() {
+  # cmd_validate SEGMENTS_FILE [--model NAME] [--strict] [-v|--verbose]
+  #              [--validate-voices]
+  # Check every segment for non-empty text, a voice_id and, when an emotion is
+  # given, a known emotion name. SEGMENTS_FILE holds either a JSON array of
+  # segments or an object with a "segments" key.
+  #
+  # Returns 0 when all segments pass and 1 when at least one check fails.
+  # Exits 1 when the file is missing, is not valid JSON or has no segments.
+  # --strict and --validate-voices are accepted but have no effect; --model is
+  # only echoed.
+  local segments_file="" model="speech-2.8-hd" verbose=false
+  local i text voice_id emotion known emotion_ok
+
+  if [[ $# -gt 0 && "$1" != -* ]]; then
+    segments_file="$1"; shift
+  fi
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --model) model="$2"; shift 2 ;;
+      --strict) shift ;; # Accepted for compatibility; no strict checks exist
+      -v|--verbose) verbose=true; shift ;;
+      --validate-voices) shift ;; # Not implemented in bash version
+      *) [[ -z "$segments_file" ]] && segments_file="$1"; shift ;;
+    esac
+  done
+
+  if [[ -z "$segments_file" || ! -f "$segments_file" ]]; then
+    echo "Error: Segments file not found: ${segments_file:-<none>}" >&2
+    exit 1
+  fi
+
+  echo "Validating: $segments_file"
+  echo "Model: $model"
+
+  local -a emotion_names=(happy sad angry fearful disgusted surprised calm fluent whisper)
+  local valid_emotions="${emotion_names[*]}"
+  echo "Valid emotions: $valid_emotions"
+  echo ""
+
+  # Parse JSON
+  local segments count
+  segments="$(jq -r 'if type == "array" then . elif type == "object" and has("segments") then .segments else empty end' "$segments_file" 2>/dev/null)" || {
+    echo "Error: Invalid JSON in $segments_file" >&2
+    exit 1
+  }
+
+  if [[ -z "$segments" || "$segments" == "null" ]]; then
+    echo "Error: No segments found in file" >&2
+    exit 1
+  fi
+
+  count="$(echo "$segments" | jq 'length')"
+  local errors=0
+
+  for ((i=0; i<count; i++)); do
+    text="$(echo "$segments" | jq -r ".[$i].text // \"\"")"
+    voice_id="$(echo "$segments" | jq -r ".[$i].voice_id // \"\"")"
+    emotion="$(echo "$segments" | jq -r ".[$i].emotion // \"\"")"
+
+    if [[ -z "$text" ]]; then
+      echo "  - Segment $i: 'text' is required and must not be empty"
+      errors=$((errors + 1))
+    fi
+    if [[ -z "$voice_id" ]]; then
+      echo "  - Segment $i: 'voice_id' is required"
+      errors=$((errors + 1))
+    fi
+    if [[ -n "$emotion" ]]; then
+      # Compare against each name literally. Feeding the value to grep as a
+      # pattern would accept "." or "happy sad", and a value starting with
+      # '-' would be read as a grep option.
+      emotion_ok=false
+      for known in "${emotion_names[@]}"; do
+        if [[ "$known" == "$emotion" ]]; then
+          emotion_ok=true
+          break
+        fi
+      done
+      if ! $emotion_ok; then
+        echo "  - Segment $i: invalid emotion '$emotion'"
+        errors=$((errors + 1))
+      fi
+    fi
+  done
+
+  if [[ $errors -eq 0 ]]; then
+    echo "Validation passed: $count segments"
+    if $verbose; then
+      echo ""
+      echo "=== Segment Summary ==="
+      for ((i=0; i<count; i++)); do
+        text="$(echo "$segments" | jq -r ".[$i].text // \"\"")"
+        voice_id="$(echo "$segments" | jq -r ".[$i].voice_id // \"\"")"
+        emotion="$(echo "$segments" | jq -r ".[$i].emotion // \"\"")"
+        # Segments without an emotion are shown as AUTO.
+        local elabel="${emotion:-AUTO}"
+        printf "  %d: [%-10s] voice=%-20s \"%s\"\n" "$i" "${elabel^^}" "${voice_id:0:20}" "${text:0:40}"
+      done
+    fi
+    return 0
+  else
+    echo "Validation failed ($errors errors)"
+    return 1
+  fi
+}
+
+# ============================================================================
+# Subcommand: generate (multi-segment pipeline)
+# ============================================================================
+cmd_generate() {
+  # cmd_generate SEGMENTS_FILE -o OUTPUT [--model NAME] [--crossfade MS]
+  #              [--no-normalize] [--temp-dir DIR] [--continue-on-error]
+  # Synthesize every segment of SEGMENTS_FILE (a JSON array, or an object with
+  # a "segments" key) into DIR/segment_NNNN.mp3, then merge the results into
+  # OUTPUT. DIR defaults to a "tmp" directory next to OUTPUT and is left in
+  # place afterwards.
+  #
+  # Without --continue-on-error the first failing segment stops the run and
+  # the command exits 1 instead of merging a partial result. With the flag,
+  # failed segments are skipped and the remaining ones are merged.
+  local segments_file="" output="" model="speech-2.8-hd" crossfade=200
+  local no_normalize=false temp_dir="" continue_on_error=false
+  local i
+
+  if [[ $# -gt 0 && "$1" != -* ]]; then
+    segments_file="$1"; shift
+  fi
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -o|--output) output="$2"; shift 2 ;;
+      --model) model="$2"; shift 2 ;;
+      --crossfade) crossfade="$2"; shift 2 ;;
+      --no-normalize) no_normalize=true; shift ;;
+      --temp-dir) temp_dir="$2"; shift 2 ;;
+      --continue-on-error) continue_on_error=true; shift ;;
+      *) [[ -z "$segments_file" ]] && segments_file="$1"; shift ;;
+    esac
+  done
+
+  if [[ -z "$segments_file" || ! -f "$segments_file" ]]; then
+    echo "Error: Segments file not found: ${segments_file:-<none>}" >&2
+    exit 1
+  fi
+  if [[ -z "$output" ]]; then
+    echo "Error: -o/--output is required" >&2
+    exit 1
+  fi
+
+  # Only the segment list is extracted here; field-level checks are the job
+  # of the validate subcommand.
+  echo "Validating segments file..."
+  local segments count
+  segments="$(jq -r 'if type == "array" then . elif type == "object" and has("segments") then .segments else empty end' "$segments_file")"
+  count="$(echo "$segments" | jq 'length')"
+
+  if [[ "$count" -eq 0 ]]; then
+    echo "Error: No segments found" >&2
+    exit 1
+  fi
+  echo "Found $count valid segments"
+  echo ""
+
+  # Setup temp dir: "<absolute directory of OUTPUT>/tmp", or "./tmp" when
+  # that directory cannot be entered (for example because it does not exist).
+  if [[ -z "$temp_dir" ]]; then
+    local out_dir
+    out_dir="$(cd "$(dirname "$output")" 2>/dev/null && pwd)" || out_dir="."
+    temp_dir="$out_dir/tmp"
+  fi
+  mkdir -p "$temp_dir"
+  echo "Temp directory: $temp_dir"
+
+  # Generate each segment
+  local succeeded=0 failed=0
+  local segment_files=()
+  local text voice_id emotion speed vol pitch
+  local seg_output voice_setting payload response audio_hex
+
+  for ((i=0; i<count; i++)); do
+    text="$(echo "$segments" | jq -r ".[$i].text")"
+    voice_id="$(echo "$segments" | jq -r ".[$i].voice_id")"
+    emotion="$(echo "$segments" | jq -r ".[$i].emotion // \"\"")"
+    speed="$(echo "$segments" | jq -r ".[$i].speed // 1.0")"
+    vol="$(echo "$segments" | jq -r ".[$i].volume // 1.0")"
+    pitch="$(echo "$segments" | jq -r ".[$i].pitch // 0")"
+
+    printf "  Generating segment %d/%d: %s...\n" "$((i+1))" "$count" "${text:0:40}"
+
+    seg_output="$temp_dir/segment_$(printf '%04d' "$i").mp3"
+
+    # Build voice_setting
+    voice_setting=$(jq -n \
+      --arg vid "$voice_id" \
+      --argjson spd "$speed" \
+      --argjson vol "$vol" \
+      --argjson pit "$pitch" \
+      '{voice_id: $vid, speed: $spd, vol: $vol, pitch: $pit}')
+    if [[ -n "$emotion" ]]; then
+      voice_setting=$(echo "$voice_setting" | jq --arg e "$emotion" '. + {emotion: $e}')
+    fi
+
+    payload=$(jq -n \
+      --arg model "$model" \
+      --arg text "$text" \
+      --argjson vs "$voice_setting" \
+      '{
+        model: $model,
+        text: $text,
+        voice_setting: $vs,
+        audio_setting: {sample_rate: 32000, bitrate: 128000, format: "mp3", channel: 1},
+        stream: false,
+        output_format: "hex"
+      }')
+
+    # stderr is captured as well so that a failure message can be shown
+    # inline for the segment it belongs to.
+    if response="$(api_request POST t2a_v2 "$payload" 2>&1)"; then
+      audio_hex="$(echo "$response" | jq -r '.data.audio // .extra_info.audio // empty')"
+      if [[ -n "$audio_hex" ]]; then
+        hex_to_file "$audio_hex" "$seg_output"
+        segment_files+=("$seg_output")
+        succeeded=$((succeeded + 1))
+        echo "    ✓ Saved: $seg_output"
+      else
+        failed=$((failed + 1))
+        echo "    ✗ Error: No audio data in response"
+        if ! $continue_on_error; then break; fi
+      fi
+    else
+      failed=$((failed + 1))
+      echo "    ✗ Error: $response"
+      if ! $continue_on_error; then break; fi
+    fi
+  done
+
+  if [[ ${#segment_files[@]} -eq 0 ]]; then
+    echo "Error: No segments were generated successfully" >&2
+    exit 1
+  fi
+
+  # A failure without --continue-on-error must not be reported as success
+  # with a silently truncated output file.
+  if [[ $failed -gt 0 ]] && ! $continue_on_error; then
+    echo "Error: Stopped after a failed segment ($succeeded/$count generated); use --continue-on-error to merge the segments that succeed" >&2
+    echo "  Intermediate files in: $temp_dir" >&2
+    exit 1
+  fi
+
+  # Merge segments
+  ensure_dir "$(dirname "$output")"
+
+  if [[ ${#segment_files[@]} -eq 1 ]]; then
+    cp "${segment_files[0]}" "$output"
+  else
+    _merge_audio_files "$output" "$crossfade" "$no_normalize" "${segment_files[@]}"
+  fi
+
+  echo ""
+  echo "Audio saved to: $output"
+  echo "  Generated: $succeeded/$count segments"
+  echo ""
+  echo "  Intermediate files in: $temp_dir"
+  echo "  Delete with: rm -rf $temp_dir"
+}
+
+# ============================================================================
+# Subcommand: merge
+# ============================================================================
+cmd_merge() {
+  # cmd_merge FILE1 FILE2 [FILE...] -o OUTPUT [--crossfade MS] [--no-normalize]
+  #           [--format NAME]
+  # Merge two or more existing audio files into OUTPUT. The crossfade defaults
+  # to 300 ms and loudness normalization is on unless --no-normalize is given.
+  # --format is accepted, but its value is not used by this function.
+  # Exits 1 when fewer than two inputs are given, when -o is missing or when
+  # an input file does not exist.
+  local output="" format="mp3" crossfade=300 normalize=true
+  local input_files=()
+  local f  # keep the loop variable out of the global scope
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -o|--output) output="$2"; shift 2 ;;
+      --format) format="$2"; shift 2 ;;
+      --crossfade) crossfade="$2"; shift 2 ;;
+      --no-normalize) normalize=false; shift ;;
+      *) input_files+=("$1"); shift ;;
+    esac
+  done
+
+  if [[ ${#input_files[@]} -lt 2 ]]; then
+    echo "Error: At least 2 input files required" >&2
+    exit 1
+  fi
+  if [[ -z "$output" ]]; then
+    echo "Error: -o/--output is required" >&2
+    exit 1
+  fi
+
+  for f in "${input_files[@]}"; do
+    if [[ ! -f "$f" ]]; then
+      echo "Error: File not found: $f" >&2
+      exit 1
+    fi
+  done
+
+  echo "Merging ${#input_files[@]} files..."
+  # _merge_audio_files takes the inverted flag as the string "true"/"false".
+  local no_norm="false"
+  if ! $normalize; then
+    no_norm="true"
+  fi
+  _merge_audio_files "$output" "$crossfade" "$no_norm" "${input_files[@]}"
+  echo "Merged audio saved to: $output"
+}
+
+_merge_audio_files() {
+  # _merge_audio_files OUTPUT CROSSFADE_MS NO_NORMALIZE FILE1 FILE2 ...
+  # Join the given files into OUTPUT with ffmpeg.
+  #   CROSSFADE_MS  crossfade length in whole milliseconds; 0 disables it
+  #   NO_NORMALIZE  "true" skips the loudnorm filter
+  # With a positive crossfade and at least two files an acrossfade filter
+  # chain is tried first; if ffmpeg rejects it, or crossfade is off, the files
+  # are joined with the concat demuxer instead.
+  # Returns 0 on success, otherwise the failing ffmpeg status.
+  local output="$1" crossfade_ms="$2" no_normalize="$3"
+  shift 3
+  local files=("$@")
+  local n=${#files[@]}
+  local i f
+
+  ensure_dir "$(dirname "$output")"
+
+  if [[ "$crossfade_ms" -gt 0 && $n -ge 2 ]]; then
+    # Milliseconds to seconds with integer arithmetic (200 -> 0.200), so no
+    # external calculator is needed.
+    local crossfade_sec
+    printf -v crossfade_sec '%d.%03d' "$((crossfade_ms / 1000))" "$((crossfade_ms % 1000))"
+
+    local inputs=()
+    local filter_parts=()
+
+    # Bring every input to the same rate, sample format and channel layout
+    # and label it [sN].
+    for ((i=0; i<n; i++)); do
+      inputs+=(-i "${files[$i]}")
+      filter_parts+=("[${i}:a]aresample=32000,aformat=sample_fmts=fltp:channel_layouts=mono[s${i}]")
+    done
+
+    # Build acrossfade chain: [s0][s1] -> [m1], [m1][s2] -> [m2], ... with
+    # the last link labelled [merged].
+    if [[ $n -eq 2 ]]; then
+      filter_parts+=("[s0][s1]acrossfade=d=${crossfade_sec}[merged]")
+    else
+      filter_parts+=("[s0][s1]acrossfade=d=${crossfade_sec}[m1]")
+      for ((i=2; i<n; i++)); do
+        local prev="[m$((i-1))]"
+        if [[ $i -eq $((n-1)) ]]; then
+          filter_parts+=("${prev}[s${i}]acrossfade=d=${crossfade_sec}[merged]")
+        else
+          filter_parts+=("${prev}[s${i}]acrossfade=d=${crossfade_sec}[m${i}]")
+        fi
+      done
+    fi
+
+    local final_filter="[merged]aformat=sample_fmts=fltp"
+    if [[ "$no_normalize" != "true" ]]; then
+      final_filter+=",loudnorm=I=-16:TP=-1.5:LRA=11"
+    fi
+    final_filter+="[final]"
+    filter_parts+=("$final_filter")
+
+    # Join the parts with ';' inside a subshell so IFS stays untouched here.
+    local filter_complex
+    filter_complex="$(IFS=';'; echo "${filter_parts[*]}")"
+
+    if ffmpeg -y "${inputs[@]}" \
+      -filter_complex "$filter_complex" \
+      -map "[final]" \
+      -ar 32000 -ac 1 -acodec libmp3lame \
+      "$output" 2>/dev/null; then
+      return 0
+    fi
+    echo "  Crossfade merge failed, falling back to concat demuxer..." >&2
+  fi
+
+  # Fallback: concat demuxer (no crossfade).
+  # A private directory with the X's at the end of the template also works
+  # with mktemp implementations that only substitute trailing X's, and one
+  # `rm -rf` removes everything whether or not ffmpeg succeeded.
+  local work_dir status=0
+  work_dir="$(mktemp -d "${TMPDIR:-/tmp}/concat_XXXXXX")"
+  local concat_file="$work_dir/list.txt"
+  local tmp_concat="$work_dir/joined.mp3"
+
+  # The list holds absolute paths, one `file '...'` entry per input.
+  # NOTE(review): a path containing a single quote is not escaped here.
+  for f in "${files[@]}"; do
+    echo "file '$(cd "$(dirname "$f")" && pwd)/$(basename "$f")'" >> "$concat_file"
+  done
+
+  if [[ "$no_normalize" != "true" ]]; then
+    # Stream-copy into an intermediate file, then re-encode with loudnorm.
+    ffmpeg -y -f concat -safe 0 -i "$concat_file" -c copy "$tmp_concat" 2>/dev/null \
+      && ffmpeg -y -i "$tmp_concat" -af "loudnorm=I=-16:TP=-1.5:LRA=11" -acodec libmp3lame "$output" 2>/dev/null \
+      || status=$?
+  else
+    ffmpeg -y -f concat -safe 0 -i "$concat_file" -c copy "$output" 2>/dev/null || status=$?
+  fi
+
+  rm -rf -- "$work_dir"
+
+  if [[ $status -ne 0 ]]; then
+    echo "Error: ffmpeg failed to merge audio files (exit $status)" >&2
+  fi
+  return "$status"
+}
+
+# ============================================================================
+# Subcommand: convert
+# ============================================================================
+cmd_convert() {
+  # cmd_convert INPUT -o OUTPUT [--format NAME] [--sample-rate HZ]
+  #             [--bitrate RATE] [--channels N]
+  # Re-encode INPUT to OUTPUT with ffmpeg.
+  # When --format is omitted the format is taken from OUTPUT's extension if it
+  # is one of mp3, wav, flac, ogg, aac or m4a, and is mp3 otherwise. An
+  # explicit --format outside that list selects a stream copy.
+  # Exits 1 when INPUT is missing, -o is absent or ffmpeg fails.
+  local input_file="" output="" format="" sample_rate="" bitrate="" channels=""
+
+  if [[ $# -gt 0 && "$1" != -* ]]; then
+    input_file="$1"; shift
+  fi
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -o|--output) output="$2"; shift 2 ;;
+      --format) format="$2"; shift 2 ;;
+      --sample-rate) sample_rate="$2"; shift 2 ;;
+      --bitrate) bitrate="$2"; shift 2 ;;
+      --channels) channels="$2"; shift 2 ;;
+      *) [[ -z "$input_file" ]] && input_file="$1"; shift ;;
+    esac
+  done
+
+  if [[ -z "$input_file" || ! -f "$input_file" ]]; then
+    echo "Error: Input file not found: ${input_file:-<none>}" >&2
+    exit 1
+  fi
+  if [[ -z "$output" ]]; then
+    echo "Error: -o/--output is required" >&2
+    exit 1
+  fi
+
+  # Without this step `-o out.wav` would be encoded with the mp3 codec,
+  # because mp3 used to be the unconditional default format.
+  if [[ -z "$format" ]]; then
+    local ext="${output##*.}"
+    ext="${ext,,}"
+    case "$ext" in
+      mp3|wav|flac|ogg|aac|m4a) format="$ext" ;;
+      *) format="mp3" ;;
+    esac
+  fi
+
+  ensure_dir "$(dirname "$output")"
+
+  # Determine codec
+  local codec="copy"
+  case "$format" in
+    mp3) codec="libmp3lame" ;;
+    wav) codec="pcm_s16le" ;;
+    flac) codec="flac" ;;
+    ogg) codec="libvorbis" ;;
+    aac) codec="aac" ;;
+    m4a) codec="aac" ;;
+    *) codec="copy" ;;
+  esac
+
+  # Optional settings are passed to ffmpeg only when the caller gave them.
+  local args=(-y -i "$input_file" -acodec "$codec")
+  if [[ -n "$sample_rate" ]]; then args+=(-ar "$sample_rate"); fi
+  if [[ -n "$channels" ]]; then args+=(-ac "$channels"); fi
+  if [[ -n "$bitrate" ]]; then args+=(-b:a "$bitrate"); fi
+  args+=("$output")
+
+  echo "Converting $input_file to $format..."
+  # ffmpeg's own output is discarded, so report the failure explicitly.
+  if ! ffmpeg "${args[@]}" 2>/dev/null; then
+    echo "Error: ffmpeg failed to convert $input_file" >&2
+    exit 1
+  fi
+  echo "Converted audio saved to: $output"
+}
+
+# ============================================================================
+# Subcommand: check-env
+# ============================================================================
+cmd_check_env() {
+  # Run check_environment.sh from the directory above SCRIPT_DIR, forwarding
+  # every argument. Exits 1 when that script does not exist.
+  local checker="$SCRIPT_DIR/../check_environment.sh"
+  if [[ ! -f "$checker" ]]; then
+    echo "check_environment.sh not found" >&2
+    exit 1
+  fi
+  bash "$checker" "$@"
+}
+
+# ============================================================================
+# Main dispatcher
+# ============================================================================
+usage() {
+  # Print the command overview and the usage examples to stdout.
+  # Every argument is single-quoted, so nothing in the text is expanded.
+  printf '%s\n' \
+    'MiniMax Voice CLI — Unified TTS interface' \
+    '' \
+    'Usage:' \
+    '  generate_voice.sh <command> [options]' \
+    '' \
+    'Commands:' \
+    '  tts          Basic text-to-speech' \
+    '  clone        Clone voice from audio sample' \
+    '  design       Design voice from description' \
+    '  list-voices  List available voices' \
+    '  validate     Validate segments.json file' \
+    '  generate     Generate audio from segments.json' \
+    '  merge        Merge multiple audio files' \
+    '  convert      Convert audio format' \
+    '  check-env    Check environment setup' \
+    '' \
+    'Examples:' \
+    '  generate_voice.sh tts "Hello world" -o hello.mp3' \
+    '  generate_voice.sh tts "你好" -v female-shaonv -o hello_cn.mp3' \
+    '  generate_voice.sh clone my_voice.mp3 --voice-id my-custom-voice' \
+    '  generate_voice.sh design "A warm female voice" --voice-id narrator-1' \
+    '  generate_voice.sh list-voices' \
+    '  generate_voice.sh validate segments.json --verbose' \
+    '  generate_voice.sh generate segments.json -o output.mp3' \
+    '  generate_voice.sh merge part1.mp3 part2.mp3 -o combined.mp3' \
+    '  generate_voice.sh convert input.wav -o output.mp3' \
+    '  generate_voice.sh check-env --test-api'
+}
+
+main() {
+  # main [COMMAND [ARGS...]]
+  # Load the .env file, then dispatch COMMAND to its cmd_* function. Commands
+  # that call the API first verify that MINIMAX_API_KEY is set.
+  # With no arguments the usage text is printed and the script exits 0; an
+  # unknown command prints the usage text to stderr and exits 1.
+  load_env
+
+  # API_BASE is assigned while the script is being read, which happens before
+  # load_env runs. Recompute it here so that a MINIMAX_API_HOST supplied only
+  # through the .env file is honoured as well.
+  if [[ -n "${MINIMAX_API_HOST:-}" ]]; then
+    API_BASE="${MINIMAX_API_HOST}/v1"
+  fi
+
+  if [[ $# -eq 0 ]]; then
+    usage
+    exit 0
+  fi
+
+  local command="$1"; shift
+
+  case "$command" in
+    tts)
+      check_api_key
+      cmd_tts "$@"
+      ;;
+    clone)
+      check_api_key
+      cmd_clone "$@"
+      ;;
+    design)
+      check_api_key
+      cmd_design "$@"
+      ;;
+    list-voices)
+      check_api_key
+      cmd_list_voices "$@"
+      ;;
+    validate)
+      # Works on a local file only; no API key needed.
+      cmd_validate "$@"
+      ;;
+    generate)
+      check_api_key
+      cmd_generate "$@"
+      ;;
+    merge)
+      cmd_merge "$@"
+      ;;
+    convert)
+      cmd_convert "$@"
+      ;;
+    check-env)
+      cmd_check_env "$@"
+      ;;
+    -h|--help|help)
+      usage
+      ;;
+    *)
+      echo "Unknown command: $command" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+}
+
+main "$@"