#!/bin/bash
# DGX Spark - Qwen3-Coder-Next-FP8 launch script
# https://docs.techswan.online/tech/dgx-spark-qwen3-coder-next/

set -e

CONTAINER_NAME="qwen3-coder-next-fp8"
IMAGE="nvcr.io/nvidia/vllm:25.11-py3"
MODEL="Qwen/Qwen3-Coder-Next-FP8"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-32}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.85}"

echo "=== DGX Spark - Qwen3-Coder-Next-FP8 launch script ==="
echo ""

# Stop and remove any existing container with the same name
if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
    echo "⚠️  Stopping and removing the existing container..."
    docker stop "$CONTAINER_NAME" 2>/dev/null || true
    docker rm "$CONTAINER_NAME" 2>/dev/null || true
fi

# Pull the image if it is not available locally
if ! docker images --format '{{.Repository}}:{{.Tag}}' | grep -q "^${IMAGE}$"; then
    echo "📦 Pulling the NGC vLLM image..."
    docker pull "$IMAGE"
fi

echo ""
echo "🚀 Starting the container..."
echo "   Model: $MODEL"
echo "   Port: $PORT"
echo "   Max Context: $MAX_MODEL_LEN"
echo "   GPU Memory Utilization: $GPU_MEM_UTIL"
echo ""

docker run -d --name "$CONTAINER_NAME" \
    --gpus all \
    -p "${PORT}:8000" \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --ipc=host \
    "$IMAGE" \
    vllm serve "$MODEL" \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_NUM_SEQS" \
    --gpu-memory-utilization "$GPU_MEM_UTIL" \
    --trust-remote-code

echo ""
echo "✅ Container started!"
echo ""
echo "📋 Check the logs:"
echo "   docker logs -f $CONTAINER_NAME"
echo ""
echo "🔍 Verify startup (wait until \"Application startup complete\" appears in the logs):"
echo "   The first launch takes 15-20 minutes (model download + load)."
echo ""
echo "🧪 Test:"
echo "   curl http://localhost:${PORT}/health"
echo ""
echo "💬 Chat:"
cat << CURL_EXAMPLE
curl http://localhost:${PORT}/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "Qwen/Qwen3-Coder-Next-FP8",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
CURL_EXAMPLE
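
# Optional readiness wait: a minimal sketch, not part of the original script.
# Set WAIT_FOR_READY=1 to block until the server answers on /health, which
# happens once "Application startup complete" appears in the logs.
# WAIT_FOR_READY is a variable introduced here for illustration; the /health
# endpoint is the same one the test command above uses.
# Example: WAIT_FOR_READY=1 PORT=8000 ./<this-script>.sh
if [ "${WAIT_FOR_READY:-0}" = "1" ]; then
    echo "⏳ Waiting for the server to become healthy..."
    # Poll every 10 seconds; -f makes curl fail on non-2xx responses
    until curl -sf "http://localhost:${PORT}/health" > /dev/null; do
        sleep 10
    done
    echo "✅ Server is ready!"
fi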