#!/usr/bin/env bash
set -euo pipefail
# run-llama-qwen35-122b.sh
# Launch script for llama.cpp on DGX Spark (OpenAI-compatible API)
#
# One-liner:
#   curl -sL https://www.techswan.online/scripts/run-llama-qwen35-122b.sh | bash
#
# Env overrides:
#   LLAMA_BIN="$HOME/llama.cpp/build-max/bin/llama-server"
#   MODEL="unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M"
#   PORT=8080
#   CTX=204800
#   BATCH=512
#   UBATCH=256
#   THREADS=8
#   PARALLEL=1
#   CACHE_K=q8_0
#   CACHE_V=q8_0
#   SWA_FULL=1
#   NGL=999
#   KILL_OLD=1
#   LOG_FILE=/tmp/llama-qwen35-122b.log

LLAMA_BIN="${LLAMA_BIN:-$HOME/llama.cpp/build-max/bin/llama-server}"
MODEL="${MODEL:-unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M}"
PORT="${PORT:-8080}"
CTX="${CTX:-204800}"
BATCH="${BATCH:-512}"
UBATCH="${UBATCH:-256}"
THREADS="${THREADS:-8}"
PARALLEL="${PARALLEL:-1}"
CACHE_K="${CACHE_K:-q8_0}"
CACHE_V="${CACHE_V:-q8_0}"
SWA_FULL="${SWA_FULL:-1}"
NGL="${NGL:-999}"
KILL_OLD="${KILL_OLD:-1}"
LOG_FILE="${LOG_FILE:-/tmp/llama-qwen35-122b.log}"

# Fail early if the server binary is missing.
if [[ ! -x "$LLAMA_BIN" ]]; then
  echo "[ERROR] llama-server not found: $LLAMA_BIN" >&2
  echo "        Build llama.cpp first (GGML_CUDA=ON)." >&2
  exit 1
fi

# Optionally stop any previously running instance before starting a new one.
if [[ "$KILL_OLD" == "1" ]]; then
  pkill -f "llama-server.*Qwen3.5-122B-A10B" 2>/dev/null || true
  pkill -f "llama-server .*${PORT}" 2>/dev/null || true
  sleep 1
fi

ARGS=(
  -hf "$MODEL"
  --host 0.0.0.0
  --port "$PORT"
  -ngl "$NGL"
  -c "$CTX"
  -b "$BATCH"
  -ub "$UBATCH"
  -t "$THREADS"
  --parallel "$PARALLEL"
  --cache-type-k "$CACHE_K"
  --cache-type-v "$CACHE_V"
  --flash-attn on
)

if [[ "$SWA_FULL" == "1" ]]; then
  ARGS+=(--swa-full)
fi

echo "[INFO] Starting llama-server"
echo "  bin:      $LLAMA_BIN"
echo "  model:    $MODEL"
echo "  port:     $PORT"
echo "  ctx:      $CTX"
echo "  batch:    $BATCH / ubatch: $UBATCH"
echo "  kv:       $CACHE_K / $CACHE_V"
echo "  parallel: $PARALLEL"
echo "  log:      $LOG_FILE"

nohup "$LLAMA_BIN" "${ARGS[@]}" > "$LOG_FILE" 2>&1 &
PID=$!
echo "[OK] pid=$PID"

# Poll the health endpoint for up to 60 seconds (30 attempts x 2 s).
echo "[INFO] Waiting for health check..."
for _ in {1..30}; do
  if curl -fsS "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
    echo "[OK] health: http://127.0.0.1:${PORT}/health"
    exit 0
  fi
  sleep 2
done

echo "[WARN] Model is still loading. Tail the logs:"
echo "  tail -f $LOG_FILE"
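
# Once the health check passes, the server exposes llama.cpp's OpenAI-compatible
# HTTP API. A minimal smoke test is sketched below (run it from another shell);
# it assumes the standard /v1/chat/completions route, and the "model" label in
# the request body is an arbitrary placeholder, since llama-server answers with
# the single model loaded above regardless of that field.
#
#   curl -s "http://127.0.0.1:${PORT}/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "qwen3.5-122b",
#           "messages": [{"role": "user", "content": "Say hello in one sentence."}],
#           "max_tokens": 64
#         }'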