From d8b3ac91028a9a49c42d0660364f68feaef78a5a Mon Sep 17 00:00:00 2001
From: koide
Date: Thu, 5 Mar 2026 00:15:44 +0000
Subject: [PATCH] Add one-liner DGX script to run llama-server with Qwen3.5-122B

---
 static/scripts/run-llama-qwen35-122b.sh | 95 +++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100755 static/scripts/run-llama-qwen35-122b.sh

diff --git a/static/scripts/run-llama-qwen35-122b.sh b/static/scripts/run-llama-qwen35-122b.sh
new file mode 100755
index 0000000..65b220e
--- /dev/null
+++ b/static/scripts/run-llama-qwen35-122b.sh
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
set -euo pipefail

# run-llama-qwen35-122b.sh
# Launch script for llama.cpp on DGX Spark (OpenAI-compatible API)
#
# One-liner:
#   curl -sL https://www.techswan.online/scripts/run-llama-qwen35-122b.sh | bash
#
# Env overrides:
#   LLAMA_BIN="$HOME/llama.cpp/build-max/bin/llama-server"
#   MODEL="unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M"
#   PORT=8080
#   CTX=204800
#   BATCH=512
#   UBATCH=256
#   THREADS=8
#   PARALLEL=1
#   CACHE_K=q8_0
#   CACHE_V=q8_0
#   SWA_FULL=1
#   NGL=999
#   KILL_OLD=1
#   LOG_FILE=/tmp/llama-qwen35-122b.log

LLAMA_BIN="${LLAMA_BIN:-$HOME/llama.cpp/build-max/bin/llama-server}"
MODEL="${MODEL:-unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M}"
PORT="${PORT:-8080}"
CTX="${CTX:-204800}"
BATCH="${BATCH:-512}"
UBATCH="${UBATCH:-256}"
THREADS="${THREADS:-8}"
PARALLEL="${PARALLEL:-1}"
CACHE_K="${CACHE_K:-q8_0}"
CACHE_V="${CACHE_V:-q8_0}"
SWA_FULL="${SWA_FULL:-1}"
NGL="${NGL:-999}"
KILL_OLD="${KILL_OLD:-1}"
LOG_FILE="${LOG_FILE:-/tmp/llama-qwen35-122b.log}"

# Refuse to continue without a built llama-server binary.
if [[ ! -x "$LLAMA_BIN" ]]; then
  echo "[ERROR] llama-server not found: $LLAMA_BIN" >&2
  echo "        Build llama.cpp first (GGML_CUDA=ON)." >&2
  exit 1
fi

# Stop any previous instance serving this model or this port.
if [[ "$KILL_OLD" == "1" ]]; then
  pkill -f "llama-server.*Qwen3.5-122B-A10B" 2>/dev/null || true
  pkill -f "llama-server .*${PORT}" 2>/dev/null || true
  sleep 1
fi

ARGS=(
  -hf "$MODEL"
  --host 0.0.0.0 --port "$PORT"
  -ngl "$NGL"
  -c "$CTX"
  -b "$BATCH"
  -ub "$UBATCH"
  -t "$THREADS"
  --parallel "$PARALLEL"
  --cache-type-k "$CACHE_K"
  --cache-type-v "$CACHE_V"
  --flash-attn on
)

if [[ "$SWA_FULL" == "1" ]]; then
  ARGS+=(--swa-full)
fi

echo "[INFO] Starting llama-server"
echo "  bin:      $LLAMA_BIN"
echo "  model:    $MODEL"
echo "  port:     $PORT"
echo "  ctx:      $CTX"
echo "  batch:    $BATCH / ubatch: $UBATCH"
echo "  kv:       $CACHE_K / $CACHE_V"
echo "  parallel: $PARALLEL"
echo "  log:      $LOG_FILE"

nohup "$LLAMA_BIN" "${ARGS[@]}" > "$LOG_FILE" 2>&1 &
PID=$!

echo "[OK] pid=$PID"

# Poll /health for up to 60 seconds (30 tries x 2 s).
echo "[INFO] Waiting for health check..."
for _ in {1..30}; do
  if curl -fsS "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
    echo "[OK] health: http://127.0.0.1:${PORT}/health"
    exit 0
  fi
  sleep 2
done

echo "[WARN] Model is still loading. Tail the logs with:"
echo "  tail -f $LOG_FILE"
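
# Example override (a sketch: the port/context/parallel values below are
# arbitrary illustrations, not tuned recommendations). Note that with the
# curl | bash one-liner, the variables must be set on the bash side of the
# pipe, otherwise they only apply to curl:
#
#   curl -sL https://www.techswan.online/scripts/run-llama-qwen35-122b.sh \
#     | PORT=8081 CTX=131072 PARALLEL=2 bash
#
# or, with a local copy of the script:
#
#   PORT=8081 CTX=131072 PARALLEL=2 ./run-llama-qwen35-122b.sh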
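
# Smoke test once /health reports OK. llama-server exposes an
# OpenAI-compatible endpoint at /v1/chat/completions; the prompt and
# max_tokens below are illustrative only:
#
#   curl -s "http://127.0.0.1:8080/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d '{"messages":[{"role":"user","content":"Say hello."}],"max_tokens":32}'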