Add one-liner DGX script to run llama-server with Qwen3.5-122B
All checks were successful
Deploy Docusaurus Site / deploy (push) Successful in 56s
parent 421ace0650
commit d8b3ac9102
static/scripts/run-llama-qwen35-122b.sh (executable file, +95 lines)
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
set -euo pipefail

# run-llama-qwen35-122b.sh
# Startup script for DGX Spark / llama.cpp (OpenAI-compatible API)
#
# One-liner:
#   curl -sL https://www.techswan.online/scripts/run-llama-qwen35-122b.sh | bash
#
# Env overrides:
#   MODEL="unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M"
#   PORT=8080
#   CTX=204800
#   BATCH=512
#   UBATCH=256
#   THREADS=8
#   PARALLEL=1
#   CACHE_K=q8_0
#   CACHE_V=q8_0
#   SWA_FULL=1
#   NGL=999
#   KILL_OLD=1
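#
# Example override (values are illustrative; tune for your hardware):
#   PORT=9090 CTX=131072 KILL_OLD=0 bash run-llama-qwen35-122b.sh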

LLAMA_BIN="${LLAMA_BIN:-$HOME/llama.cpp/build-max/bin/llama-server}"
MODEL="${MODEL:-unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M}"
PORT="${PORT:-8080}"
CTX="${CTX:-204800}"
BATCH="${BATCH:-512}"
UBATCH="${UBATCH:-256}"
THREADS="${THREADS:-8}"
PARALLEL="${PARALLEL:-1}"
CACHE_K="${CACHE_K:-q8_0}"
CACHE_V="${CACHE_V:-q8_0}"
SWA_FULL="${SWA_FULL:-1}"
NGL="${NGL:-999}"
KILL_OLD="${KILL_OLD:-1}"
LOG_FILE="${LOG_FILE:-/tmp/llama-qwen35-122b.log}"

if [[ ! -x "$LLAMA_BIN" ]]; then
  echo "[ERROR] llama-server not found: $LLAMA_BIN" >&2
  echo "        Build llama.cpp first (GGML_CUDA=ON)." >&2
  exit 1
fi
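
# A typical CUDA build that produces the default LLAMA_BIN path above
# (the build-max directory name is this script's convention; adjust to taste):
#   cmake -B build-max -DGGML_CUDA=ON
#   cmake --build build-max --config Release -j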

if [[ "$KILL_OLD" == "1" ]]; then
  pkill -f "llama-server.*Qwen3.5-122B-A10B" 2>/dev/null || true
  pkill -f "llama-server .*${PORT}" 2>/dev/null || true
  sleep 1
fi

ARGS=(
  -hf "$MODEL"                    # fetch/load the GGUF from Hugging Face (repo:quant)
  --host 0.0.0.0 --port "$PORT"
  -ngl "$NGL"                     # layers to offload to GPU (999 = all)
  -c "$CTX"                       # context length
  -b "$BATCH"
  -ub "$UBATCH"
  -t "$THREADS"
  --parallel "$PARALLEL"
  --cache-type-k "$CACHE_K"       # quantized KV cache
  --cache-type-v "$CACHE_V"
  --flash-attn on
)

if [[ "$SWA_FULL" == "1" ]]; then
  ARGS+=(--swa-full)              # full-size KV cache for sliding-window-attention layers
fi

echo "[INFO] Starting llama-server"
echo "  bin:      $LLAMA_BIN"
echo "  model:    $MODEL"
echo "  port:     $PORT"
echo "  ctx:      $CTX"
echo "  batch:    $BATCH / ubatch: $UBATCH"
echo "  kv:       $CACHE_K / $CACHE_V"
echo "  parallel: $PARALLEL"
echo "  log:      $LOG_FILE"

nohup "$LLAMA_BIN" "${ARGS[@]}" > "$LOG_FILE" 2>&1 &
PID=$!                            # stop the server later with: kill "$PID"

echo "[OK] pid=$PID"

echo "[INFO] Waiting for health check..."
for i in {1..30}; do              # poll for up to ~60 seconds
  if curl -fsS "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
    echo "[OK] health: http://127.0.0.1:${PORT}/health"
    exit 0
  fi
  sleep 2
done

echo "[WARN] Server is still loading. Tail the logs with:"
echo "  tail -f $LOG_FILE"