#!/usr/bin/env bash
set -euo pipefail

# install-and-run-llama-qwen35-122b.sh
#
# For a fresh DGX Spark environment: build llama.cpp and start
# llama-server with Qwen3.5-122B.
#
# One-liner:
#   curl -sL https://www.techswan.online/scripts/install-and-run-llama-qwen35-122b.sh | bash
# ---- Configuration (every value can be overridden via the environment) ----
# `: "${VAR:=default}"` assigns the default only when VAR is unset or empty,
# which is equivalent to the VAR="${VAR:-default}" form.

# llama.cpp checkout location, build directory name, and parallel build jobs.
: "${REPO_DIR:=$HOME/llama.cpp}"
: "${BUILD_DIR:=build-max}"
: "${JOBS:=$(nproc)}"

# Model and server settings; these get baked into the generated run helper.
: "${MODEL:=unsloth/Qwen3.5-122B-A10B-GGUF:Q4_K_M}"
: "${PORT:=8080}"
: "${CTX:=204800}"
: "${BATCH:=512}"
: "${UBATCH:=256}"
: "${THREADS:=8}"
: "${PARALLEL:=1}"
: "${CACHE_K:=q8_0}"
: "${CACHE_V:=q8_0}"
: "${SWA_FULL:=1}"
: "${NGL:=999}"
: "${LOG_FILE:=/tmp/llama-qwen35-122b.log}"
echo "[1/5] Install build deps"
# Toolchain and libraries needed to clone and build llama.cpp with TLS support.
packages=(git cmake build-essential libssl-dev curl)
sudo apt-get update -y
sudo apt-get install -y "${packages[@]}"
echo "[2/5] Clone/Update llama.cpp"
if [[ ! -d "$REPO_DIR/.git" ]]; then
  # First run: fresh clone.
  git clone https://github.com/ggml-org/llama.cpp.git "$REPO_DIR"
else
  # Existing checkout: fast-forward only, never create merge commits.
  git -C "$REPO_DIR" pull --ff-only
fi
echo "[3/5] Configure + Build (CUDA ON)"
# CUDA backend on; OpenSSL so the binaries can fetch models over HTTPS (-hf).
cmake_flags=(-DGGML_CUDA=ON -DLLAMA_OPENSSL=ON)
cmake -S "$REPO_DIR" -B "$REPO_DIR/$BUILD_DIR" "${cmake_flags[@]}"
cmake --build "$REPO_DIR/$BUILD_DIR" -j "$JOBS" --target llama-server llama-cli
BIN="$REPO_DIR/$BUILD_DIR/bin/llama-server"
# Guard: a zero-exit build can still miss the target binary (e.g. wrong
# target name), so verify the executable exists before writing the helper.
[[ -x "$BIN" ]] || {
  echo "[ERROR] build succeeded but llama-server not found: $BIN" >&2
  exit 1
}
echo "[4/5] Write run helper"
# Generate a standalone restart script. The heredoc delimiter is deliberately
# UNQUOTED: plain expansions ($MODEL, $PORT, ..., $BIN) are baked in at write
# time with the values resolved above, while backslash-escaped ones (\$MODEL,
# \${ARGS[@]}, \$!) survive as literal $-expansions evaluated when the helper
# itself runs. Do not add comments inside the heredoc — its body is emitted
# verbatim into the generated file.
cat > "$HOME/run-llama-qwen35-122b.sh" <<EOF
#!/usr/bin/env bash
set -euo pipefail
MODEL="${MODEL}"
PORT="${PORT}"
CTX="${CTX}"
BATCH="${BATCH}"
UBATCH="${UBATCH}"
THREADS="${THREADS}"
PARALLEL="${PARALLEL}"
CACHE_K="${CACHE_K}"
CACHE_V="${CACHE_V}"
SWA_FULL="${SWA_FULL}"
NGL="${NGL}"
LOG_FILE="${LOG_FILE}"
pkill -f "llama-server.*Qwen3.5-122B-A10B" 2>/dev/null || true
sleep 1
ARGS=(
  -hf "\$MODEL"
  --host 0.0.0.0 --port "\$PORT"
  -ngl "\$NGL"
  -c "\$CTX" -b "\$BATCH" -ub "\$UBATCH"
  -t "\$THREADS"
  --parallel "\$PARALLEL"
  --cache-type-k "\$CACHE_K"
  --cache-type-v "\$CACHE_V"
  --flash-attn on
)
if [[ "\$SWA_FULL" == "1" ]]; then
  ARGS+=(--swa-full)
fi
nohup "$BIN" "\${ARGS[@]}" > "\$LOG_FILE" 2>&1 &
echo "started: pid=\$! log=\$LOG_FILE"
EOF
# Make the generated helper directly executable so it can be re-run on its own.
chmod +x "$HOME/run-llama-qwen35-122b.sh"
echo "[5/5] Start server"
"$HOME/run-llama-qwen35-122b.sh"

echo "[INFO] Waiting health..."
# Poll the health endpoint for up to two minutes (40 tries, 3 s apart).
# Model loading for a 122B checkpoint can legitimately take longer; in that
# case we fall through to the warning below without failing the install.
attempt=1
while [ "$attempt" -le 40 ]; do
  if curl -fsS "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
    echo "[OK] health ready: http://127.0.0.1:${PORT}/health"
    exit 0
  fi
  sleep 3
  attempt=$((attempt + 1))
done

echo "[WARN] still loading model. check logs: tail -f ${LOG_FILE}"