#!/usr/bin/env bash
#
# Node installer: installs NVIDIA drivers, connects the machine to Tailscale
# and joins it to a k3s cluster as an agent, reporting progress to the control
# API along the way.
#
# Usage: aisdom-install.sh <base64-config-blob>
#   The blob decodes to:
#   NODE_ID|API_TOKEN|TS_KEY|REG_USER|REG_PASS|K3S_URL|K3S_TOKEN
#
# NOTE(review): the copy of this script that reached review had its line
# breaks collapsed and two spans missing (the resume-service/reboot step and
# the registry credentials file). The gaps are marked below with
# abort_lost_section so the script stops instead of silently skipping them;
# restore those sections from version control before deploying.
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive

API_URL="https://control-1.pike-vimba.ts.net:10000"
INSTALL_SCRIPT_URL="https://get.aisdom.com"
RESUME_FLAG="/var/tmp/aisdom_install_resume"
PERSISTENT_SCRIPT="/usr/local/bin/aisdom-install.sh"

# ---------------------------------------------------------
# 1. Input Handling & Variable Extraction
# ---------------------------------------------------------
if [ -z "${1:-}" ]; then
  echo "Error: Missing configuration blob." >&2
  exit 1
fi

# Decoded format: NODE_ID|API_TOKEN|TS_KEY|REG_USER|REG_PASS|K3S_URL|K3S_TOKEN
if ! DECODED=$(printf '%s' "$1" | base64 --decode); then
  echo "Error: Configuration blob is not valid base64." >&2
  exit 1
fi
IFS='|' read -r NODE_ID API_TOKEN TAILSCALE_KEY REG_USER REG_PASS K3S_URL K3S_TOKEN <<< "$DECODED"

# Fail early on a truncated blob: an empty NODE_ID or token would otherwise
# only surface later as malformed URLs or rejected joins. REG_USER/REG_PASS
# are not checked because their use is not visible in this copy.
for required in NODE_ID API_TOKEN TAILSCALE_KEY K3S_URL K3S_TOKEN; do
  if [ -z "${!required}" ]; then
    echo "Error: Configuration blob is missing $required." >&2
    exit 1
  fi
done

# ---------------------------------------------------------
# Functions
# ---------------------------------------------------------

#######################################
# Stop the installer at a point where part of the script is missing.
# Arguments: $1 - short description of the missing section
# Outputs:   error message on stderr
# Returns:   does not return; exits with status 1
#######################################
abort_lost_section() {
  echo "Error: installer is incomplete: $1 is missing from this copy of the script; restore it from version control." >&2
  exit 1
}

#######################################
# Escape a string for use inside a double-quoted JSON string value.
# Handles backslash, double quote and newline; other control characters
# are passed through unchanged.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
#######################################
json_escape() {
  local s="${1:-}"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  printf '%s' "$s"
}

#######################################
# POST OS / CUDA / IP details for this node to the control API.
# Only non-empty arguments are included in the JSON payload.
# Globals:   API_URL, NODE_ID, API_TOKEN (read)
# Arguments: $1 - OS version, $2 - CUDA version, $3 - IP address (all optional)
# Outputs:   progress line and the API response on stdout
# Returns:   0 always; a failed request is deliberately ignored
#######################################
send_system_info() {
  local os_version="${1:-}"
  local cuda_version="${2:-}"
  local ip_address="${3:-}"

  # Build JSON payload with only provided values. ${json_payload:+, } inserts
  # the separator only once an earlier field has been written.
  local json_payload=""
  if [ -n "$os_version" ]; then
    json_payload+="\"os_version\": \"$(json_escape "$os_version")\""
  fi
  if [ -n "$cuda_version" ]; then
    json_payload+="${json_payload:+, }\"cuda_version\": \"$(json_escape "$cuda_version")\""
  fi
  if [ -n "$ip_address" ]; then
    json_payload+="${json_payload:+, }\"ip_address\": \"$(json_escape "$ip_address")\""
  fi
  json_payload="{${json_payload}}"

  echo ">>> Sending system info: OS=$os_version, CUDA=$cuda_version, IP=$ip_address"

  # Best effort: a reporting failure must not abort the installation.
  curl -s -X POST "$API_URL/nodes/$NODE_ID/system-info" \
    -H "Authorization: Bearer $API_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$json_payload" || true
}

#######################################
# Log a progress message locally and POST the status to the control API.
# Only the status is sent to the API; the message is printed locally.
# Globals:   API_URL, NODE_ID, API_TOKEN (read)
# Arguments: $1 - status identifier, $2 - human-readable message
# Outputs:   progress line and API response on stdout; error note on stderr
# Returns:   0 always; a failed request is reported but not fatal
#######################################
send_status() {
  local status="$1"
  local msg="$2"
  echo ">>> [$status] $msg"

  # Declare separately from the assignment: `local var=$(cmd)` makes $? the
  # status of `local` (always 0), which hid every curl failure.
  local response
  local exit_code=0

  # Note: Using the API token to authenticate status updates.
  # -sS instead of -v: verbose mode prints the outgoing request headers,
  # which would copy the bearer token into the captured output. -w keeps the
  # HTTP status code visible for debugging.
  response=$(curl -sS -X POST "$API_URL/nodes/$NODE_ID/status" \
    -H "Authorization: Bearer $API_TOKEN" \
    -H "Content-Type: application/json" \
    -w '\nHTTP %{http_code}' \
    -d "{\"status\": \"$(json_escape "$status")\"}" 2>&1) || exit_code=$?

  echo "$response"

  if [ "$exit_code" -ne 0 ]; then
    echo ">>> ERROR: Status update failed with exit code $exit_code" >&2
  fi
}

#######################################
# Print the CUDA version reported by nvidia-smi, or "Unknown".
# Outputs:   "X.Y" or "Unknown" on stdout
# Returns:   0 always
#######################################
detect_cuda_version() {
  local cuda_ver="Unknown"

  # Check if nvidia-smi exists
  if ! command -v nvidia-smi &> /dev/null; then
    echo "$cuda_ver"
    return 0
  fi

  # nvidia-smi shows "CUDA Version: X.Y" in its default output. A timeout or
  # failure leaves whatever was printed (possibly nothing) in $output.
  local output
  output=$(timeout 3 nvidia-smi 2>/dev/null || true)

  # Match in-shell so a missing version line cleanly yields "Unknown" rather
  # than an unmatched line or a stray leading newline.
  local version_re='CUDA Version: *([0-9]+\.[0-9]+)'
  if [[ "$output" =~ $version_re ]]; then
    cuda_ver="${BASH_REMATCH[1]}"
  fi

  echo "$cuda_ver"
  return 0 # Always return success
}

# ---------------------------------------------------------
# 2. Reboot & Resume Logic
# ---------------------------------------------------------
if [ -f "$RESUME_FLAG" ]; then
  send_status "RESUMING" "Resuming installation after reboot..."
else
  # Download and save the script to a persistent location
  sudo curl -fsSL -o "$PERSISTENT_SCRIPT" "$INSTALL_SCRIPT_URL"
  sudo chmod +x "$PERSISTENT_SCRIPT"

  send_status "CHECKING_OS" "Verifying Ubuntu version..."
  UBUNTU_VER=$(lsb_release -rs)
  if [[ "$UBUNTU_VER" != "22.04" && "$UBUNTU_VER" != "24.04" ]]; then
    send_status "FAILED" "Error: Only Ubuntu 22.04 and 24.04 are supported."
    exit 1
  fi

  send_status "INSTALLING_DRIVERS" "Installing NVIDIA drivers..."
  sudo apt-get update
  sudo apt-get install -y ubuntu-drivers-common nfs-common
  sudo ubuntu-drivers install

  # Create the resume service
  # TODO(review): everything from the `cat <<...` that wrote the resume
  # service up to the end of this branch is missing from this copy. Presumably
  # it also created $RESUME_FLAG and rebooted -- confirm against the original.
  abort_lost_section "the resume service / reboot step"
fi

# NOTE(review): the start of this condition was lost; only
# "/dev/null | grep -qE '(127\.0\.0\.1|::1)'; then" survived. The
# `getent hosts registry.localhost 2>` prefix is an assumption -- confirm.
if ! getent hosts registry.localhost 2>/dev/null | grep -qE '(127\.0\.0\.1|::1)'; then
  echo "127.0.0.1 registry.localhost" | sudo tee -a /etc/hosts > /dev/null
fi

# Inject the private registry authentication
sudo mkdir -p /etc/rancher/k3s
# TODO(review): the heredoc that wrote the registry credentials (presumably
# from REG_USER / REG_PASS) and whatever followed it up to the Tailscale check
# is missing from this copy.
abort_lost_section "the k3s private registry configuration"

# NOTE(review): the head of this condition was lost; only
# "/dev/null; then" survived. Reconstructed to mirror the nvidia-smi check
# in detect_cuda_version -- confirm.
if ! command -v tailscale &> /dev/null; then
  curl -fsSL https://tailscale.com/install.sh | sudo sh
fi

sudo tailscale up --authkey="$TAILSCALE_KEY" --hostname="provider-$NODE_ID"
TS_IP=$(tailscale ip -4)

# ---------------------------------------------------------
# 6. K3s Agent Installation
# ---------------------------------------------------------
send_status "JOINING_CLUSTER" "Connecting node to GPU Orchestrator..."

# sudo -E passes K3S_URL / K3S_TOKEN through to the k3s install script.
curl -sfL https://get.k3s.io | K3S_URL="$K3S_URL" K3S_TOKEN="$K3S_TOKEN" sudo -E sh -s - \
  agent \
  --flannel-iface=tailscale0 \
  --node-ip="$TS_IP" \
  --node-name="provider-$NODE_ID"

# Use the function
CUDA_VER=$(detect_cuda_version)
if [ "$CUDA_VER" != "Unknown" ]; then
  echo ">>> Detected CUDA version: $CUDA_VER"
else
  echo ">>> CUDA version: Unknown"
fi

# Send system info at end of script
send_system_info "Ubuntu $(lsb_release -rs)" "$CUDA_VER" "$TS_IP"

# Final Cleanup
sudo rm -f "$PERSISTENT_SCRIPT"
sudo rm -f "$RESUME_FLAG"

send_status "SETUP_COMPLETE" "Node setup complete, starting verification..."