#!/usr/bin/env python3
"""Agent Status Health Checker v6 — real-time memory/VRAM collection + spec/box layout."""

import json
import re
import shlex
import subprocess
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path

KST = timezone(timedelta(hours=9))
STATUS_FILE = Path(__file__).parent / "health.json"

# ===== Static hardware spec strings =====
PC_SPECS = {
    "macmini": "M4 Pro · 64GB · 통합 GPU",
    "mainpc": "Intel Core i7-14700K · 96GB · RTX 3090 24GB",
    "subpc": "AMD Ryzen 9 3900X · 32GB · RTX 3080 12GB",
}

# Common ssh prefix used by every remote probe in this section.
# NOTE(review): host-key verification is disabled here; confirm that is
# intentional for these hosts.
SSH_BASE_CMD = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no"]

# Everything subprocess.run() is expected to raise for a failed probe:
# timeouts (SubprocessError), a missing/non-executable binary (OSError) and
# undecodable output or invalid arguments (ValueError).  Unlike a bare
# ``except:`` this lets KeyboardInterrupt / SystemExit propagate.
PROBE_ERRORS = (subprocess.SubprocessError, OSError, ValueError)

# Command line used to ask each known engine for its version.
VERSION_COMMANDS = {
    "openclaw": ["openclaw", "--version"],
    "hermes": [
        "/Users/pinksky/.hermes/hermes-agent/venv/bin/python",
        "-m", "hermes_cli.main", "--version",
    ],
}


# ===== Basic check functions =====
def classify_http_code(curl_stdout):
    """Convert curl's ``%{http_code}:%{time_total}`` output into a status dict.

    2xx/3xx codes are reported as "online", anything else as "error".  Empty
    output is treated as code "000" so the detail never reads "HTTP ".
    """
    code = curl_stdout.strip().split(":")[0] or "000"
    status = "online" if code.startswith(("2", "3")) else "error"
    return {"status": status, "detail": f"HTTP {code}"}


def http_check(url, timeout=5):
    """Probe *url* with curl and return ``{"status": ..., "detail": ...}``.

    Returns status "offline" when curl cannot be run or does not finish
    within ``timeout + 2`` seconds.
    """
    try:
        r = subprocess.run(
            ["curl", "-s", "-o", "/dev/null", "-w", "%{http_code}:%{time_total}",
             url, "--max-time", str(timeout)],
            capture_output=True, text=True, timeout=timeout + 2)
    except PROBE_ERRORS:
        return {"status": "offline", "detail": "timeout"}
    return classify_http_code(r.stdout)


def ssh_check(host, timeout=8):
    """Run ``echo alive`` on *host* over ssh and report whether it answered.

    On failure the detail is the first 80 characters of ssh's stderr, or
    "timeout" when ssh itself could not be run / timed out.
    """
    try:
        r = subprocess.run(
            SSH_BASE_CMD + [host, "echo alive"],
            capture_output=True, text=True, timeout=timeout)
    except PROBE_ERRORS:
        return {"status": "offline", "detail": "timeout"}
    if r.returncode == 0 and "alive" in r.stdout:
        return {"status": "online", "detail": "alive"}
    return {"status": "offline", "detail": r.stderr.strip()[:80]}


def tailscale_ping(host, timeout=8):
    """Send one ``tailscale ping`` to *host* and report online/offline."""
    try:
        r = subprocess.run(
            ["tailscale", "ping", "--until", "3s", "--c", "1", host],
            capture_output=True, text=True, timeout=timeout)
    except PROBE_ERRORS:
        return {"status": "offline", "detail": "timeout"}
    alive = "pong" in r.stdout or "is local" in r.stdout
    return {"status": "online" if alive else "offline",
            "detail": "" if alive else "no pong"}


def self_excluding_pattern(pattern):
    """Rewrite a regex so it does not match a command line that contains it.

    ``hermes`` becomes ``[h]ermes``: the regex matches the same strings, but
    the literal text ``[h]ermes`` on the remote shell's own command line is
    no longer a match for ``pgrep -f``.  Patterns that do not start with an
    ASCII letter or digit are returned unchanged.
    """
    if pattern and pattern[0].isascii() and pattern[0].isalnum():
        return f"[{pattern[0]}]{pattern[1:]}"
    return pattern


def proc_check(process, via=None):
    """Report whether a process matching *process* exists (``pgrep -f``).

    With *via* set, the check runs on that ssh host.  The pattern is shell
    quoted and rewritten with :func:`self_excluding_pattern`, because the
    remote shell executing ``pgrep ... || echo NOT_FOUND`` carries the
    pattern on its own command line and would otherwise always be matched.
    """
    if via:
        remote = f"pgrep -f {shlex.quote(self_excluding_pattern(process))} || echo NOT_FOUND"
        cmd = SSH_BASE_CMD + [via, remote]
    else:
        cmd = ["pgrep", "-f", process]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=8)
    except PROBE_ERRORS:
        return {"status": "offline", "detail": "error"}
    out = r.stdout.strip()
    alive = out not in ("NOT_FOUND", "") and r.returncode == 0
    return {"status": "online" if alive else "offline",
            "detail": f"PID {out[:20]}" if alive else "not running"}


# ===== Status collection =====
def first_line_or_unknown(text):
    """Return the first non-blank line of *text*, or "?" when there is none."""
    lines = text.strip().splitlines()
    return lines[0].strip() if lines else "?"


def get_local_tailscale_ip():
    """Return this machine's Tailscale IPv4 address, or "?" on any failure."""
    try:
        r = subprocess.run(["tailscale", "ip", "-4"],
                           capture_output=True, text=True, timeout=5)
    except PROBE_ERRORS:
        return "?"
    return first_line_or_unknown(r.stdout) if r.returncode == 0 else "?"


def get_remote_tailscale_ip(via):
    """Return the Tailscale IPv4 address of ssh host *via*, or "?" on failure."""
    try:
        r = subprocess.run(
            SSH_BASE_CMD + [via, "tailscale ip -4"],
            capture_output=True, text=True, timeout=8)
    except PROBE_ERRORS:
        return "?"
    return first_line_or_unknown(r.stdout) if r.returncode == 0 else "?"


def parse_version_output(label, output):
    """Extract a version string for engine *label* from ``--version`` output.

    Returns "?" when nothing usable is found or the label is unknown.
    """
    text = output.strip()
    if label == "openclaw":
        # Drop the product name and any trailing parenthesised suffix.
        text = text.replace("OpenClaw ", "")
        text = re.sub(r'\s+\([^)]+\)', '', text).strip()
        return text or "?"
    if label == "hermes":
        found = re.search(r'v[\d.]+', text)
        return found.group(0) if found else "?"
    return "?"


def get_version(label):
    """Return the installed version of engine *label* ("openclaw"/"hermes").

    Always returns a string; unknown labels and failed commands yield "?".
    """
    cmd = VERSION_COMMANDS.get(label)
    if cmd is None:
        return "?"
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except PROBE_ERRORS:
        return "?"
    return parse_version_output(label, r.stdout)


def get_uptime(pid_str):
    """Return ``ps -o etime=`` for *pid_str*, or "?" if it cannot be read."""
    try:
        r = subprocess.run(["ps", "-p", pid_str, "-o", "etime="],
                           capture_output=True, text=True, timeout=5)
    except PROBE_ERRORS:
        return "?"
    # ps prints nothing when the PID has already exited.
    return r.stdout.strip() or "?"
def parse_vm_stat_used_bytes(vm_stat_output, default_page_size=16384):
    """Return used memory in bytes computed from macOS ``vm_stat`` output.

    "Used" is the sum of the active, inactive, wired and compressor-occupied
    page counts.  The page size is taken from the ``page size of N bytes``
    header when present, otherwise *default_page_size* is used.
    """
    header = re.search(r'page size of (\d+) bytes', vm_stat_output)
    page_size = int(header.group(1)) if header else default_page_size
    # vm_stat labels contain spaces ("wired down", "occupied by compressor"),
    # so the label is captured up to the colon rather than as a single word.
    used_labels = {"active", "inactive", "wired", "wired down",
                   "compressed", "occupied by compressor"}
    pages = 0
    for line in vm_stat_output.splitlines():
        m = re.match(r'\s*Pages ([^:]+):\s+([0-9,]+)', line)
        if m and m.group(1).strip().lower() in used_labels:
            pages += int(m.group(2).replace(",", ""))
    return pages * page_size


def get_local_macos_memory():
    """Return local memory usage as ``"<used>GB/<total>GB"``, or "?.?" on failure.

    Total comes from ``sysctl -n hw.memsize`` and used from ``vm_stat``.
    """
    try:
        r = subprocess.run(["sysctl", "-n", "hw.memsize"],
                           capture_output=True, text=True, timeout=5)
        total_gb = round(int(r.stdout.strip()) / (1024**3), 1)
        r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
        used_gb = round(parse_vm_stat_used_bytes(r.stdout) / (1024**3), 1)
        return f"{used_gb}GB/{total_gb}GB"
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?.?"


def parse_free_m(free_output):
    """Return ``"<used>/<total>"`` in GB (one decimal) from ``free -m`` output.

    Used is total minus the "available" column.  Returns None when no
    ``Mem:`` row with an available column is found.
    """
    for line in free_output.splitlines():
        fields = line.split()
        if len(fields) >= 7 and fields[0] == "Mem:":
            try:
                total, available = float(fields[1]), float(fields[6])
            except ValueError:
                return None
            return f"{(total - available) / 1024:.1f}/{total / 1024:.1f}"
    return None


def parse_nvidia_smi_vram(smi_output):
    """Return ``"VRAM <used>GB/<total>GB"`` from nvidia-smi CSV output.

    Expects ``memory.used,memory.total`` rows in MiB without header or units
    (one row per GPU); rows are summed.  Returns None when no row parses.
    """
    used = total = 0.0
    parsed_any = False
    for line in smi_output.splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 2:
            continue
        try:
            u, t = float(parts[0]), float(parts[1])
        except ValueError:
            continue  # e.g. "[N/A]" entries
        used += u
        total += t
        parsed_any = True
    if not parsed_any:
        return None
    return f"VRAM {round(used / 1024, 1)}GB/{round(total / 1024, 1)}GB"


def get_remote_memory(via):
    """Collect system memory and VRAM usage from ssh host *via*.

    Returns ``{"sys": str, "gpu": str | None}``; "sys" stays "?" and "gpu"
    stays None when the corresponding probe fails.
    """
    ssh = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", via]
    probe_errors = (subprocess.SubprocessError, OSError, ValueError)
    result = {"sys": "?", "gpu": None}

    # The `free` output is parsed locally, which avoids quoting an awk
    # program through two shells.  LC_ALL=C keeps the "Mem:" row label stable.
    try:
        r = subprocess.run(ssh + ["LC_ALL=C free -m"],
                           capture_output=True, text=True, timeout=8)
        if r.returncode == 0:
            usage = parse_free_m(r.stdout)
            if usage:
                result["sys"] = f"{usage}GB"
    except probe_errors:
        pass  # best effort: keep the "?" placeholder

    try:
        r = subprocess.run(
            ssh + ["nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=8)
        if r.returncode == 0:
            result["gpu"] = parse_nvidia_smi_vram(r.stdout)
    except probe_errors:
        pass  # best effort: host may have no GPU or be unreachable

    return result


def proc_details(name_pattern, version_label=None, engine_name="?", via=None):
    """Describe the processes matching *name_pattern*, locally or on *via*.

    Returns a dict with keys status, version, uptime, proc_count, engine,
    detail, mem and gpu.  Defaults describe an offline agent; fields are
    filled in only when at least one matching process is found.
    """
    import shlex  # local import so this block does not depend on file-level imports

    result = {"status": "offline", "version": "?", "uptime": "?", "proc_count": 0,
              "engine": engine_name, "detail": "not running", "mem": None, "gpu": None}
    try:
        if via:
            # The remote shell's own command line contains "grep", so the
            # `grep -v grep` stage also filters it out.
            remote = (f"ps aux | grep {shlex.quote(name_pattern)} | grep -v grep"
                      " | grep -v health_check || true")
            r = subprocess.run(
                ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", via, remote],
                capture_output=True, text=True, timeout=8)
            lines = [l for l in r.stdout.strip().split("\n") if l.strip()]
            if lines:
                result["status"] = "online"
                result["proc_count"] = len(lines)
                # Remote version checks are skipped (too involved); the
                # version stays "?".
                mem_data = get_remote_memory(via)
                result["mem"] = mem_data["sys"]
                result["gpu"] = mem_data["gpu"]
                result["detail"] = f"{result['proc_count']}프로세스"
        else:
            r = subprocess.run(["ps", "aux"], capture_output=True, text=True, timeout=5)
            lines = [l for l in r.stdout.strip().split("\n")
                     if name_pattern in l and "grep" not in l and "health_check" not in l]
            if lines:
                result["status"] = "online"
                result["proc_count"] = len(lines)
                # Second column of `ps aux` is the PID of the first match.
                first = lines[0].strip().split(None, 2)
                if len(first) >= 2:
                    result["uptime"] = get_uptime(first[1])
                if version_label:
                    result["version"] = get_version(version_label)
                result["mem"] = get_local_macos_memory()
                result["detail"] = f"{result['uptime']} · {result['proc_count']}프로세스"
    except (subprocess.SubprocessError, OSError, ValueError):
        # The probe itself failed; do not claim the process is "not running".
        if result["status"] != "online":
            result["detail"] = "check failed"
    return result


def agent_entry(name, emoji, details):
    """Build an agent record: name, emoji, extra fields, then status and detail.

    The key order matches the layout written to the status file.
    """
    entry = {"name": name, "emoji": emoji}
    entry.update((k, v) for k, v in details.items() if k not in ("status", "detail"))
    entry["status"] = details["status"]
    entry["detail"] = details["detail"]
    return entry


# ===== Main =====
def run_checks():
    """Run every probe, write the result to STATUS_FILE and print a summary."""
    ts = datetime.now(KST)
    ts_ip = get_local_tailscale_ip()

    # ===== pinksky.kr (home base) =====
    pinksky_domain = http_check("https://pinksky.kr/")
    macmini_detail = f"{ts_ip} · pinksky.kr"
    mainpc_ts = get_remote_tailscale_ip("mainpc-wsl")

    mimi = proc_details("openclaw-gateway", "openclaw", "openclaw")
    ruki = proc_details("hermes", "hermes", "hermes")

    mainpc = ssh_check("mainpc-win", 6)
    pink_hermes = proc_details("hermes", engine_name="hermes", via="mainpc-wsl")

    pinksky = {
        "domain": pinksky_domain,
        "pcs": {
            "macmini": {
                "name": "맥미니 M4 Pro", "emoji": "🖥️",
                "tailscale_ip": ts_ip,
                "dns": "pinksky.kr",
                # This script runs on the Mac mini, so it is online by definition.
                "status": "online", "detail": macmini_detail,
                "spec": PC_SPECS["macmini"],
                "agents": {
                    "mimi": agent_entry("미미", "🦞", mimi),
                    "ruki": agent_entry("루키", "🌱", ruki),
                },
            },
            "mainpc": {
                "name": "메인컴", "emoji": "🖥️",
                "tailscale_ip": mainpc_ts,
                "dns": "mainpc-wsl.pinksky.kr",
                **mainpc,
                "spec": PC_SPECS["mainpc"],
                "agents": {
                    "pink_hermes": agent_entry("분홍", "🤖", pink_hermes),
                },
            },
        },
    }

    # ===== minicity.kr (home NAS) =====
    minicity_domain = http_check("https://minicity.kr/")
    nas_pc = ssh_check("mh-nas", 6)
    minicity = {
        "domain": minicity_domain,
        "pcs": {
            "nas": {"name": "NAS (minicity)", "emoji": "🗄️", **nas_pc, "agents": {}},
        },
    }

    # ===== pinksky.iptime.org (home DDNS / secondary PC) =====
    subpc_ts = get_remote_tailscale_ip("subpc-wsl")
    subpc = ssh_check("subpc-wsl", 6)
    sky_hermes = proc_details("hermes", engine_name="hermes", via="subpc-wsl")
    iptime = {
        # No HTTP probe for this domain: its status mirrors the ssh check.
        "domain": {"status": "online" if subpc["status"] == "online" else "error",
                   "detail": "DDNS"},
        "pcs": {
            "subpc": {
                "name": "보조컴", "emoji": "🖥️",
                "tailscale_ip": subpc_ts,
                "dns": "subpc-wsl.pinksky.kr",
                **subpc,
                "spec": PC_SPECS["subpc"],
                "agents": {
                    "sky_hermes": agent_entry("하늘", "🤖", sky_hermes),
                },
            },
        },
    }

    # ===== gwenc.kr (office) =====
    gwenc_domain = http_check("https://gwenc.kr/")
    server5600 = tailscale_ping("gw-ps-5600", 6)
    proxmox = tailscale_ping("proxmox-minicity", 6)
    office_nas = tailscale_ping("gwenc-nas2", 6)
    backup_nas = tailscale_ping("ps-bk-nas", 6)
    office_pc = tailscale_ping("ps-i14700k-win", 6)
    office_pc_wsl = tailscale_ping("ps-i14700k-wsl", 6)

    gwenc = {
        "domain": gwenc_domain,
        "pcs": {
            "server5600": {"name": "서버컴 5600", "emoji": "🖥️", **server5600, "agents": {}},
            "proxmox": {"name": "Proxmox", "emoji": "🔶", **proxmox, "agents": {}},
            "office_nas": {"name": "회사 NAS", "emoji": "🗄️", **office_nas, "agents": {}},
            "backup_nas": {"name": "백업 NAS", "emoji": "🗄️", **backup_nas, "agents": {}},
            "office_pc": {
                "name": "회사 메인PC", "emoji": "🖥️", **office_pc,
                "agents": {
                    "office_pc_wsl": {"name": "회사 메인PC WSL", "emoji": "🖥️", **office_pc_wsl},
                },
            },
        },
    }

    # ===== Subdomains =====
    subdomains = [
        {"name": "wiki.pinksky.kr", "url": "https://wiki.pinksky.kr/", "group": "pinksky.kr", "desc": "MiniCITY 지식정원"},
        {"name": "openclaw.pinksky.kr", "url": "https://openclaw.pinksky.kr/", "group": "pinksky.kr", "desc": "OpenClaw 대시보드"},
        {"name": "agent.pinksky.kr", "url": "https://agent.pinksky.kr/", "group": "pinksky.kr", "desc": "에이전트 현황판"},
        {"name": "search.pinksky.kr", "url": "https://search.pinksky.kr/search?q=test", "group": "pinksky.kr", "desc": "SearXNG 메타서치"},
        {"name": "erp.pinksky.kr", "url": "https://erp.pinksky.kr/", "group": "pinksky.kr", "desc": "공사관리 ERP"},
        {"name": "dify.pinksky.kr", "url": "https://dify.pinksky.kr:8443/", "group": "pinksky.kr", "desc": "Dify 워크플로우"},
        {"name": "gwenc.kr", "url": "https://gwenc.kr/", "group": "gwenc.kr", "desc": "회사 메인 사이트"},
        {"name": "minicity.kr", "url": "https://minicity.kr/", "group": "minicity.kr", "desc": "집 NAS 웹 UI"},
    ]
    subdomain_results = [
        {"name": sd["name"], **http_check(sd["url"], 4), "group": sd["group"], "desc": sd["desc"]}
        for sd in subdomains
    ]

    # ===== Summary =====
    all_domains = {"pinksky.kr": pinksky, "minicity.kr": minicity,
                   "pinksky.iptime.org": iptime, "gwenc.kr": gwenc}
    # Machines and the agents running on them are counted together.
    all_pcs = []
    for d in all_domains.values():
        for pc in d["pcs"].values():
            all_pcs.append(pc)
            all_pcs.extend(pc["agents"].values())

    online = sum(1 for p in all_pcs if p["status"] == "online")
    total = len(all_pcs)
    domain_online = sum(1 for d in all_domains.values() if d["domain"]["status"] == "online")

    data = {
        "subdomains": subdomain_results,
        "domains": all_domains,
        "summary": {
            "total": total, "online": online, "offline": total - online,
            "domain_total": len(all_domains), "domain_online": domain_online,
        },
        "timestamp": ts.isoformat(),
        "timestamp_epoch": int(ts.timestamp()),
    }

    # Write to a sibling temp file and rename it over the target so a reader
    # never sees a half-written file.  The payload is non-ASCII
    # (ensure_ascii=False), so the encoding is pinned explicitly.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    tmp_file = STATUS_FILE.with_name(STATUS_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    tmp_file.replace(STATUS_FILE)

    print(f"[{ts.strftime('%H:%M:%S')}] 온라인 {online}/{total} | 도메인 {domain_online}/{len(all_domains)}")


if __name__ == "__main__":
    run_checks()