Initial commit: agent-status dashboard (agent.pinksky.kr)

This commit is contained in:
2026-05-05 21:14:29 +09:00
commit 0ef19098ce
11 changed files with 4443 additions and 0 deletions
+338
View File
@@ -0,0 +1,338 @@
#!/usr/bin/env python3
"""Agent Status Health Checker v6 — real-time memory/VRAM collection + spec/box layout."""
import json, subprocess, time, re
from pathlib import Path
from datetime import datetime, timezone, timedelta
# Fixed UTC+9 offset (Korea Standard Time); used for the report timestamp.
KST = timezone(timedelta(hours=9))
# The JSON report is written next to this script.
STATUS_FILE = Path(__file__).parent / "health.json"
# ===== Static hardware spec info =====
# Human-readable spec line per PC key; copied into each PC's "spec" field.
PC_SPECS = {
    "macmini": "M4 Pro · 64GB · 통합 GPU",
    "mainpc": "Intel Core i7-14700K · 96GB · RTX 3090 24GB",
    "subpc": "AMD Ryzen 9 3900X · 32GB · RTX 3080 12GB",
}
# ===== Basic check functions =====
def http_check(url, timeout=5):
    """Probe ``url`` with curl and classify the HTTP status code.

    Args:
        url: URL to request.
        timeout: Seconds passed to curl's ``--max-time``; the subprocess
            itself is killed two seconds after that.

    Returns:
        dict with ``status`` ("online" for a 2xx/3xx code, "error" for any
        other code, "offline" when curl could not be run or timed out) and
        ``detail`` ("HTTP <code>" or "timeout").
    """
    cmd = ["curl", "-s", "-o", "/dev/null", "-w", "%{http_code}:%{time_total}",
           url, "--max-time", str(timeout)]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 2)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort probe: a hung process, a missing curl binary or
        # undecodable output all count as offline.  Ctrl-C and SystemExit
        # are deliberately NOT swallowed.
        return {"status": "offline", "detail": "timeout"}
    # str.split() never returns an empty list, so the fallback has to test
    # the first field itself rather than the list.
    code = r.stdout.strip().split(":")[0] or "000"
    status = "online" if code.startswith(("2", "3")) else "error"
    return {"status": status, "detail": f"HTTP {code}"}
def ssh_check(host, timeout=8):
    """Check that ``host`` accepts an SSH login and can run a command.

    Runs ``echo alive`` on the host and looks for that text in stdout.

    Args:
        host: SSH destination (host name or ssh-config alias).
        timeout: Seconds before the local ssh process is killed.

    Returns:
        dict with ``status`` ("online"/"offline") and ``detail``: "alive" on
        success, the first 80 characters of ssh's stderr on failure, or
        "timeout" when ssh could not be run or did not finish in time.
    """
    # NOTE(review): StrictHostKeyChecking=no disables host-key verification.
    cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
           host, "echo alive"]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort probe; Ctrl-C and SystemExit still propagate.
        return {"status": "offline", "detail": "timeout"}
    if r.returncode == 0 and "alive" in r.stdout:
        return {"status": "online", "detail": "alive"}
    return {"status": "offline", "detail": r.stderr.strip()[:80]}
def tailscale_ping(host, timeout=8):
    """Ping ``host`` once over Tailscale.

    Args:
        host: Tailscale peer name or address.
        timeout: Seconds before the local ``tailscale`` process is killed.

    Returns:
        dict with ``status`` ("online" when stdout contains "pong" or
        "is local", otherwise "offline") and ``detail`` ("" when online,
        "no pong" when the peer did not answer, "timeout" when the command
        could not be run or did not finish in time).
    """
    cmd = ["tailscale", "ping", "--until", "3s", "--c", "1", host]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort probe; Ctrl-C and SystemExit still propagate.
        return {"status": "offline", "detail": "timeout"}
    if "pong" in r.stdout or "is local" in r.stdout:
        return {"status": "online", "detail": ""}
    return {"status": "offline", "detail": "no pong"}
def proc_check(process, via=None):
    """Report whether a process matching ``process`` is running.

    Args:
        process: Pattern handed to ``pgrep -f`` (matched against the full
            command line).
        via: Optional SSH destination; when given, pgrep runs on that host
            instead of locally.

    Returns:
        dict with ``status`` ("online"/"offline") and ``detail``
        ("PID <first 20 chars of pgrep output>", "not running", or "error"
        when the command could not be run or timed out).
    """
    import shlex  # function-scope import keeps this block self-contained

    try:
        if via:
            # The remote shell that sshd spawns carries the whole command
            # string -- including the pattern -- on its own command line, so
            # a plain `pgrep -f hermes` would match that shell and always
            # report a PID.  Wrapping the first character in a bracket
            # expression ("[h]ermes") still matches "hermes" but no longer
            # matches the literal text of the command itself.
            pattern = process
            if pattern[:1].isalnum():
                pattern = f"[{pattern[0]}]{pattern[1:]}"
            # Quote the pattern so spaces or quotes in it cannot break or
            # alter the remote shell command.
            remote = f"pgrep -f {shlex.quote(pattern)} || echo NOT_FOUND"
            cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
                   via, remote]
        else:
            cmd = ["pgrep", "-f", process]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=8)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort probe; Ctrl-C and SystemExit still propagate.
        return {"status": "offline", "detail": "error"}
    out = r.stdout.strip()
    if r.returncode == 0 and out not in ("NOT_FOUND", ""):
        return {"status": "online", "detail": f"PID {out[:20]}"}
    return {"status": "offline", "detail": "not running"}
# ===== Status collection =====
def get_local_tailscale_ip():
    """Return this machine's Tailscale IPv4 address, or "?" if unavailable.

    "?" is returned when ``tailscale ip -4`` cannot be run, times out, exits
    non-zero or prints nothing, so callers never receive an empty string.
    """
    try:
        r = subprocess.run(["tailscale", "ip", "-4"],
                           capture_output=True, text=True, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    ip = r.stdout.strip()
    return ip if r.returncode == 0 and ip else "?"
def get_remote_tailscale_ip(via):
    """Return the Tailscale IPv4 address of the host reached via SSH.

    Args:
        via: SSH destination on which ``tailscale ip -4`` is run.

    Returns:
        The address printed by the remote command, or "?" when ssh could not
        be run, timed out, exited non-zero or printed nothing.
    """
    cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
           via, "tailscale ip -4"]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=8)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    ip = r.stdout.strip()
    return ip if r.returncode == 0 and ip else "?"
def get_version(label):
    """Return the version string of a locally installed agent.

    Args:
        label: "openclaw" or "hermes".

    Returns:
        For "openclaw": the ``--version`` output with the "OpenClaw " prefix
        and any parenthesised suffix removed.  For "hermes": the first
        ``v<digits and dots>`` token in the ``--version`` output.  "?" when
        the label is unknown, the command fails, or nothing usable is
        printed.
    """
    if label == "openclaw":
        cmd = ["openclaw", "--version"]
    elif label == "hermes":
        cmd = ["/Users/pinksky/.hermes/hermes-agent/venv/bin/python",
               "-m", "hermes_cli.main", "--version"]
    else:
        # Unknown labels used to fall through and return None.
        return "?"
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    out = r.stdout.strip()
    if label == "openclaw":
        v = out.replace("OpenClaw ", "")
        v = re.sub(r'\s+\([^)]+\)', '', v).strip()
        return v or "?"
    m = re.search(r'v[\d.]+', out)
    return m.group(0) if m else "?"
def get_uptime(pid_str):
    """Return the elapsed running time of a process as printed by ``ps``.

    Args:
        pid_str: Process id (a string; other values are converted with
            ``str``).

    Returns:
        The ``etime`` column for that PID, or "?" when ps cannot be run,
        times out, or prints nothing (e.g. the PID no longer exists).
    """
    try:
        r = subprocess.run(["ps", "-p", str(pid_str), "-o", "etime="],
                           capture_output=True, text=True, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    # An empty result means ps found no such process; report that as unknown
    # instead of handing callers an empty string.
    return r.stdout.strip() or "?"
def _vm_stat_used_bytes(text, default_page_size=16384):
    """Sum the "in use" page counters of ``vm_stat`` output and return bytes."""
    # vm_stat states the page size in its header line; fall back to the
    # 16 KiB default only when that header is missing.
    header = re.search(r'page size of (\d+) bytes', text)
    page_size = int(header.group(1)) if header else default_page_size
    # Labels can contain spaces ("wired down", "occupied by compressor"), so
    # the key is everything between "Pages " and the colon.  The one-word
    # spellings "wired"/"compressed" are kept for compatibility.
    used_keys = {"active", "inactive", "wired", "wired down",
                 "compressed", "occupied by compressor"}
    pages = 0
    for line in text.splitlines():
        m = re.search(r'Pages ([^:]+):\s+([0-9,]+)', line)
        if m and m.group(1).strip().lower() in used_keys:
            pages += int(m.group(2).replace(",", ""))
    return pages * page_size


def get_local_macos_memory():
    """Return local memory usage on macOS as "<used>GB/<total>GB".

    Total comes from ``sysctl -n hw.memsize``; used is the sum of the active,
    inactive, wired and compressor page counters reported by ``vm_stat``.

    Returns:
        The formatted string, or "?.?" when either command cannot be run,
        times out, or prints something that cannot be parsed.
    """
    try:
        r = subprocess.run(["sysctl", "-n", "hw.memsize"],
                           capture_output=True, text=True, timeout=5)
        total_gb = round(int(r.stdout.strip()) / (1024**3), 1)
        r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
        used_gb = round(_vm_stat_used_bytes(r.stdout) / (1024**3), 1)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort: not on macOS, tool missing, or unparsable output.
        return "?.?"
    return f"{used_gb}GB/{total_gb}GB"
def _remote_stdout(via, command):
    """Run ``command`` on ``via`` over ssh; return stripped stdout, "" on any failure."""
    cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
           via, command]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=8)
    except (subprocess.SubprocessError, OSError, ValueError):
        return ""
    return r.stdout.strip() if r.returncode == 0 else ""


def get_remote_memory(via):
    """Collect system RAM and GPU VRAM usage from a remote host over SSH.

    Args:
        via: SSH destination to query.

    Returns:
        dict with ``sys`` ("<used>/<total>GB" computed from ``free -m`` as
        (total - available) / total, or "?" when unavailable) and ``gpu``
        ("VRAM <used>GB/<total>GB" from the first GPU that ``nvidia-smi``
        lists, or None when unavailable).
    """
    result = {"sys": "?", "gpu": None}

    # The awk fields must be written as plain $2/$7: the command is passed
    # as a single argv element inside single quotes on the remote side, so a
    # backslash before "$" would reach awk verbatim and break the program.
    sys_out = _remote_stdout(
        via,
        "free -m | awk '/Mem:/ {printf \"%.1f/%.1f\", ($2-$7)/1024, $2/1024}'")
    if sys_out:
        result["sys"] = f"{sys_out}GB"

    gpu_out = _remote_stdout(
        via,
        "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits")
    if gpu_out:
        # nvidia-smi prints one CSV line per GPU; use the first one so hosts
        # with several GPUs still produce a value.
        parts = [x.strip() for x in gpu_out.splitlines()[0].split(",")]
        if len(parts) >= 2:
            try:
                used_mib, total_mib = float(parts[0]), float(parts[1])
            except ValueError:
                # Non-numeric field: leave gpu as None.
                pass
            else:
                result["gpu"] = f"VRAM {round(used_mib/1024,1)}GB/{round(total_mib/1024,1)}GB"
    return result
def proc_details(name_pattern, version_label=None, engine_name="?", via=None):
    """Collect status details for processes whose ``ps aux`` line matches a pattern.

    Args:
        name_pattern: Text to look for in ``ps aux`` output (substring test
            locally, ``grep`` pattern remotely).
        version_label: Label passed to ``get_version`` for local processes;
            ignored for remote ones (the remote version stays "?").
        engine_name: Value stored under ``engine`` in the result.
        via: Optional SSH destination; when given, the process list is read
            from that host instead of locally.

    Returns:
        dict with keys ``status``, ``version``, ``uptime``, ``proc_count``,
        ``engine``, ``detail``, ``mem`` and ``gpu``.  When nothing matches,
        or the process list cannot be obtained, the defaults are returned
        (status "offline", detail "not running").
    """
    import shlex  # function-scope import keeps this block self-contained

    result = {"status": "offline", "version": "?", "uptime": "?", "proc_count": 0,
              "engine": engine_name, "detail": "not running", "mem": None, "gpu": None}
    try:
        if via:
            # Quote the pattern so spaces or quotes in it cannot break or
            # alter the remote shell command.
            remote = (f"ps aux | grep {shlex.quote(name_pattern)}"
                      " | grep -v grep | grep -v health_check || true")
            r = subprocess.run(
                ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
                 via, remote],
                capture_output=True, text=True, timeout=8)
            matches = [line for line in r.stdout.strip().split("\n") if line.strip()]
            if matches:
                result["status"] = "online"
                result["proc_count"] = len(matches)
                # Remote version lookup is not implemented; "version" keeps
                # its "?" default.
                mem_data = get_remote_memory(via)
                result["mem"] = mem_data["sys"]
                result["gpu"] = mem_data["gpu"]
                result["detail"] = f"{result['proc_count']}프로세스"
        else:
            r = subprocess.run(["ps", "aux"], capture_output=True, text=True, timeout=5)
            # Skip grep invocations and this checker's own process line.
            matches = [line for line in r.stdout.strip().split("\n")
                       if name_pattern in line
                       and "grep" not in line
                       and "health_check" not in line]
            if matches:
                result["status"] = "online"
                result["proc_count"] = len(matches)
                # Second whitespace-separated column of `ps aux` is the PID.
                first = matches[0].strip().split(None, 2)
                if len(first) >= 2:
                    result["uptime"] = get_uptime(first[1])
                if version_label:
                    result["version"] = get_version(version_label)
                result["mem"] = get_local_macos_memory()
                result["detail"] = f"{result['uptime']} · {result['proc_count']}프로세스"
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort probe: keep whatever was filled in so far (by default
        # the "offline" record).  Ctrl-C and SystemExit still propagate.
        pass
    return result
# ===== Main =====
def _agent_entry(name, emoji, info):
    """Build an agent record: name/emoji, the probe fields, then status/detail last."""
    extra = {k: v for k, v in info.items() if k not in ("status", "detail")}
    return {"name": name, "emoji": emoji, **extra,
            "status": info["status"], "detail": info["detail"]}


def run_checks():
    """Run every probe once, write the report to STATUS_FILE and print a summary.

    The report groups PCs (and the agents on them) under four domains, adds
    a flat list of subdomain HTTP checks, and a summary that counts how many
    PCs/agents and how many domains are online.  Probes run sequentially.

    Returns:
        None.  Side effects: overwrites STATUS_FILE with UTF-8 JSON and
        prints one summary line to stdout.
    """
    ts = datetime.now(KST)
    ts_ip = get_local_tailscale_ip()

    # ===== pinksky.kr (home base) =====
    pinksky_domain = http_check("https://pinksky.kr/")
    macmini_detail = f"{ts_ip} · pinksky.kr"
    mainpc_ts = get_remote_tailscale_ip("mainpc-wsl")
    mimi = proc_details("openclaw-gateway", "openclaw", "openclaw")
    ruki = proc_details("hermes", "hermes", "hermes")
    mainpc = ssh_check("mainpc-win", 6)
    pink_hermes = proc_details("hermes", engine_name="hermes", via="mainpc-wsl")
    pinksky = {
        "domain": pinksky_domain,
        "pcs": {
            "macmini": {
                "name": "맥미니 M4 Pro", "emoji": "🖥️",
                "tailscale_ip": ts_ip, "dns": "pinksky.kr",
                # Not probed: status is fixed to "online" (presumably because
                # this script runs on that machine -- confirm).
                "status": "online", "detail": macmini_detail,
                "spec": PC_SPECS["macmini"],
                "agents": {
                    "mimi": _agent_entry("미미", "🦞", mimi),
                    "ruki": _agent_entry("루키", "🌱", ruki),
                },
            },
            "mainpc": {
                "name": "메인컴", "emoji": "🖥️",
                "tailscale_ip": mainpc_ts, "dns": "mainpc-wsl.pinksky.kr",
                **mainpc,
                "spec": PC_SPECS["mainpc"],
                "agents": {
                    "pink_hermes": _agent_entry("분홍", "🤖", pink_hermes),
                },
            },
        },
    }

    # ===== minicity.kr (home NAS) =====
    minicity_domain = http_check("https://minicity.kr/")
    nas_pc = ssh_check("mh-nas", 6)
    minicity = {
        "domain": minicity_domain,
        "pcs": {
            "nas": {
                "name": "NAS (minicity)", "emoji": "🗄️",
                **nas_pc, "agents": {},
            },
        },
    }

    # ===== pinksky.iptime.org (home DDNS / secondary PC) =====
    subpc_ts = get_remote_tailscale_ip("subpc-wsl")
    subpc = ssh_check("subpc-wsl", 6)
    sky_hermes = proc_details("hermes", engine_name="hermes", via="subpc-wsl")
    iptime = {
        # No HTTP probe for this domain: its status mirrors the SSH check.
        "domain": {"status": "online" if subpc["status"] == "online" else "error",
                   "detail": "DDNS"},
        "pcs": {
            "subpc": {
                "name": "보조컴", "emoji": "🖥️",
                "tailscale_ip": subpc_ts, "dns": "subpc-wsl.pinksky.kr",
                **subpc,
                "spec": PC_SPECS["subpc"],
                "agents": {
                    "sky_hermes": _agent_entry("하늘", "🤖", sky_hermes),
                },
            },
        },
    }

    # ===== gwenc.kr (company) =====
    gwenc_domain = http_check("https://gwenc.kr/")
    server5600 = tailscale_ping("gw-ps-5600", 6)
    proxmox = tailscale_ping("proxmox-minicity", 6)
    office_nas = tailscale_ping("gwenc-nas2", 6)
    backup_nas = tailscale_ping("ps-bk-nas", 6)
    office_pc = tailscale_ping("ps-i14700k-win", 6)
    office_pc_wsl = tailscale_ping("ps-i14700k-wsl", 6)
    gwenc = {
        "domain": gwenc_domain,
        "pcs": {
            "server5600": {"name": "서버컴 5600", "emoji": "🖥️", **server5600, "agents": {}},
            "proxmox": {"name": "Proxmox", "emoji": "🔶", **proxmox, "agents": {}},
            "office_nas": {"name": "회사 NAS", "emoji": "🗄️", **office_nas, "agents": {}},
            "backup_nas": {"name": "백업 NAS", "emoji": "🗄️", **backup_nas, "agents": {}},
            "office_pc": {
                "name": "회사 메인PC", "emoji": "🖥️", **office_pc,
                "agents": {
                    "office_pc_wsl": {"name": "회사 메인PC WSL", "emoji": "🖥️", **office_pc_wsl},
                },
            },
        },
    }

    # ===== Subdomains =====
    subdomains = [
        {"name": "wiki.pinksky.kr", "url": "https://wiki.pinksky.kr/", "group": "pinksky.kr", "desc": "MiniCITY 지식정원"},
        {"name": "openclaw.pinksky.kr", "url": "https://openclaw.pinksky.kr/", "group": "pinksky.kr", "desc": "OpenClaw 대시보드"},
        {"name": "agent.pinksky.kr", "url": "https://agent.pinksky.kr/", "group": "pinksky.kr", "desc": "에이전트 현황판"},
        {"name": "search.pinksky.kr", "url": "https://search.pinksky.kr/search?q=test", "group": "pinksky.kr", "desc": "SearXNG 메타서치"},
        {"name": "erp.pinksky.kr", "url": "https://erp.pinksky.kr/", "group": "pinksky.kr", "desc": "공사관리 ERP"},
        {"name": "dify.pinksky.kr", "url": "https://dify.pinksky.kr:8443/", "group": "pinksky.kr", "desc": "Dify 워크플로우"},
        {"name": "gwenc.kr", "url": "https://gwenc.kr/", "group": "gwenc.kr", "desc": "회사 메인 사이트"},
        {"name": "minicity.kr", "url": "https://minicity.kr/", "group": "minicity.kr", "desc": "집 NAS 웹 UI"},
    ]
    subdomain_results = [
        {"name": sd["name"], **http_check(sd["url"], 4), "group": sd["group"], "desc": sd["desc"]}
        for sd in subdomains
    ]

    # ===== Summary =====
    all_domains = {"pinksky.kr": pinksky, "minicity.kr": minicity,
                   "pinksky.iptime.org": iptime, "gwenc.kr": gwenc}
    # PCs and the agents on them are counted together in one flat list.
    all_pcs = []
    for d in all_domains.values():
        for pc in d["pcs"].values():
            all_pcs.append(pc)
            all_pcs.extend(pc["agents"].values())
    online = sum(1 for p in all_pcs if p["status"] == "online")
    total = len(all_pcs)
    domain_online = sum(1 for d in all_domains.values() if d["domain"]["status"] == "online")
    data = {
        "subdomains": subdomain_results,
        "domains": all_domains,
        "summary": {
            "total": total, "online": online, "offline": total - online,
            "domain_total": len(all_domains), "domain_online": domain_online,
        },
        "timestamp": ts.isoformat(),
        "timestamp_epoch": int(ts.timestamp()),
    }
    # ensure_ascii=False emits raw Korean text and emoji, so the file encoding
    # must be pinned instead of depending on the process locale.
    STATUS_FILE.write_text(json.dumps(data, indent=2, ensure_ascii=False),
                           encoding="utf-8")
    print(f"[{ts.strftime('%H:%M:%S')}] 온라인 {online}/{total} | 도메인 {domain_online}/{len(all_domains)}")
# Run one full check pass when executed as a script (not when imported).
if __name__ == "__main__":
    run_checks()