Initial commit: agent-status dashboard (agent.pinksky.kr)
This commit is contained in:
Executable
+338
@@ -0,0 +1,338 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Agent Status Health Checker v6 — 메모리/VRAM 실시간 수집 + 사양/박스 구조"""
|
||||
import json
import re
import shlex
import subprocess
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
|
||||
|
||||
# Fixed UTC+09:00 offset used for the report timestamps.
KST = timezone(timedelta(hours=9))
# The JSON report is written next to this script file.
STATUS_FILE = Path(__file__).with_name("health.json")
|
||||
|
||||
# ===== 정적 사양 정보 =====
|
||||
# Static, human-readable hardware summary per machine key.
PC_SPECS = dict(
    macmini="M4 Pro · 64GB · 통합 GPU",
    mainpc="Intel Core i7-14700K · 96GB · RTX 3090 24GB",
    subpc="AMD Ryzen 9 3900X · 32GB · RTX 3080 12GB",
)
|
||||
|
||||
# ===== 기본 체크 함수 =====
|
||||
def http_check(url, timeout=5):
    """Probe *url* with ``curl`` and classify the HTTP status code.

    Args:
        url: URL to request.
        timeout: Seconds passed to curl's ``--max-time``; the subprocess is
            given two extra seconds before it is killed.

    Returns:
        A dict with ``status`` and ``detail`` keys. ``status`` is ``"online"``
        for 2xx/3xx codes, ``"error"`` for any other code, and ``"offline"``
        when curl timed out or could not be run.
    """
    cmd = ["curl", "-s", "-o", "/dev/null", "-w", "%{http_code}:%{time_total}",
           url, "--max-time", str(timeout)]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 2)
    except subprocess.TimeoutExpired:
        return {"status": "offline", "detail": "timeout"}
    except (OSError, ValueError) as exc:
        # curl is missing/not executable, or its output could not be decoded;
        # report that instead of mislabelling it as a timeout.
        return {"status": "offline", "detail": f"curl failed: {exc}"[:80]}

    # Only the status code is used; empty output is reported as code 000
    # rather than producing a blank "HTTP " detail.
    code = r.stdout.strip().partition(":")[0] or "000"
    status = "online" if code.startswith(("2", "3")) else "error"
    return {"status": status, "detail": f"HTTP {code}"}
|
||||
|
||||
def ssh_check(host, timeout=8):
    """Check that *host* accepts an SSH session by running ``echo alive`` on it.

    Args:
        host: SSH destination (host name or alias).
        timeout: Seconds to wait for the whole ssh subprocess.

    Returns:
        A dict with ``status`` (``"online"`` / ``"offline"``) and ``detail``.
        When the probe runs but fails, ``detail`` is the first 80 characters
        of ssh's stderr.
    """
    # NOTE(review): StrictHostKeyChecking=no disables host-key verification;
    # kept as in the original, confirm this is acceptable for these hosts.
    cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
           host, "echo alive"]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return {"status": "offline", "detail": "timeout"}
    except (OSError, ValueError) as exc:
        # ssh binary missing or output undecodable: not a timeout.
        return {"status": "offline", "detail": f"ssh failed: {exc}"[:80]}

    alive = r.returncode == 0 and "alive" in r.stdout
    if alive:
        return {"status": "online", "detail": "alive"}
    return {"status": "offline", "detail": r.stderr.strip()[:80]}
|
||||
|
||||
def tailscale_ping(host, timeout=8):
    """Ping *host* once through the ``tailscale ping`` CLI.

    Args:
        host: Tailscale peer name or address.
        timeout: Seconds to wait for the subprocess.

    Returns:
        A dict with ``status`` (``"online"`` / ``"offline"``) and ``detail``.
        The host counts as online when stdout contains ``pong`` or
        ``is local``.
    """
    cmd = ["tailscale", "ping", "--until", "3s", "--c", "1", host]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return {"status": "offline", "detail": "timeout"}
    except (OSError, ValueError) as exc:
        # tailscale CLI missing or output undecodable: not a timeout.
        return {"status": "offline", "detail": f"tailscale failed: {exc}"[:80]}

    alive = "pong" in r.stdout or "is local" in r.stdout
    if alive:
        return {"status": "online", "detail": ""}
    return {"status": "offline", "detail": "no pong"}
|
||||
|
||||
def proc_check(process, via=None):
    """Report whether a process matching *process* is running.

    Args:
        process: Pattern handed to ``pgrep -f`` (matched against the full
            command line).
        via: Optional SSH destination; when given, pgrep runs on that host.

    Returns:
        A dict with ``status`` (``"online"`` / ``"offline"``) and ``detail``
        (``"PID ..."``, ``"not running"`` or ``"error"``).
    """
    try:
        if via:
            # The shell that ssh starts on the remote side carries the whole
            # command string in its own argv, so a plain `pgrep -f PATTERN`
            # would always match that shell. Writing the first character as a
            # bracket expression matches the same processes but not the
            # literal text of the command itself.
            pattern = process
            if process[:1].isalnum():
                pattern = f"[{process[0]}]{process[1:]}"
            remote_cmd = f"pgrep -f {shlex.quote(pattern)} || echo NOT_FOUND"
            cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
                   via, remote_cmd]
        else:
            cmd = ["pgrep", "-f", process]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=8)
    except (subprocess.SubprocessError, OSError, ValueError):
        return {"status": "offline", "detail": "error"}

    out = r.stdout.strip()
    alive = r.returncode == 0 and out not in ("NOT_FOUND", "")
    if not alive:
        return {"status": "offline", "detail": "not running"}
    # pgrep prints one PID per line; flatten to one line before truncating.
    pids = " ".join(out.split())
    return {"status": "online", "detail": f"PID {pids[:20]}"}
|
||||
|
||||
# ===== 상태 수집 =====
|
||||
def get_local_tailscale_ip():
    """Return this machine's Tailscale IPv4 address.

    Returns:
        The stripped output of ``tailscale ip -4``, or ``"?"`` when the
        command cannot be run, times out, exits non-zero or prints nothing.
    """
    try:
        r = subprocess.run(["tailscale", "ip", "-4"],
                           capture_output=True, text=True, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    ip = r.stdout.strip()
    # Same rule as get_remote_tailscale_ip: a failed command is "?", never "".
    return ip if r.returncode == 0 and ip else "?"
|
||||
|
||||
def get_remote_tailscale_ip(via):
    """Return the Tailscale IPv4 address reported by the host *via*.

    Args:
        via: SSH destination on which ``tailscale ip -4`` is executed.

    Returns:
        The stripped command output, or ``"?"`` when ssh cannot be run, times
        out, exits non-zero or prints nothing.
    """
    cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
           via, "tailscale ip -4"]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=8)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    ip = r.stdout.strip()
    return ip if r.returncode == 0 and ip else "?"
|
||||
|
||||
def get_version(label):
    """Return the version string of a locally installed agent.

    Args:
        label: ``"openclaw"`` or ``"hermes"``.

    Returns:
        The parsed version, or ``"?"`` when the label is not recognised, the
        command fails, or no version can be extracted from its output.
    """
    try:
        if label == "openclaw":
            r = subprocess.run(["openclaw", "--version"],
                               capture_output=True, text=True, timeout=5)
            v = r.stdout.strip().replace("OpenClaw ", "")
            # Drop a trailing parenthesised suffix such as " (build info)".
            v = re.sub(r'\s+\([^)]+\)', '', v).strip()
            return v if v else "?"
        if label == "hermes":
            r = subprocess.run([
                "/Users/pinksky/.hermes/hermes-agent/venv/bin/python",
                "-m", "hermes_cli.main", "--version"
            ], capture_output=True, text=True, timeout=5)
            m = re.search(r'v[\d.]+', r.stdout.strip())
            return m.group(0) if m else "?"
    except (subprocess.SubprocessError, OSError, ValueError):
        pass  # best effort: fall through to the "unknown" marker
    # Unknown labels used to fall off the end and return None.
    return "?"
|
||||
|
||||
def get_uptime(pid_str):
    """Return the elapsed running time of a process as printed by ``ps``.

    Args:
        pid_str: Process id (converted with ``str()`` before being passed to ps).

    Returns:
        The stripped output of ``ps -p PID -o etime=``, or ``"?"`` when ps
        cannot be run, times out or prints nothing (e.g. the PID is gone).
    """
    try:
        r = subprocess.run(["ps", "-p", str(pid_str), "-o", "etime="],
                           capture_output=True, text=True, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?"
    return r.stdout.strip() or "?"
|
||||
|
||||
def get_local_macos_memory(vm_stat_output=None, memsize_bytes=None):
    """Return local macOS memory usage as ``"<used>GB/<total>GB"``.

    "Used" is the sum of the active, inactive, wired and compressor-occupied
    page counts reported by ``vm_stat``, multiplied by the page size.

    Args:
        vm_stat_output: Optional pre-captured ``vm_stat`` text. When ``None``
            the command is executed.
        memsize_bytes: Optional total memory in bytes. When ``None`` it is read
            from ``sysctl -n hw.memsize``.

    Returns:
        The formatted usage string, or ``"?.?"`` when a command cannot be run
        or its output cannot be parsed.
    """
    # vm_stat row labels that count as used memory. The row names are
    # "Pages wired down" and "Pages occupied by compressor"; a single-word
    # match on "wired" / "compressed" never hits them.
    used_labels = (
        "Pages active",
        "Pages inactive",
        "Pages wired down",
        "Pages occupied by compressor",
    )
    try:
        if memsize_bytes is None:
            r = subprocess.run(["sysctl", "-n", "hw.memsize"],
                               capture_output=True, text=True, timeout=5)
            memsize_bytes = int(r.stdout.strip())
        total_gb = round(memsize_bytes / (1024**3), 1)

        if vm_stat_output is None:
            r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
            vm_stat_output = r.stdout

        # The header line states the page size; fall back to the previously
        # hard-coded 16384 if it is absent.
        header = re.search(r'page size of (\d+) bytes', vm_stat_output)
        page_size = int(header.group(1)) if header else 16384

        used_pages = 0
        for line in vm_stat_output.splitlines():
            m = re.match(r'\s*"?([^":]+)"?:\s+([0-9,]+)', line)
            if m and m.group(1).strip() in used_labels:
                used_pages += int(m.group(2).replace(",", ""))

        used_gb = round(used_pages * page_size / (1024**3), 1)
        return f"{used_gb}GB/{total_gb}GB"
    except (subprocess.SubprocessError, OSError, ValueError):
        return "?.?"
|
||||
|
||||
def get_remote_memory(via):
    """Collect system RAM and GPU VRAM usage from the host *via* over SSH.

    Args:
        via: SSH destination.

    Returns:
        A dict with two keys. ``sys`` is ``"<used>/<total>GB"`` computed from
        ``free -m`` (column 2 minus column 7, over column 2), or ``"?"`` when
        unavailable. ``gpu`` is ``"VRAM <used>GB/<total>GB"`` taken from the
        first line of ``nvidia-smi`` output, or ``None`` when unavailable.
    """
    result = {"sys": "?", "gpu": None}
    ssh_base = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", via]
    failures = (subprocess.SubprocessError, OSError, ValueError)

    # The awk program is single-quoted for the remote shell, so `$2` must be
    # sent as-is; a backslash before it is a syntax error in awk.
    free_cmd = "free -m | awk '/Mem:/ {printf \"%.1f/%.1f\", ($2-$7)/1024, $2/1024}'"
    try:
        r = subprocess.run(ssh_base + [free_cmd],
                           capture_output=True, text=True, timeout=8)
        out = r.stdout.strip()
        if out and r.returncode == 0:
            result["sys"] = f"{out}GB"
    except failures:
        pass  # best effort: keep the "?" placeholder

    gpu_cmd = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits"
    try:
        r = subprocess.run(ssh_base + [gpu_cmd],
                           capture_output=True, text=True, timeout=8)
        rows = r.stdout.strip().splitlines()
        if rows and r.returncode == 0:
            # nvidia-smi prints one row per GPU; report the first one so
            # multi-GPU hosts do not break the numeric parse.
            parts = [x.strip() for x in rows[0].split(",")]
            if len(parts) >= 2:
                u, t = float(parts[0]), float(parts[1])
                result["gpu"] = f"VRAM {round(u/1024,1)}GB/{round(t/1024,1)}GB"
    except failures:
        pass  # best effort: hosts without a usable nvidia-smi keep gpu=None

    return result
|
||||
|
||||
def proc_details(name_pattern, version_label=None, engine_name="?", via=None):
    """Describe an agent process found in ``ps aux`` output, locally or over SSH.

    Args:
        name_pattern: Text to look for in the ``ps aux`` lines. Locally it is a
            plain substring test; remotely it is passed to ``grep``.
        version_label: Label forwarded to ``get_version`` (local lookups only).
        engine_name: Value stored unchanged under the ``engine`` key.
        via: Optional SSH destination; when given, the lookup runs there.

    Returns:
        A dict with keys ``status``, ``version``, ``uptime``, ``proc_count``,
        ``engine``, ``detail``, ``mem`` and ``gpu``. When nothing matches or a
        command fails, the offline defaults are returned.
    """
    result = {"status": "offline", "version": "?", "uptime": "?", "proc_count": 0,
              "engine": engine_name, "detail": "not running", "mem": None, "gpu": None}
    try:
        if via:
            # Quote the pattern so it reaches the remote grep as one argument.
            remote_cmd = (f"ps aux | grep {shlex.quote(name_pattern)}"
                          " | grep -v grep | grep -v health_check || true")
            r = subprocess.run(
                ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no",
                 via, remote_cmd],
                capture_output=True, text=True, timeout=8)
            lines = [l for l in r.stdout.strip().split("\n") if l.strip()]
            if lines:
                result["status"] = "online"
                result["proc_count"] = len(lines)
                # The remote version is not collected; "version" stays "?".
                mem_data = get_remote_memory(via)
                result["mem"] = mem_data["sys"]
                result["gpu"] = mem_data["gpu"]
                result["detail"] = f"{result['proc_count']}프로세스"
        else:
            r = subprocess.run(["ps", "aux"], capture_output=True, text=True, timeout=5)
            lines = [l for l in r.stdout.strip().split("\n")
                     if name_pattern in l and "grep" not in l and "health_check" not in l]
            if lines:
                result["status"] = "online"
                result["proc_count"] = len(lines)
                # Second whitespace-separated field of a `ps aux` row is the PID.
                first = lines[0].strip().split(None, 2)
                if len(first) >= 2:
                    result["uptime"] = get_uptime(first[1])
                if version_label:
                    result["version"] = get_version(version_label)
                result["mem"] = get_local_macos_memory()
                result["detail"] = f"{result['uptime']} · {result['proc_count']}프로세스"
    except (subprocess.SubprocessError, OSError, ValueError):
        pass  # best effort: return whatever was gathered so far
    return result
|
||||
|
||||
# ===== 메인 =====
|
||||
def _agent_entry(name, emoji, details):
    """Merge probe *details* into an agent record, keeping status/detail last."""
    entry = {"name": name, "emoji": emoji}
    entry.update({k: v for k, v in details.items() if k not in ("status", "detail")})
    entry["status"] = details["status"]
    entry["detail"] = details["detail"]
    return entry


def run_checks():
    """Run every probe once, write the JSON report and print a one-line summary.

    The report groups machines ("pcs") and their agents under four domains,
    adds per-subdomain HTTP results and online/offline totals, and is written
    to ``STATUS_FILE``.
    """
    ts = datetime.now(KST)
    ts_ip = get_local_tailscale_ip()

    # ===== pinksky.kr (home base) =====
    pinksky_domain = http_check("https://pinksky.kr/")
    macmini_detail = f"{ts_ip} · pinksky.kr"
    mainpc_ts = get_remote_tailscale_ip("mainpc-wsl")

    mimi = proc_details("openclaw-gateway", "openclaw", "openclaw")
    ruki = proc_details("hermes", "hermes", "hermes")
    mainpc = ssh_check("mainpc-win", 6)
    pink_hermes = proc_details("hermes", engine_name="hermes", via="mainpc-wsl")

    pinksky = {
        "domain": pinksky_domain,
        "pcs": {
            "macmini": {
                "name": "맥미니 M4 Pro", "emoji": "🖥️",
                "tailscale_ip": ts_ip, "dns": "pinksky.kr",
                # This entry is always reported as online (no probe is run for it).
                "status": "online", "detail": macmini_detail,
                "spec": PC_SPECS["macmini"],
                "agents": {
                    "mimi": _agent_entry("미미", "🦞", mimi),
                    "ruki": _agent_entry("루키", "🌱", ruki),
                },
            },
            "mainpc": {
                "name": "메인컴", "emoji": "🖥️",
                "tailscale_ip": mainpc_ts, "dns": "mainpc-wsl.pinksky.kr",
                **mainpc,
                "spec": PC_SPECS["mainpc"],
                "agents": {
                    "pink_hermes": _agent_entry("분홍", "🤖", pink_hermes),
                },
            },
        },
    }

    # ===== minicity.kr (home NAS) =====
    minicity_domain = http_check("https://minicity.kr/")
    nas_pc = ssh_check("mh-nas", 6)

    minicity = {
        "domain": minicity_domain,
        "pcs": {
            "nas": {
                "name": "NAS (minicity)", "emoji": "🗄️",
                **nas_pc, "agents": {},
            },
        },
    }

    # ===== pinksky.iptime.org (home DDNS / secondary PC) =====
    subpc_ts = get_remote_tailscale_ip("subpc-wsl")
    subpc = ssh_check("subpc-wsl", 6)
    sky_hermes = proc_details("hermes", engine_name="hermes", via="subpc-wsl")

    iptime = {
        # No HTTP probe here: the domain status mirrors the SSH reachability
        # of the secondary PC.
        "domain": {"status": "online" if subpc["status"] == "online" else "error",
                   "detail": "DDNS"},
        "pcs": {
            "subpc": {
                "name": "보조컴", "emoji": "🖥️",
                "tailscale_ip": subpc_ts, "dns": "subpc-wsl.pinksky.kr",
                **subpc,
                "spec": PC_SPECS["subpc"],
                "agents": {
                    "sky_hermes": _agent_entry("하늘", "🤖", sky_hermes),
                },
            },
        },
    }

    # ===== gwenc.kr (office) =====
    gwenc_domain = http_check("https://gwenc.kr/")
    server5600 = tailscale_ping("gw-ps-5600", 6)
    proxmox = tailscale_ping("proxmox-minicity", 6)
    office_nas = tailscale_ping("gwenc-nas2", 6)
    backup_nas = tailscale_ping("ps-bk-nas", 6)
    office_pc = tailscale_ping("ps-i14700k-win", 6)
    office_pc_wsl = tailscale_ping("ps-i14700k-wsl", 6)

    gwenc = {
        "domain": gwenc_domain,
        "pcs": {
            "server5600": {"name": "서버컴 5600", "emoji": "🖥️", **server5600, "agents": {}},
            "proxmox": {"name": "Proxmox", "emoji": "🔶", **proxmox, "agents": {}},
            "office_nas": {"name": "회사 NAS", "emoji": "🗄️", **office_nas, "agents": {}},
            "backup_nas": {"name": "백업 NAS", "emoji": "🗄️", **backup_nas, "agents": {}},
            "office_pc": {
                "name": "회사 메인PC", "emoji": "🖥️", **office_pc,
                "agents": {
                    "office_pc_wsl": {"name": "회사 메인PC WSL", "emoji": "🖥️", **office_pc_wsl},
                },
            },
        },
    }

    # ===== Subdomains =====
    subdomains = [
        {"name": "wiki.pinksky.kr", "url": "https://wiki.pinksky.kr/", "group": "pinksky.kr", "desc": "MiniCITY 지식정원"},
        {"name": "openclaw.pinksky.kr", "url": "https://openclaw.pinksky.kr/", "group": "pinksky.kr", "desc": "OpenClaw 대시보드"},
        {"name": "agent.pinksky.kr", "url": "https://agent.pinksky.kr/", "group": "pinksky.kr", "desc": "에이전트 현황판"},
        {"name": "search.pinksky.kr", "url": "https://search.pinksky.kr/search?q=test", "group": "pinksky.kr", "desc": "SearXNG 메타서치"},
        {"name": "erp.pinksky.kr", "url": "https://erp.pinksky.kr/", "group": "pinksky.kr", "desc": "공사관리 ERP"},
        {"name": "dify.pinksky.kr", "url": "https://dify.pinksky.kr:8443/", "group": "pinksky.kr", "desc": "Dify 워크플로우"},
        {"name": "gwenc.kr", "url": "https://gwenc.kr/", "group": "gwenc.kr", "desc": "회사 메인 사이트"},
        {"name": "minicity.kr", "url": "https://minicity.kr/", "group": "minicity.kr", "desc": "집 NAS 웹 UI"},
    ]
    subdomain_results = [
        {"name": sd["name"], **http_check(sd["url"], 4), "group": sd["group"], "desc": sd["desc"]}
        for sd in subdomains
    ]

    # ===== Summary =====
    all_domains = {"pinksky.kr": pinksky, "minicity.kr": minicity,
                   "pinksky.iptime.org": iptime, "gwenc.kr": gwenc}
    # Machines and their agents are counted together in the totals.
    all_pcs = []
    for d in all_domains.values():
        for pc in d["pcs"].values():
            all_pcs.append(pc)
            all_pcs.extend(pc["agents"].values())

    online = sum(1 for p in all_pcs if p["status"] == "online")
    total = len(all_pcs)
    domain_online = sum(1 for d in all_domains.values() if d["domain"]["status"] == "online")

    data = {
        "subdomains": subdomain_results,
        "domains": all_domains,
        "summary": {
            "total": total, "online": online, "offline": total - online,
            "domain_total": len(all_domains), "domain_online": domain_online,
        },
        "timestamp": ts.isoformat(),
        "timestamp_epoch": int(ts.timestamp()),
    }

    # The payload contains non-ASCII text, so the encoding is pinned to UTF-8
    # instead of depending on the locale. Writing to a sibling temp file and
    # renaming it over the target keeps readers from seeing a partial file.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    tmp_file = STATUS_FILE.with_name(STATUS_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    tmp_file.replace(STATUS_FILE)
    print(f"[{ts.strftime('%H:%M:%S')}] 온라인 {online}/{total} | 도메인 {domain_online}/{len(all_domains)}")
|
||||
|
||||
# Script entry point: perform one collection pass when executed directly.
if __name__ == "__main__":
    run_checks()
|
||||
Reference in New Issue
Block a user