Files
logs/scripts/run_logs_e2e.py
2026-04-27 19:26:57 +08:00

524 lines
25 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import argparse
import asyncio
import hashlib
import hmac
import json
import socket
import subprocess
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib import error, request
import psycopg2
import yaml
from pysnmp.hlapi.v3arch.asyncio import CommunityData, ContextData, NotificationType, ObjectIdentity, ObjectType, OctetString, SnmpEngine, UdpTransportTarget, send_notification
def now_utc() -> datetime:
    """Return the current moment as a timezone-aware ``datetime`` in UTC."""
    utc = timezone.utc
    return datetime.now(tz=utc)
def rfc3339(dt: datetime) -> str:
    """Format ``dt`` as an ISO-8601 / RFC 3339 style string with second precision.

    Microseconds are dropped and a ``+00:00`` offset is rendered as ``Z``.
    Any other offset, or a naive datetime (no offset), is emitted exactly as
    ``datetime.isoformat`` produces it.
    """
    whole_seconds = dt.replace(microsecond=0)
    text = whole_seconds.isoformat()
    return text.replace("+00:00", "Z")
def parse_pg_dsn(dsn: str) -> str:
    """Rewrite a whitespace-separated ``key=value`` DSN for use with psycopg2.

    Any ``timezone=<value>`` token (key matched case-insensitively) is removed.
    If the last such token carried a non-empty value, it is re-added at the end
    as ``options='-c timezone=<value>'``. All other tokens are kept in order
    and joined with single spaces.
    """
    tz_value = None
    tokens = []
    for token in dsn.split():
        key, sep, value = token.partition("=")
        if sep and key.lower() == "timezone":
            # Remember the value; the token itself is dropped from the DSN.
            tz_value = value
        else:
            tokens.append(token)
    if tz_value:
        tokens.append(f"options='-c timezone={tz_value}'")
    return " ".join(tokens)
def load_token(default_path: Path) -> str:
    """Read the JWT from an env-style file.

    Returns the value of the first ``JWT_TOKEN=`` line whose value is not
    empty, with every ``"Bearer "`` occurrence removed. Returns ``""`` when the
    file does not exist or contains no such line.
    """
    if not default_path.exists():
        return ""
    prefix = "JWT_TOKEN="
    for raw_line in default_path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry.startswith(prefix):
            continue
        value = entry[len(prefix):].strip()
        if value:
            return value.replace("Bearer ", "")
    return ""
def _decode_json_object(text: str) -> Dict[str, Any]:
    """Turn a response body into a dict without ever raising.

    An empty body yields ``{}``. A body that is not valid JSON, or is valid
    JSON but not an object, yields ``{"raw": text}`` so callers can always use
    ``.get`` on the result.
    """
    if not text:
        return {}
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {"raw": text}
    if isinstance(parsed, dict):
        return parsed
    return {"raw": text}


def http_json(method: str, url: str, token: str = "", body: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> Tuple[int, Dict[str, Any]]:
    """Send one HTTP request and return ``(status, payload)``.

    Args:
        method: HTTP verb; upper-cased before sending.
        url: Absolute request URL.
        token: When non-empty, sent verbatim as the ``Authorization`` header.
        body: Optional JSON-serialisable dict, sent UTF-8 encoded with
            ``ensure_ascii=False``.
        headers: Extra headers; they override the defaults on key collision.

    Returns:
        ``(status, payload)``. HTTP error responses are returned with their
        status code rather than raised. Transport-level failures (connection
        refused, DNS failure, timeout, ...) are returned as
        ``(0, {"error": "<message>"})`` so a single unreachable endpoint fails
        the calling test case instead of aborting the whole run. Bodies that
        are not a JSON object come back as ``{"raw": <text>}``.
    """
    req_headers = {"Content-Type": "application/json"}
    if token:
        req_headers["Authorization"] = token
    if headers:
        req_headers.update(headers)
    data = None if body is None else json.dumps(body, ensure_ascii=False).encode("utf-8")
    req = request.Request(url, data=data, method=method.upper(), headers=req_headers)
    try:
        with request.urlopen(req, timeout=12) as resp:
            status = resp.status
            text = resp.read().decode("utf-8", errors="ignore")
    except error.HTTPError as e:
        # The server answered with an error status: still a usable response.
        status = e.code
        text = e.read().decode("utf-8", errors="ignore")
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses: no response at all.
        return 0, {"error": str(e)}
    return status, _decode_json_object(text)
def payload_obj(p: Dict[str, Any]) -> Dict[str, Any]:
    """Return the dict stored under ``"details"``, else under ``"data"``.

    The first of those two keys whose value is a dict wins and that same dict
    object is returned; if neither holds a dict, an empty dict is returned.
    """
    for key in ("details", "data"):
        candidate = p.get(key)
        if isinstance(candidate, dict):
            return candidate
    return {}
async def send_trap_async(addr: Tuple[str, int], run_id: str) -> None:
    """Send one SNMP trap carrying an ``E2E-TRAP-<run_id>`` marker to ``addr``.

    The notification uses community ``public`` with ``mpModel=1``, has OID
    ``1.3.6.1.4.1.8072.2.3.0.1`` and a single var-bind on ``1.3.6.1.2.1.1.1.0``
    whose value is the marker string. Whatever ``send_notification`` returns is
    discarded.
    """
    engine = SnmpEngine()
    community = CommunityData("public", mpModel=1)
    target = await UdpTransportTarget.create(addr)
    context = ContextData()
    # The marker var-bind lets the run be recognised in the stored payload.
    marker = ObjectType(ObjectIdentity("1.3.6.1.2.1.1.1.0"), OctetString(f"E2E-TRAP-{run_id}"))
    notification = NotificationType(ObjectIdentity("1.3.6.1.4.1.8072.2.3.0.1")).add_varbinds(marker)
    await send_notification(engine, community, target, context, "trap", notification)
@dataclass
class Config:
    """Settings for one end-to-end run; built by ``build_config``."""

    # Root URL of the logs HTTP API; request paths are appended to it.
    base_url: str
    # (host, port) that the syslog test message is sent to over UDP.
    syslog_addr: Tuple[str, int]
    # (host, port) that the SNMP test traps are sent to.
    trap_addr: Tuple[str, int]
    # DSN handed to psycopg2.connect for the verification queries.
    db_dsn: str
    # Key for the HMAC-SHA256 signature sent in X-Event-Signature.
    hmac_secret: str
    # Sent verbatim as the Authorization header; empty means "not authenticated".
    token: str
    # Unique marker embedded in test payloads, rule names and the report file name.
    run_id: str
    # Front-end page fetched by the front-end smoke check.
    front_url: str
    # When True, the corresponding case is recorded as a passing "skip" entry.
    skip_front: bool
    skip_resource_event: bool
    skip_trap: bool
class Runner:
    """Runs the logs end-to-end cases in a fixed order and writes a Markdown report.

    Each case records one or more result rows through ``add``. ``run`` returns
    a process exit code: 0 when every row passed, 1 otherwise.
    """

    # Working directory used for ``go run ./scripts/send_trap.go``.
    LOGS_REPO_DIR = "d:/work/ops/logs"
    # Directory that receives ``logs_e2e_report_<run_id>.md``.
    REPORT_DIR = "d:/work/ops/artifacts"

    def __init__(self, cfg: "Config") -> None:
        """Store the configuration and start with an empty result list."""
        self.cfg = cfg
        self.results: List[Dict[str, Any]] = []
        # Values handed from one case to a later one (resource_id, syslog_log_event_id).
        self.ctx: Dict[str, Any] = {}
        self.failed = False
        # Captured here so the report shows the real execution window.
        self.started_at = now_utc()

    def add(self, case_id: str, title: str, expected: str, actual: str, ok: bool, steps: List[str], severity: str = "none") -> None:
        """Record one result row and print a PASS/FAIL line.

        ``severity`` is only kept for failing rows; passing rows always get
        ``"none"``. Any failing row marks the whole run as failed.
        """
        self.results.append(
            {
                "id": case_id,
                "title": title,
                "steps": steps,
                "expected": expected,
                "actual": actual,
                "result": "PASS" if ok else "FAIL",
                "severity": severity if not ok else "none",
            }
        )
        if not ok:
            self.failed = True
        print(f"[{'PASS' if ok else 'FAIL'}] {case_id} {title}")

    def _select(self, sql: str, params: Tuple[Any, ...], *, single: bool) -> List[Dict[str, Any]]:
        """Run ``sql`` on a fresh connection and return rows as column->value dicts.

        With ``single=True`` only the first row is fetched.
        """
        conn = psycopg2.connect(self.cfg.db_dsn)
        try:
            # ``with conn`` only ends the transaction; closing is done in ``finally``.
            with conn, conn.cursor() as cur:
                cur.execute(sql, params)
                cols = [col[0] for col in cur.description]
                fetched = [cur.fetchone()] if single else cur.fetchall()
                return [dict(zip(cols, row)) for row in fetched if row is not None]
        finally:
            conn.close()

    def query_one(self, sql: str, params: Tuple[Any, ...]) -> Optional[Dict[str, Any]]:
        """Return the first result row as a dict, or ``None`` when there is none."""
        rows = self._select(sql, params, single=True)
        return rows[0] if rows else None

    def query_all(self, sql: str, params: Tuple[Any, ...]) -> List[Dict[str, Any]]:
        """Return every result row as a list of dicts."""
        return self._select(sql, params, single=False)

    def run(self) -> int:
        """Execute all cases, always write the report, and return the exit code.

        If a case raises, a failing ``TC-ABORT`` row is recorded first so the
        report cannot claim success for an interrupted run; the report is
        written and the exception is then re-raised.
        """
        try:
            self.case_health()
            if self.cfg.skip_front:
                self.add("TC-002", "前端关键入口服务可访问", "可按需跳过", "skip(--skip-front)", True, [f"GET {self.cfg.front_url}"], "major")
            else:
                self.case_front_smoke()
            self.case_crud_rules()
            if self.cfg.skip_resource_event:
                self.add("TC-004", "resource-events 签名/时间窗/幂等", "可按需跳过", "skip(--skip-resource-event)", True, ["POST /resource-events"], "critical")
            else:
                self.case_resource_events()
            self.case_syslog_ingest_and_entries()
            if self.cfg.skip_trap:
                self.add("TC-007", "Trap 接收与入库", "可按需跳过", "skip(--skip-trap)", True, [f"SNMP trap -> {self.cfg.trap_addr}"], "critical")
            else:
                self.case_trap_ingest()
            self.case_outbox_flow()
        except Exception as exc:
            self.add("TC-ABORT", "用例执行异常中断", "全部用例执行完成", f"{type(exc).__name__}: {exc}", False, ["Runner.run"], "critical")
            raise
        finally:
            self.write_report()
        return 1 if self.failed else 0

    def case_health(self) -> None:
        """TC-001: ``GET /ping/hello`` must answer HTTP 200 with ``code == 0``."""
        url = f"{self.cfg.base_url}/ping/hello"
        status, payload = http_json("GET", url)
        ok = status == 200 and payload.get("code") == 0
        self.add("TC-001", "logs 健康检查", "服务返回 code=0", f"status={status}, payload={payload}", ok, [f"GET {url}"], "critical")

    def case_front_smoke(self) -> None:
        """TC-002: the front-end URL must return HTTP 200 and an HTML doctype."""
        steps = [f"GET {self.cfg.front_url}"]
        try:
            with request.urlopen(self.cfg.front_url, timeout=8) as resp:
                text = resp.read().decode("utf-8", errors="ignore")
                ok = resp.status == 200 and "<!doctype html" in text.lower()
                self.add("TC-002", "前端关键入口服务可访问", "日志页/告警队列入口所在前端可打开", f"http={resp.status}", ok, steps, "major")
        except Exception as e:
            # Any failure to fetch the page is a test failure, not a crash.
            self.add("TC-002", "前端关键入口服务可访问", "HTTP 200", str(e), False, steps, "major")

    def auth_ready(self) -> bool:
        """Return True when a token is configured and ``GET /syslog-rules`` accepts it.

        Performs one HTTP request per call.
        """
        if not self.cfg.token:
            return False
        status, payload = http_json("GET", f"{self.cfg.base_url}/syslog-rules", token=self.cfg.token)
        return status == 200 and payload.get("code") == 0

    def case_crud_rules(self) -> None:
        """TC-003: create one rule of each of the four kinds, then delete them.

        Created ids are collected right after each POST so that whatever was
        created is still deleted if a later request raises.
        """
        if not self.auth_ready():
            self.add(
                "TC-003",
                "规则 CRUDsyslog/trap/dictionary/suppression",
                "四类规则均可增删改查",
                "鉴权失败(缺少有效 JWT 或 token 过期)",
                False,
                ["GET /syslog-rules 验证鉴权", "跳过后续 CRUD"],
                "critical",
            )
            return
        suffix = f"[E2E:{self.cfg.run_id}]"
        syslog_body = {
            "name": f"{suffix}-syslog",
            "enabled": True,
            "priority": 999,
            "device_name_contains": "127.0.0.1",
            "keyword_regex": "E2E-SYSLOG",
            "alert_name": f"{suffix}-syslog-alert",
            "severity_code": "warning",
            "policy_id": 0,
        }
        trap_rule_body = {
            "name": f"{suffix}-trap-rule",
            "enabled": True,
            "priority": 998,
            "oid_prefix": "1.3.6.1.4.1.8072",
            "varbind_match_regex": "E2E-TRAP",
            "alert_name": f"{suffix}-trap-alert",
            "severity_code": "warning",
            "policy_id": 0,
        }
        dict_body = {
            # A time-derived suffix keeps the OID prefix different between runs.
            "oid_prefix": f"1.3.6.1.4.1.8072.{int(time.time()) % 100000}",
            "title": f"{suffix}-dict",
            "description": "dict for e2e",
            "severity_code": "warning",
            "recovery_message": "recover",
            "enabled": True,
        }
        suppression_body = {
            "name": f"{suffix}-suppress",
            "enabled": True,
            "source_ip_cidr": "127.0.0.1/32",
            "oid_prefix": "1.3.6.1.4.1.8072",
            "interface_hint": "no-match",
            "time_windows_json": "[]",
        }
        requests_to_make = [
            ("syslog-rules", syslog_body),
            ("trap-rules", trap_rule_body),
            ("trap-dictionary", dict_body),
            ("trap-suppressions", suppression_body),
        ]
        created_ids: List[Tuple[str, int]] = []
        statuses: List[int] = []
        try:
            for ep, body in requests_to_make:
                status, payload = http_json("POST", f"{self.cfg.base_url}/{ep}", token=self.cfg.token, body=body)
                statuses.append(status)
                rid = payload_obj(payload).get("id")
                if rid:
                    created_ids.append((ep, int(rid)))
            ok = all(s == 200 for s in statuses) and len(created_ids) == len(requests_to_make)
            self.add("TC-003", "规则 CRUDsyslog/trap/dictionary/suppression", "四类规则创建成功", f"created={created_ids}", ok, ["POST 4类规则"], "critical")
        finally:
            # Best-effort cleanup: one failed DELETE must not keep the others from running.
            for ep, rid in created_ids:
                try:
                    http_json("DELETE", f"{self.cfg.base_url}/{ep}/{rid}", token=self.cfg.token)
                except Exception as exc:
                    print(f"[WARN] cleanup DELETE /{ep}/{rid} failed: {exc}")

    def case_resource_events(self) -> None:
        """TC-004: signature, time-window and idempotency checks of ``/resource-events``.

        Four requests are sent: a valid event, the same event again, an event
        whose ``event_time`` is 1000 seconds old, and the valid event with a
        bogus signature. The resource id is stored in ``ctx`` for later cases.
        """
        if not self.auth_ready():
            self.add("TC-004", "resource-events 签名/时间窗/幂等", "签名和幂等校验生效", "鉴权失败,无法执行", False, ["POST /resource-events"], "critical")
            return
        url = f"{self.cfg.base_url}/resource-events"
        secret = self.cfg.hmac_secret.encode("utf-8")

        def sign(event: Dict[str, Any]) -> str:
            # Must serialise exactly as http_json does, otherwise the signed
            # bytes differ from the bytes actually sent.
            raw = json.dumps(event, ensure_ascii=False).encode("utf-8")
            return hmac.new(secret, raw, hashlib.sha256).hexdigest()

        def post(event: Dict[str, Any], signature: str) -> Tuple[int, Dict[str, Any]]:
            return http_json("POST", url, token=self.cfg.token, body=event, headers={"X-Event-Signature": signature})

        base_event = {
            "event_id": f"e2e-{self.cfg.run_id}-{uuid.uuid4().hex[:8]}",
            "event_time": rfc3339(now_utc()),
            "event_type": "resource.upsert",
            "resource_type": "server",
            "resource_id": f"res-{self.cfg.run_id}",
            "resource_name": f"E2E Resource {self.cfg.run_id}",
            "ips": ["127.0.0.1"],
            "hostnames": [f"e2e-host-{self.cfg.run_id}"],
            "labels": {"run_id": self.cfg.run_id},
            "version": 2,
        }
        sig = sign(base_event)
        s_ok, p_ok = post(base_event, sig)
        s_dup, p_dup = post(base_event, sig)
        bad = dict(base_event)
        bad["event_id"] = f"{base_event['event_id']}-bad"
        old_dt = now_utc() - timedelta(seconds=1000)
        bad["event_time"] = rfc3339(old_dt)
        s_old, p_old = post(bad, sign(bad))
        invalid_sig_status, p_bad_sig = post(base_event, "bad-sign")
        base_ok = s_ok == 200 and payload_obj(p_ok).get("resource_id") == base_event["resource_id"] and payload_obj(p_dup).get("ignored") is True
        # bsm-sdk usually reports errors as HTTP 200 with code != 0; accept both conventions.
        stale_rejected = s_old != 200 or p_old.get("code", 0) != 0
        bad_sig_rejected = invalid_sig_status != 200 or p_bad_sig.get("code", 0) != 0
        ok = base_ok and stale_rejected and bad_sig_rejected
        self.ctx["resource_id"] = base_event["resource_id"]
        self.add("TC-004", "resource-events 签名/时间窗/幂等", "首次成功、重复忽略、旧时间窗拒绝、坏签名拒绝", f"ok={s_ok}/{p_ok}, dup={p_dup}, stale={s_old}/{p_old}, bad_sig={invalid_sig_status}/{p_bad_sig}", ok, ["POST 正常事件", "POST 重复事件", "POST 超时事件", "POST 错签名事件"], "critical")

    def case_syslog_ingest_and_entries(self) -> None:
        """TC-005 / TC-006: send a syslog message over UDP, then query ``/entries``.

        TC-005 passes when a ``logs_events`` row whose ``raw_payload`` contains
        the run marker exists with ``source_kind == "syslog"``. TC-006 passes
        when the filtered ``/entries`` call returns HTTP 200 and a list.
        """
        msg = f"<34>Apr 27 17:30:00 e2e-host-{self.cfg.run_id} app: E2E-SYSLOG-{self.cfg.run_id}"
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.sendto(msg.encode("utf-8"), self.cfg.syslog_addr)
        # Fixed wait for the service to ingest and store the message.
        time.sleep(2)
        row = self.query_one(
            """
            SELECT id, source_kind, source_ip, resource_type, resource_id, match_method, dispatch_status
            FROM logs_events
            WHERE raw_payload LIKE %s
            ORDER BY id DESC
            LIMIT 1
            """,
            (f"%E2E-SYSLOG-{self.cfg.run_id}%",),
        )
        ok = row is not None and row.get("source_kind") == "syslog"
        if row:
            self.ctx["syslog_log_event_id"] = row["id"]
        self.add("TC-005", "Syslog 接收与入库 + 资源关联写入", "syslog 事件入库且带 source_ip/resource/match_method", f"row={row}", ok, [f"UDP sendto {self.cfg.syslog_addr}"], "critical")
        if not self.auth_ready():
            self.add("TC-006", "entries 查询筛选", "source_kind/resource/dispatch_status/log_event_id 可筛选", "鉴权失败,无法执行 API 筛选验证", False, ["GET /entries"], "major")
            return
        from urllib.parse import urlencode

        # urlencode keeps the query valid even if run_id contains reserved characters.
        query = urlencode(
            {
                "source_kind": "syslog",
                "resource_id": self.ctx.get("resource_id", ""),
                "dispatch_status": "not_applicable",
                "log_event_id": self.ctx.get("syslog_log_event_id", 0),
                "page": 1,
                "page_size": 20,
            }
        )
        s, p = http_json("GET", f"{self.cfg.base_url}/entries?{query}", token=self.cfg.token)
        items = payload_obj(p).get("items", [])
        ok2 = s == 200 and isinstance(items, list)
        self.add("TC-006", "entries 查询筛选", "按组合条件可返回列表", f"status={s}, items={len(items) if isinstance(items, list) else 'n/a'}", ok2, [f"GET /entries?{query}"], "major")

    @staticmethod
    def _is_catch_all_suppression(row: Dict[str, Any]) -> bool:
        """Return True for an enabled suppression rule whose four match fields are all blank."""
        if not row.get("enabled", False):
            return False
        fields = ("source_ip_cidr", "oid_prefix", "interface_hint", "time_windows_json")
        return all(str(row.get(name, "")).strip() == "" for name in fields)

    def case_trap_ingest(self) -> None:
        """TC-007: send SNMP traps and check that a new ``snmp_trap`` row appears.

        Passes when the number of ``snmp_trap`` rows in ``logs_events`` grew.
        Catch-all suppression rules are disabled for the duration of the case
        and restored afterwards, even when the case fails.
        """
        steps = [f"SNMP trap -> {self.cfg.trap_addr}"]
        restored: List[Tuple[int, Dict[str, Any]]] = []
        try:
            # A suppression rule with no conditions drops every trap; switch such
            # rules off while the test runs and put them back in ``finally``.
            if self.auth_ready():
                s0, p0 = http_json("GET", f"{self.cfg.base_url}/trap-suppressions", token=self.cfg.token)
                if s0 == 200:
                    for row in payload_obj(p0).get("items", []) or []:
                        if not self._is_catch_all_suppression(row):
                            continue
                        rid = int(row.get("id", 0))
                        if rid > 0:
                            body = dict(row)
                            body["enabled"] = False
                            http_json("PUT", f"{self.cfg.base_url}/trap-suppressions/{rid}", token=self.cfg.token, body=body)
                            restored.append((rid, row))
            count_sql = "SELECT COUNT(1) AS cnt FROM logs_events WHERE source_kind='snmp_trap'"
            before = self.query_one(count_sql, ())
            # Send with gosnmp first for encoding compatibility with the server's
            # TrapListener, then send a second trap with pysnmp.
            subprocess.run(
                ["go", "run", "./scripts/send_trap.go", self.cfg.trap_addr[0], f"E2E-TRAP-{self.cfg.run_id}"],
                check=True,
                capture_output=True,
                text=True,
                cwd=self.LOGS_REPO_DIR,
            )
            asyncio.run(send_trap_async(self.cfg.trap_addr, self.cfg.run_id))
            time.sleep(3)
            row = self.query_one(
                """
                SELECT id, source_kind, trap_o_id, raw_payload, created_at
                FROM logs_events
                WHERE source_kind='snmp_trap'
                ORDER BY id DESC LIMIT 1
                """,
                (),
            )
            after = self.query_one(count_sql, ())
            before_cnt = int((before or {}).get("cnt", 0))
            after_cnt = int((after or {}).get("cnt", 0))
            ok = row is not None and after_cnt > before_cnt
            self.add(
                "TC-007",
                "Trap 接收与入库",
                "snmp_trap 事件写入 logs_events",
                f"before={before_cnt}, after={after_cnt}, latest={row}",
                ok,
                steps,
                "critical",
            )
        except subprocess.CalledProcessError as e:
            # str(e) alone hides why the Go sender failed; include its stderr.
            self.add("TC-007", "Trap 接收与入库", "snmp_trap 事件写入", f"{e}; stderr={e.stderr}", False, steps, "critical")
        except Exception as e:
            self.add("TC-007", "Trap 接收与入库", "snmp_trap 事件写入", str(e), False, steps, "critical")
        finally:
            # Restore every rule independently so one failed PUT cannot leave
            # the remaining rules disabled.
            for rid, row in restored:
                try:
                    http_json("PUT", f"{self.cfg.base_url}/trap-suppressions/{rid}", token=self.cfg.token, body=row)
                except Exception as exc:
                    print(f"[WARN] failed to restore trap-suppression {rid}: {exc}")

    def case_outbox_flow(self) -> None:
        """TC-008: inspect the ten newest outbox rows and try a manual retry.

        Passes when at least one row has a known status and, if authenticated,
        the retry of the newest row answers HTTP 200 with status ``pending``.
        """
        rows = self.query_all(
            """
            SELECT o.id, o.status, o.retry_count, o.log_event_id, e.dispatch_status
            FROM logs_alert_outbox o
            LEFT JOIN logs_events e ON e.id = o.log_event_id
            ORDER BY o.id DESC
            LIMIT 10
            """,
            (),
        )
        has_chain = any(r["status"] in ("pending", "retrying", "sent", "dead") for r in rows)
        manual_retry_ok = False
        detail: Dict[str, Any] = {"rows": rows}
        # Probe authentication once so every decision below sees the same answer.
        authed = self.auth_ready()
        if authed and rows:
            target = rows[0]["id"]
            s, p = http_json("POST", f"{self.cfg.base_url}/alert-outbox/{target}/retry", token=self.cfg.token)
            manual_retry_ok = s == 200 and payload_obj(p).get("status") == "pending"
            detail["manual_retry"] = {"status": s, "payload": p}
        ok = has_chain and (manual_retry_ok or not authed)
        if not authed:
            detail["manual_retry"] = "skip(鉴权失败)"
        self.add("TC-008", "outbox 链路(入队/worker/状态流转/手动重试)", "存在 outbox 状态流转,手动重试可重置 pending", json.dumps(detail, ensure_ascii=False), ok, ["查 logs_alert_outbox", "POST /alert-outbox/:id/retry"], "major")

    def write_report(self) -> None:
        """Write the Markdown report for this run and print its path."""
        start = self.started_at
        end = now_utc()
        report_path = Path(self.REPORT_DIR) / f"logs_e2e_report_{self.cfg.run_id}.md"
        report_path.parent.mkdir(parents=True, exist_ok=True)
        passed = sum(1 for x in self.results if x["result"] == "PASS")
        failed = len(self.results) - passed
        issues = [x for x in self.results if x["result"] == "FAIL"]
        lines: List[str] = [
            "# 日志管理全链路测试报告",
            "",
            "## 测试范围",
            "- Syslog/Trap 接收与入库",
            "- 规则 CRUDsyslog/trap/dictionary/suppression",
            "- resource-events签名、时间窗、幂等",
            "- 资源关联字段落库resource_type/resource_id/match_method/source_ip",
            "- entries 筛选source_kind/resource_type/resource_id/dispatch_status/log_event_id",
            "- outbox入队、worker、状态、手动重试",
            "- 前端关键入口联调(日志页、告警队列入口)",
            "",
            "## 环境信息",
            f"- 执行时间: {rfc3339(start)} ~ {rfc3339(end)}",
            f"- logs API: `{self.cfg.base_url}`",
            f"- syslog: `{self.cfg.syslog_addr[0]}:{self.cfg.syslog_addr[1]}`",
            f"- trap: `{self.cfg.trap_addr[0]}:{self.cfg.trap_addr[1]}`",
            f"- front: `{self.cfg.front_url}`",
            f"- run_id: `{self.cfg.run_id}`",
            "",
            "## 用例清单(编号、步骤、预期、实际、结论)",
        ]
        for r in self.results:
            lines.append(f"- **{r['id']} {r['title']}**")
            lines.append(f" - 步骤: {'; '.join(r['steps'])}")
            lines.append(f" - 预期: {r['expected']}")
            lines.append(f" - 实际: {r['actual']}")
            lines.append(f" - 结论: {r['result']}")
        lines.append("")
        lines.append("## 问题清单(严重级别)")
        if not issues:
            lines.append("- 无失败项。")
        else:
            for i in issues:
                # Title and actual were previously run together with no separator.
                lines.append(f"- [{i['severity'].upper()}] {i['id']} {i['title']}: {i['actual']}")
        lines.append("")
        lines.append("## 链路结论(是否可上线联调)")
        if failed == 0:
            lines.append(f"- 结论:可上线联调({passed} 通过 / {failed} 失败)。")
        else:
            lines.append(f"- 结论:暂不建议上线联调({passed} 通过 / {failed} 失败)。")
            lines.append("- 建议先修复高优先级失败项后再回归。")
        report_path.write_text("\n".join(lines), encoding="utf-8")
        print(f"REPORT_PATH={report_path}")
def build_config(args: argparse.Namespace) -> Config:
    """Combine the YAML service config with CLI arguments into a ``Config``.

    The API base URL comes from ``--base-url`` (trailing slashes removed) or is
    derived from ``--host`` and the YAML ``Port``. Ingest ports are the part
    after the last ``:`` of the YAML listen addresses. The token comes from
    ``--token`` or, failing that, from the env file; the run id defaults to a
    local-time ``YYYYmmddHHMMSS`` stamp.
    """
    settings = yaml.safe_load(Path(args.config).read_text(encoding="utf-8"))
    host = args.host
    base_url = args.base_url.rstrip("/") if args.base_url else f"http://{host}:{settings['Port']}/Logs/v1"

    ingest = settings["Ingest"]
    syslog_port, trap_port = (
        int(str(ingest[key]).rsplit(":", 1)[-1])
        for key in ("syslog_listen_addr", "trap_listen_addr")
    )

    token = args.token
    if not token:
        token = load_token(Path("d:/work/ops/scripts/test_alert_dispatch.env"))
    run_id = args.run_id
    if not run_id:
        run_id = datetime.now().strftime("%Y%m%d%H%M%S")

    # Traps and syslog go to --ingest-host when given, otherwise to --host.
    ingest_host = args.ingest_host or host
    return Config(
        base_url=base_url,
        syslog_addr=(ingest_host, syslog_port),
        trap_addr=(ingest_host, trap_port),
        db_dsn=parse_pg_dsn(settings["Databases"]["Source"][0]),
        hmac_secret=settings["ResourceEvent"]["hmac_secret"],
        token=token,
        run_id=run_id,
        front_url=args.front_url,
        skip_front=args.skip_front,
        skip_resource_event=args.skip_resource_event,
        skip_trap=args.skip_trap,
    )
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the end-to-end script."""
    parser = argparse.ArgumentParser(description="日志管理全链路测试脚本(真实执行)")
    # (flag, keyword arguments) in the order they appear in --help.
    options = [
        ("--config", {"default": "d:/work/ops/logs/etc/logs_dev.yaml"}),
        ("--host", {"default": "127.0.0.1"}),
        ("--ingest-host", {"default": "", "help": "syslog/trap 发送目标主机,默认与 --host 相同"}),
        ("--base-url", {"default": "", "help": "完整 API 前缀,例如 https://ops-api.apinb.com/Logs/v1"}),
        ("--token", {"default": "", "help": "Authorization 值(例如 Bearer xxx"}),
        ("--run-id", {"default": ""}),
        ("--front-url", {"default": "http://127.0.0.1:5173/log-mgmt/entries"}),
        ("--skip-front", {"action": "store_true", "help": "跳过前端入口检查"}),
        ("--skip-resource-event", {"action": "store_true", "help": "跳过 resource-events 用例"}),
        ("--skip-trap", {"action": "store_true", "help": "跳过 trap 接收用例"}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser


def main() -> int:
    """Parse the command line, run every case and return the process exit code."""
    args = _build_arg_parser().parse_args()
    cfg = build_config(args)
    return Runner(cfg).run()


if __name__ == "__main__":
    raise SystemExit(main())