From 46fc601f6c9f3bd45e1c86d0ad9340c347e45316 Mon Sep 17 00:00:00 2001 From: Local User Date: Fri, 13 Feb 2026 08:37:12 +0800 Subject: [PATCH] feat: add health check monitoring and alerting system - Add health status change logging with detailed information - Add optional webhook notifications for provider health changes - Configurable via HEALTH_ALERT_WEBHOOK_URL environment variable - Sends notifications when provider becomes unhealthy or recovers - Broadcast health status events for real-time UI updates - Integrate periodic health checks in heartbeat function - Track provider stats (usage count, error count) in health logs This improves observability by allowing operators to: - Monitor provider health status changes in real-time - Receive alerts via webhook when issues occur - View detailed health statistics for troubleshooting --- src/providers/provider-pool-manager.js | 105 +++++++++++++++++++++++++ src/services/api-manager.js | 24 +++++- src/services/api-server.js | 2 +- 3 files changed, 126 insertions(+), 5 deletions(-) diff --git a/src/providers/provider-pool-manager.js b/src/providers/provider-pool-manager.js index 7f2043e..cfa7df8 100644 --- a/src/providers/provider-pool-manager.js +++ b/src/providers/provider-pool-manager.js @@ -4,6 +4,7 @@ import { getServiceAdapter } from './adapter.js'; import logger from '../utils/logger.js'; import { MODEL_PROVIDER, getProtocolPrefix } from '../utils/common.js'; import { getProviderModels } from './provider-models.js'; +import { broadcastEvent } from '../ui-modules/event-broadcast.js'; import axios from 'axios'; /** @@ -458,6 +459,90 @@ export class ProviderPoolManager { } } + /** + * 记录健康状态变化日志 + * @param {string} providerType - 提供商类型 + * @param {object} providerConfig - 提供商配置 + * @param {string} fromStatus - 之前状态 + * @param {string} toStatus - 当前状态 + * @param {string} [errorMessage] - 错误信息(可选) + * @private + */ + _logHealthStatusChange(providerType, providerConfig, fromStatus, toStatus, errorMessage = null) { + const customName = providerConfig.customName || providerConfig.uuid; + const timestamp = new Date().toISOString(); + + const logEntry = { + timestamp, + providerType, + uuid: providerConfig.uuid, + customName, + fromStatus, + toStatus, + errorMessage, + usageCount: providerConfig.usageCount || 0, + errorCount: providerConfig.errorCount || 0 + }; + + // 输出详细的状态变化日志 + if (toStatus === 'unhealthy') { + logger.warn(`[HealthMonitor] ⚠️ Provider became UNHEALTHY: ${customName} (${providerType})`); + logger.warn(`[HealthMonitor] Reason: ${errorMessage || 'Unknown'}`); + logger.warn(`[HealthMonitor] Error Count: ${providerConfig.errorCount}`); + + // 触发告警(如果配置了 Webhook) + this._triggerHealthAlert(providerType, providerConfig, 'unhealthy', errorMessage); + } else if (toStatus === 'healthy' && fromStatus === 'unhealthy') { + logger.info(`[HealthMonitor] ✅ Provider recovered to HEALTHY: ${customName} (${providerType})`); + + // 触发恢复通知 + this._triggerHealthAlert(providerType, providerConfig, 'recovered', null); + } + + // 广播健康状态变化事件 + broadcastEvent('health_status_change', logEntry); + } + + /** + * 触发健康状态告警 + * @param {string} providerType - 提供商类型 + * @param {object} providerConfig - 提供商配置 + * @param {string} status - 状态 ('unhealthy' | 'recovered') + * @param {string} [errorMessage] - 错误信息 + * @private + */ + async _triggerHealthAlert(providerType, providerConfig, status, errorMessage = null) { + const webhookUrl = this.globalConfig?.HEALTH_ALERT_WEBHOOK_URL; + if (!webhookUrl) { + return; // 未配置 Webhook,跳过 + } + + const customName = providerConfig.customName || providerConfig.uuid; + const payload = { + timestamp: new Date().toISOString(), + providerType, + uuid: providerConfig.uuid, + customName, + status, + errorMessage, + stats: { + usageCount: providerConfig.usageCount || 0, + errorCount: providerConfig.errorCount || 0 + } + }; + + try { + const axios = (await import('axios')).default; + await axios.post(webhookUrl, payload, { + timeout: 5000, + headers: { 'Content-Type': 'application/json' } + }); + this._log('info', `Health alert sent to webhook for ${customName}: ${status}`); + } catch (error) { + this._log('error', `Failed to send health alert to webhook: ${error.message}`); + } + } + /** * 查找指定的 provider * @private @@ -880,6 +965,7 @@ export class ProviderPoolManager { const provider = this._findProvider(providerType, providerConfig.uuid); if (provider) { + const wasHealthy = provider.config.isHealthy; const now = Date.now(); const lastErrorTime = provider.config.lastErrorTime ? new Date(provider.config.lastErrorTime).getTime() : 0; const errorWindowMs = 10000; // 10 秒窗口期 @@ -902,6 +988,12 @@ export class ProviderPoolManager { if (this.maxErrorCount > 0 && provider.config.errorCount >= this.maxErrorCount) { provider.config.isHealthy = false; + + // 健康状态变化日志 + if (wasHealthy) { + this._logHealthStatusChange(providerType, provider.config, 'healthy', 'unhealthy', errorMessage); + } + this._log('warn', `Marked provider as unhealthy: ${providerConfig.uuid} for type ${providerType}. Total errors: ${provider.config.errorCount}`); } @@ -924,6 +1016,7 @@ export class ProviderPoolManager { const provider = this._findProvider(providerType, providerConfig.uuid); if (provider) { + const wasHealthy = provider.config.isHealthy; provider.config.isHealthy = false; provider.config.errorCount = this.maxErrorCount; // Set to max to indicate definitive failure provider.config.lastErrorTime = new Date().toISOString(); @@ -933,6 +1026,11 @@ export class ProviderPoolManager { provider.config.lastErrorMessage = errorMessage; } + // 健康状态变化日志 + if (wasHealthy) { + this._logHealthStatusChange(providerType, provider.config, 'healthy', 'unhealthy', errorMessage); + } + this._log('warn', `Immediately marked provider as unhealthy: ${providerConfig.uuid} for type ${providerType}. Reason: ${errorMessage || 'Authentication error'}`); this._debouncedSave(providerType); @@ -992,6 +1090,7 @@ export class ProviderPoolManager { const provider = this._findProvider(providerType, providerConfig.uuid); if (provider) { + const wasHealthy = provider.config.isHealthy; provider.config.isHealthy = true; provider.config.errorCount = 0; provider.config.refreshCount = 0; @@ -1012,6 +1111,12 @@ export class ProviderPoolManager { provider.config.usageCount++; provider.config.lastUsed = new Date().toISOString(); } + + // 健康状态变化日志 + if (!wasHealthy) { + this._logHealthStatusChange(providerType, provider.config, 'unhealthy', 'healthy', null); + } + this._log('info', `Marked provider as healthy: ${provider.config.uuid} for type ${providerType}${resetUsageCount ? ' (usage count reset)' : ''}`); this._debouncedSave(providerType); diff --git a/src/services/api-manager.js b/src/services/api-manager.js index d9cb526..dc97f5e 100644 --- a/src/services/api-manager.js +++ b/src/services/api-manager.js @@ -62,14 +62,30 @@ export async function handleAPIRequests(method, path, req, res, currentConfig, a * @param {Object} services - The initialized services * @returns {Function} - The heartbeat and token refresh function */ -export function initializeAPIManagement(services) { +export function initializeAPIManagement(services, config = {}) { const providerPoolManager = getProviderPoolManager(); + const healthCheckInterval = config.HEALTH_CHECK_INTERVAL || 10 * 60 * 1000; // 默认10分钟 + return async function heartbeatAndRefreshToken() { logger.info(`[Heartbeat] Server is running. Current time: ${new Date().toLocaleString()}`, Object.keys(services)); + + // 定期执行健康检查 + if (providerPoolManager) { + try { + logger.info('[HealthCheck] Starting periodic health check...'); + await providerPoolManager.performHealthChecks(); + const stats = {}; + for (const providerType in providerPoolManager.providerStatus) { + const providerStats = providerPoolManager.getProviderStats(providerType); + stats[providerType] = providerStats; + } + logger.info('[HealthCheck] Health check completed. Stats:', JSON.stringify(stats)); + } catch (error) { + logger.error('[HealthCheck] Health check failed:', error.message); + } + } + // 循环遍历所有已初始化的服务适配器,并尝试刷新令牌 - // if (getProviderPoolManager()) { - // await getProviderPoolManager().performHealthChecks(); // 定期执行健康检查 - // } for (const providerKey in services) { const serviceAdapter = services[providerKey]; try { diff --git a/src/services/api-server.js b/src/services/api-server.js index 90543f2..08d1ead 100644 --- a/src/services/api-server.js +++ b/src/services/api-server.js @@ -265,7 +265,7 @@ async function startServer() { initializeUIManagement(CONFIG); // Initialize API management and get heartbeat function - const heartbeatAndRefreshToken = initializeAPIManagement(services); + const heartbeatAndRefreshToken = initializeAPIManagement(services, CONFIG); // Create request handler const requestHandlerInstance = createRequestHandler(CONFIG, getProviderPoolManager());