Merge pull request #326 from ggdayup/feature/health-check-alerts

feat: add health check monitoring and alerting system
This commit is contained in:
何夕2077 2026-02-13 15:01:08 +08:00 committed by GitHub
commit ce8e8ad855
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 126 additions and 5 deletions

View file

@ -4,6 +4,7 @@ import { getServiceAdapter } from './adapter.js';
import logger from '../utils/logger.js';
import { MODEL_PROVIDER, getProtocolPrefix } from '../utils/common.js';
import { getProviderModels } from './provider-models.js';
import { broadcastEvent } from '../ui-modules/event-broadcast.js';
import axios from 'axios';
/**
@ -458,6 +459,90 @@ export class ProviderPoolManager {
}
}
/**
* 记录健康状态变化日志
* @param {string} providerType - 提供商类型
* @param {object} providerConfig - 提供商配置
* @param {string} fromStatus - 之前状态
* @param {string} toStatus - 当前状态
* @param {string} [errorMessage] - 错误信息可选
* @private
*/
_logHealthStatusChange(providerType, providerConfig, fromStatus, toStatus, errorMessage = null) {
const customName = providerConfig.customName || providerConfig.uuid;
const timestamp = new Date().toISOString();
const logEntry = {
timestamp,
providerType,
uuid: providerConfig.uuid,
customName,
fromStatus,
toStatus,
errorMessage,
usageCount: providerConfig.usageCount || 0,
errorCount: providerConfig.errorCount || 0
};
// 输出详细的状态变化日志
if (toStatus === 'unhealthy') {
logger.warn(`[HealthMonitor] ⚠️ Provider became UNHEALTHY: ${customName} (${providerType})`);
logger.warn(`[HealthMonitor] Reason: ${errorMessage || 'Unknown'}`);
logger.warn(`[HealthMonitor] Error Count: ${providerConfig.errorCount}`);
// 触发告警(如果配置了 Webhook
this._triggerHealthAlert(providerType, providerConfig, 'unhealthy', errorMessage);
} else if (toStatus === 'healthy' && fromStatus === 'unhealthy') {
logger.info(`[HealthMonitor] ✅ Provider recovered to HEALTHY: ${customName} (${providerType})`);
// 触发恢复通知
this._triggerHealthAlert(providerType, providerConfig, 'recovered', null);
}
// 广播健康状态变化事件
broadcastEvent('health_status_change', logEntry);
}
/**
* 触发健康状态告警
* @param {string} providerType - 提供商类型
* @param {object} providerConfig - 提供商配置
* @param {string} status - 状态 ('unhealthy' | 'recovered')
* @param {string} [errorMessage] - 错误信息
* @private
*/
async _triggerHealthAlert(providerType, providerConfig, status, errorMessage = null) {
const webhookUrl = this.globalConfig?.HEALTH_ALERT_WEBHOOK_URL;
if (!webhookUrl) {
return; // 未配置 Webhook跳过
}
const customName = providerConfig.customName || providerConfig.uuid;
const payload = {
timestamp: new Date().toISOString(),
providerType,
uuid: providerConfig.uuid,
customName,
status,
errorMessage,
stats: {
usageCount: providerConfig.usageCount || 0,
errorCount: providerConfig.errorCount || 0
}
};
try {
const axios = (await import('axios')).default;
await axios.post(webhookUrl, payload, {
timeout: 5000,
headers: { 'Content-Type': 'application/json' }
});
this._log('info', `Health alert sent to webhook for ${customName}: ${status}`);
} catch (error) {
this._log('error', `Failed to send health alert to webhook: ${error.message}`);
}
}
/**
* 查找指定的 provider
* @private
@ -880,6 +965,7 @@ export class ProviderPoolManager {
const provider = this._findProvider(providerType, providerConfig.uuid);
if (provider) {
const wasHealthy = provider.config.isHealthy;
const now = Date.now();
const lastErrorTime = provider.config.lastErrorTime ? new Date(provider.config.lastErrorTime).getTime() : 0;
const errorWindowMs = 10000; // 10 秒窗口期
@ -902,6 +988,12 @@ export class ProviderPoolManager {
if (this.maxErrorCount > 0 && provider.config.errorCount >= this.maxErrorCount) {
provider.config.isHealthy = false;
// 健康状态变化日志
if (wasHealthy) {
this._logHealthStatusChange(providerType, provider.config, 'healthy', 'unhealthy', errorMessage);
}
this._log('warn', `Marked provider as unhealthy: ${providerConfig.uuid} for type ${providerType}. Total errors: ${provider.config.errorCount}`);
}
@ -924,6 +1016,7 @@ export class ProviderPoolManager {
const provider = this._findProvider(providerType, providerConfig.uuid);
if (provider) {
const wasHealthy = provider.config.isHealthy;
provider.config.isHealthy = false;
provider.config.errorCount = this.maxErrorCount; // Set to max to indicate definitive failure
provider.config.lastErrorTime = new Date().toISOString();
@ -933,6 +1026,11 @@ export class ProviderPoolManager {
provider.config.lastErrorMessage = errorMessage;
}
// 健康状态变化日志
if (wasHealthy) {
this._logHealthStatusChange(providerType, provider.config, 'healthy', 'unhealthy', errorMessage);
}
this._log('warn', `Immediately marked provider as unhealthy: ${providerConfig.uuid} for type ${providerType}. Reason: ${errorMessage || 'Authentication error'}`);
this._debouncedSave(providerType);
@ -992,6 +1090,7 @@ export class ProviderPoolManager {
const provider = this._findProvider(providerType, providerConfig.uuid);
if (provider) {
const wasHealthy = provider.config.isHealthy;
provider.config.isHealthy = true;
provider.config.errorCount = 0;
provider.config.refreshCount = 0;
@ -1012,6 +1111,12 @@ export class ProviderPoolManager {
provider.config.usageCount++;
provider.config.lastUsed = new Date().toISOString();
}
// 健康状态变化日志
if (!wasHealthy) {
this._logHealthStatusChange(providerType, provider.config, 'unhealthy', 'healthy', null);
}
this._log('info', `Marked provider as healthy: ${provider.config.uuid} for type ${providerType}${resetUsageCount ? ' (usage count reset)' : ''}`);
this._debouncedSave(providerType);

View file

@ -62,14 +62,30 @@ export async function handleAPIRequests(method, path, req, res, currentConfig, a
* @param {Object} services - The initialized services
* @returns {Function} - The heartbeat and token refresh function
*/
export function initializeAPIManagement(services) {
export function initializeAPIManagement(services, config = {}) {
const providerPoolManager = getProviderPoolManager();
const healthCheckInterval = config.HEALTH_CHECK_INTERVAL || 10 * 60 * 1000; // 默认10分钟
return async function heartbeatAndRefreshToken() {
logger.info(`[Heartbeat] Server is running. Current time: ${new Date().toLocaleString()}`, Object.keys(services));
// 定期执行健康检查
if (providerPoolManager) {
try {
logger.info('[HealthCheck] Starting periodic health check...');
await providerPoolManager.performHealthChecks();
const stats = {};
for (const providerType in providerPoolManager.providerStatus) {
const providerStats = providerPoolManager.getProviderStats(providerType);
stats[providerType] = providerStats;
}
logger.info('[HealthCheck] Health check completed. Stats:', JSON.stringify(stats));
} catch (error) {
logger.error('[HealthCheck] Health check failed:', error.message);
}
}
// 循环遍历所有已初始化的服务适配器,并尝试刷新令牌
// if (getProviderPoolManager()) {
// await getProviderPoolManager().performHealthChecks(); // 定期执行健康检查
// }
for (const providerKey in services) {
const serviceAdapter = services[providerKey];
try {

View file

@ -265,7 +265,7 @@ async function startServer() {
initializeUIManagement(CONFIG);
// Initialize API management and get heartbeat function
const heartbeatAndRefreshToken = initializeAPIManagement(services);
const heartbeatAndRefreshToken = initializeAPIManagement(services, CONFIG);
// Create request handler
const requestHandlerInstance = createRequestHandler(CONFIG, getProviderPoolManager());