Merge pull request #326 from ggdayup/feature/health-check-alerts
feat: add health check monitoring and alerting system
This commit is contained in:
commit
ce8e8ad855
3 changed files with 126 additions and 5 deletions
|
|
@ -4,6 +4,7 @@ import { getServiceAdapter } from './adapter.js';
|
|||
import logger from '../utils/logger.js';
|
||||
import { MODEL_PROVIDER, getProtocolPrefix } from '../utils/common.js';
|
||||
import { getProviderModels } from './provider-models.js';
|
||||
import { broadcastEvent } from '../ui-modules/event-broadcast.js';
|
||||
import axios from 'axios';
|
||||
|
||||
/**
|
||||
|
|
@ -458,6 +459,90 @@ export class ProviderPoolManager {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 记录健康状态变化日志
|
||||
* @param {string} providerType - 提供商类型
|
||||
* @param {object} providerConfig - 提供商配置
|
||||
* @param {string} fromStatus - 之前状态
|
||||
* @param {string} toStatus - 当前状态
|
||||
* @param {string} [errorMessage] - 错误信息(可选)
|
||||
* @private
|
||||
*/
|
||||
_logHealthStatusChange(providerType, providerConfig, fromStatus, toStatus, errorMessage = null) {
|
||||
const customName = providerConfig.customName || providerConfig.uuid;
|
||||
const timestamp = new Date().toISOString();
|
||||
|
||||
const logEntry = {
|
||||
timestamp,
|
||||
providerType,
|
||||
uuid: providerConfig.uuid,
|
||||
customName,
|
||||
fromStatus,
|
||||
toStatus,
|
||||
errorMessage,
|
||||
usageCount: providerConfig.usageCount || 0,
|
||||
errorCount: providerConfig.errorCount || 0
|
||||
};
|
||||
|
||||
// 输出详细的状态变化日志
|
||||
if (toStatus === 'unhealthy') {
|
||||
logger.warn(`[HealthMonitor] ⚠️ Provider became UNHEALTHY: ${customName} (${providerType})`);
|
||||
logger.warn(`[HealthMonitor] Reason: ${errorMessage || 'Unknown'}`);
|
||||
logger.warn(`[HealthMonitor] Error Count: ${providerConfig.errorCount}`);
|
||||
|
||||
// 触发告警(如果配置了 Webhook)
|
||||
this._triggerHealthAlert(providerType, providerConfig, 'unhealthy', errorMessage);
|
||||
} else if (toStatus === 'healthy' && fromStatus === 'unhealthy') {
|
||||
logger.info(`[HealthMonitor] ✅ Provider recovered to HEALTHY: ${customName} (${providerType})`);
|
||||
|
||||
// 触发恢复通知
|
||||
this._triggerHealthAlert(providerType, providerConfig, 'recovered', null);
|
||||
}
|
||||
|
||||
// 广播健康状态变化事件
|
||||
broadcastEvent('health_status_change', logEntry);
|
||||
}
|
||||
|
||||
/**
|
||||
* 触发健康状态告警
|
||||
* @param {string} providerType - 提供商类型
|
||||
* @param {object} providerConfig - 提供商配置
|
||||
* @param {string} status - 状态 ('unhealthy' | 'recovered')
|
||||
* @param {string} [errorMessage] - 错误信息
|
||||
* @private
|
||||
*/
|
||||
async _triggerHealthAlert(providerType, providerConfig, status, errorMessage = null) {
|
||||
const webhookUrl = this.globalConfig?.HEALTH_ALERT_WEBHOOK_URL;
|
||||
if (!webhookUrl) {
|
||||
return; // 未配置 Webhook,跳过
|
||||
}
|
||||
|
||||
const customName = providerConfig.customName || providerConfig.uuid;
|
||||
const payload = {
|
||||
timestamp: new Date().toISOString(),
|
||||
providerType,
|
||||
uuid: providerConfig.uuid,
|
||||
customName,
|
||||
status,
|
||||
errorMessage,
|
||||
stats: {
|
||||
usageCount: providerConfig.usageCount || 0,
|
||||
errorCount: providerConfig.errorCount || 0
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const axios = (await import('axios')).default;
|
||||
await axios.post(webhookUrl, payload, {
|
||||
timeout: 5000,
|
||||
headers: { 'Content-Type': 'application/json' }
|
||||
});
|
||||
this._log('info', `Health alert sent to webhook for ${customName}: ${status}`);
|
||||
} catch (error) {
|
||||
this._log('error', `Failed to send health alert to webhook: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找指定的 provider
|
||||
* @private
|
||||
|
|
@ -880,6 +965,7 @@ export class ProviderPoolManager {
|
|||
|
||||
const provider = this._findProvider(providerType, providerConfig.uuid);
|
||||
if (provider) {
|
||||
const wasHealthy = provider.config.isHealthy;
|
||||
const now = Date.now();
|
||||
const lastErrorTime = provider.config.lastErrorTime ? new Date(provider.config.lastErrorTime).getTime() : 0;
|
||||
const errorWindowMs = 10000; // 10 秒窗口期
|
||||
|
|
@ -902,6 +988,12 @@ export class ProviderPoolManager {
|
|||
|
||||
if (this.maxErrorCount > 0 && provider.config.errorCount >= this.maxErrorCount) {
|
||||
provider.config.isHealthy = false;
|
||||
|
||||
// 健康状态变化日志
|
||||
if (wasHealthy) {
|
||||
this._logHealthStatusChange(providerType, provider.config, 'healthy', 'unhealthy', errorMessage);
|
||||
}
|
||||
|
||||
this._log('warn', `Marked provider as unhealthy: ${providerConfig.uuid} for type ${providerType}. Total errors: ${provider.config.errorCount}`);
|
||||
}
|
||||
|
||||
|
|
@ -924,6 +1016,7 @@ export class ProviderPoolManager {
|
|||
|
||||
const provider = this._findProvider(providerType, providerConfig.uuid);
|
||||
if (provider) {
|
||||
const wasHealthy = provider.config.isHealthy;
|
||||
provider.config.isHealthy = false;
|
||||
provider.config.errorCount = this.maxErrorCount; // Set to max to indicate definitive failure
|
||||
provider.config.lastErrorTime = new Date().toISOString();
|
||||
|
|
@ -933,6 +1026,11 @@ export class ProviderPoolManager {
|
|||
provider.config.lastErrorMessage = errorMessage;
|
||||
}
|
||||
|
||||
// 健康状态变化日志
|
||||
if (wasHealthy) {
|
||||
this._logHealthStatusChange(providerType, provider.config, 'healthy', 'unhealthy', errorMessage);
|
||||
}
|
||||
|
||||
this._log('warn', `Immediately marked provider as unhealthy: ${providerConfig.uuid} for type ${providerType}. Reason: ${errorMessage || 'Authentication error'}`);
|
||||
|
||||
this._debouncedSave(providerType);
|
||||
|
|
@ -992,6 +1090,7 @@ export class ProviderPoolManager {
|
|||
|
||||
const provider = this._findProvider(providerType, providerConfig.uuid);
|
||||
if (provider) {
|
||||
const wasHealthy = provider.config.isHealthy;
|
||||
provider.config.isHealthy = true;
|
||||
provider.config.errorCount = 0;
|
||||
provider.config.refreshCount = 0;
|
||||
|
|
@ -1012,6 +1111,12 @@ export class ProviderPoolManager {
|
|||
provider.config.usageCount++;
|
||||
provider.config.lastUsed = new Date().toISOString();
|
||||
}
|
||||
|
||||
// 健康状态变化日志
|
||||
if (!wasHealthy) {
|
||||
this._logHealthStatusChange(providerType, provider.config, 'unhealthy', 'healthy', null);
|
||||
}
|
||||
|
||||
this._log('info', `Marked provider as healthy: ${provider.config.uuid} for type ${providerType}${resetUsageCount ? ' (usage count reset)' : ''}`);
|
||||
|
||||
this._debouncedSave(providerType);
|
||||
|
|
|
|||
|
|
@ -62,14 +62,30 @@ export async function handleAPIRequests(method, path, req, res, currentConfig, a
|
|||
* @param {Object} services - The initialized services
|
||||
* @returns {Function} - The heartbeat and token refresh function
|
||||
*/
|
||||
export function initializeAPIManagement(services) {
|
||||
export function initializeAPIManagement(services, config = {}) {
|
||||
const providerPoolManager = getProviderPoolManager();
|
||||
const healthCheckInterval = config.HEALTH_CHECK_INTERVAL || 10 * 60 * 1000; // 默认10分钟
|
||||
|
||||
return async function heartbeatAndRefreshToken() {
|
||||
logger.info(`[Heartbeat] Server is running. Current time: ${new Date().toLocaleString()}`, Object.keys(services));
|
||||
|
||||
// 定期执行健康检查
|
||||
if (providerPoolManager) {
|
||||
try {
|
||||
logger.info('[HealthCheck] Starting periodic health check...');
|
||||
await providerPoolManager.performHealthChecks();
|
||||
const stats = {};
|
||||
for (const providerType in providerPoolManager.providerStatus) {
|
||||
const providerStats = providerPoolManager.getProviderStats(providerType);
|
||||
stats[providerType] = providerStats;
|
||||
}
|
||||
logger.info('[HealthCheck] Health check completed. Stats:', JSON.stringify(stats));
|
||||
} catch (error) {
|
||||
logger.error('[HealthCheck] Health check failed:', error.message);
|
||||
}
|
||||
}
|
||||
|
||||
// 循环遍历所有已初始化的服务适配器,并尝试刷新令牌
|
||||
// if (getProviderPoolManager()) {
|
||||
// await getProviderPoolManager().performHealthChecks(); // 定期执行健康检查
|
||||
// }
|
||||
for (const providerKey in services) {
|
||||
const serviceAdapter = services[providerKey];
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -265,7 +265,7 @@ async function startServer() {
|
|||
initializeUIManagement(CONFIG);
|
||||
|
||||
// Initialize API management and get heartbeat function
|
||||
const heartbeatAndRefreshToken = initializeAPIManagement(services);
|
||||
const heartbeatAndRefreshToken = initializeAPIManagement(services, CONFIG);
|
||||
|
||||
// Create request handler
|
||||
const requestHandlerInstance = createRequestHandler(CONFIG, getProviderPoolManager());
|
||||
|
|
|
|||
Loading…
Reference in a new issue