#!/home/renjianbo/miniconda3/envs/myenv/bin/python
# -*- coding: utf-8 -*-
"""
Port monitoring script.

Monitors the port usage and service status of the Flask application.
"""

import os
import sys
import time
import json
import re
import subprocess
import logging
from datetime import datetime
from pathlib import Path

# Configuration
PORT = 5002
APP_NAME = "flask_prompt_master"
PID_FILE = "logs/gunicorn.pid"
LOG_FILE = "logs/port_monitor.log"
CONFIG_FILE = "logs/monitor_config.json"
ALERT_FILE = "logs/port_alerts.log"

# Minimum number of seconds between two automatic restart attempts.
RESTART_COOLDOWN_SECONDS = 300


def _build_log_handlers():
    """Return the logging handlers: a log-file handler (if possible) + console.

    The log directory is created first because ``logging.FileHandler`` opens
    its file immediately and fails when the parent directory is missing.  If
    the file cannot be opened at all, logging falls back to console only
    instead of aborting the import.
    """
    handlers = [logging.StreamHandler()]
    try:
        Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
        # Log messages are non-ASCII, so do not depend on the locale encoding.
        handlers.insert(0, logging.FileHandler(LOG_FILE, encoding='utf-8'))
    except OSError as e:
        # Logging is not configured yet, so report directly on stderr.
        print("无法打开日志文件 {}: {}".format(LOG_FILE, e), file=sys.stderr)
    return handlers


# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_build_log_handlers()
)
logger = logging.getLogger(__name__)


class PortMonitor:
    """Watchdog for the Gunicorn-served Flask application listening on PORT.

    It checks the listening socket, the Gunicorn master process recorded in
    the PID file and the HTTP response of the service, collects basic system
    metrics and can restart the service through ``scripts/port_manager.sh``.
    """

    def __init__(self):
        # The project root is the parent of the directory holding this script.
        self.project_dir = Path(__file__).parent.parent
        self.pid_file = self.project_dir / PID_FILE
        self.config_file = self.project_dir / CONFIG_FILE
        self.alert_file = self.project_dir / ALERT_FILE

        # Make sure the log directory exists
        self.project_dir.mkdir(exist_ok=True)
        (self.project_dir / "logs").mkdir(exist_ok=True)

        # Load the configuration
        self.config = self.load_config()

    def load_config(self):
        """Load the monitor configuration.

        Returns:
            dict: the built-in defaults, overridden by the values found in
            the JSON config file when that file exists and can be parsed.
            A load failure is logged and the defaults are returned.
        """
        default_config = {
            "check_interval": 30,        # seconds between two checks
            "max_restart_attempts": 3,   # maximum number of restart attempts
            "alert_threshold": 2,        # alert threshold
            "port_timeout": 5,           # timeout of the HTTP check (seconds)
            "enable_alerts": True,       # write alerts to the alert file
            "auto_restart": True,        # restart the service automatically
        }

        if self.config_file.exists():
            try:
                with open(self.config_file, 'r', encoding='utf-8') as f:
                    config = json.load(f)
                default_config.update(config)
            except (OSError, ValueError, TypeError) as e:
                # ValueError covers malformed JSON; TypeError/ValueError cover
                # a top-level JSON value that is not an object.
                logger.error("加载配置文件失败: %s", e)

        return default_config

    def save_config(self):
        """Write the current configuration to the JSON config file.

        Failures are logged, not raised.
        """
        try:
            with open(self.config_file, 'w', encoding='utf-8') as f:
                json.dump(self.config, f, indent=2, ensure_ascii=False)
        except (OSError, TypeError, ValueError) as e:
            logger.error("保存配置文件失败: %s", e)

    def run_command(self, cmd, timeout=10, cwd=None):
        """Run a shell command and capture its output.

        Args:
            cmd: command line, interpreted by the shell.
            timeout: seconds to wait before giving up.
            cwd: optional working directory for the command.

        Returns:
            tuple: ``(success, stdout, stderr)`` where ``success`` is True
            when the command exited with status 0.  On timeout or any other
            failure ``success`` is False and ``stderr`` holds the reason.
        """
        try:
            result = subprocess.run(
                cmd,
                shell=True,
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd=cwd
            )
            return result.returncode == 0, result.stdout, result.stderr
        except subprocess.TimeoutExpired:
            return False, "", "命令执行超时"
        except Exception as e:
            # Best-effort boundary: callers only look at the returned tuple.
            return False, "", str(e)

    @staticmethod
    def _parse_listening_pids(output):
        """Extract the distinct PIDs from ``ss -p`` output, in first-seen order.

        PIDs are matched by pattern instead of by column index: for
        ``ss -tlnp`` the process information is the 6th field (index 5), so
        the previous ``parts[6]`` lookup never matched, and one socket can be
        shared by several processes (``pid=`` occurs more than once per row).
        """
        return list(dict.fromkeys(re.findall(r'pid=(\d+)', output)))

    def check_port_usage(self):
        """Check whether something is listening on PORT.

        Returns:
            tuple: ``(occupied, pids)`` where ``pids`` is a list of PID
            strings owning the listening socket (possibly empty, e.g. when
            the process information is not visible to the current user).
        """
        cmd = f"ss -tlnp | grep ':{PORT} '"
        success, stdout, stderr = self.run_command(cmd)

        if success and stdout.strip():
            return True, self._parse_listening_pids(stdout)
        return False, []

    def check_gunicorn_process(self):
        """Check the state of the Gunicorn master process from the PID file.

        Returns:
            tuple: ``(running, status_message)``.
        """
        if not self.pid_file.exists():
            return False, "PID文件不存在"

        try:
            with open(self.pid_file, 'r') as f:
                pid_text = f.read().strip()

            if not pid_text:
                return False, "PID文件为空"

            # Validate before the value reaches a shell command line; a
            # non-numeric PID file raises ValueError and is reported below.
            pid = int(pid_text)

            # Check that the process exists
            success, stdout, stderr = self.run_command(f"ps -p {pid}")

            if success and stdout.strip():
                # Count the worker processes
                cmd = "ps aux | grep 'gunicorn.*run_dev:app' | grep -v grep | wc -l"
                success, stdout, stderr = self.run_command(cmd)

                if success:
                    worker_count = int(stdout.strip())
                    return True, f"主进程 {pid}, 工作进程 {worker_count}"

            return False, f"进程 {pid} 不存在"

        except (OSError, ValueError) as e:
            return False, "检查进程失败: {}".format(e)

    def check_service_response(self):
        """Check that the service answers HTTP 200 on ``/``.

        Returns:
            tuple: ``(ok, message)``.
        """
        timeout = self.config['port_timeout']
        # --max-time lets curl terminate itself; otherwise only the shell
        # wrapper would be killed when the subprocess timeout expires.
        cmd = (
            f"curl -s -o /dev/null --max-time {timeout} "
            f"-w '%{{http_code}}' http://localhost:{PORT}/"
        )
        success, stdout, stderr = self.run_command(cmd, timeout=timeout)

        if success and stdout.strip() == '200':
            return True, "服务正常响应"
        return False, "服务无响应 (HTTP: {})".format(stdout.strip())

    def _read_metric(self, cmd):
        """Run *cmd* and return its output as a float, or None if unavailable."""
        success, stdout, stderr = self.run_command(cmd)
        if not success:
            return None
        try:
            return float(stdout.strip())
        except ValueError:
            # A shell pipeline reports the exit status of its last command,
            # so "success" does not guarantee parsable (or any) output.
            return None

    def get_system_info(self):
        """Collect CPU, memory and disk usage percentages.

        Returns:
            dict: may contain ``cpu_usage``, ``memory_usage`` and
            ``disk_usage`` (floats); a metric that cannot be read or parsed
            is left out instead of raising.
        """
        commands = {
            # CPU usage
            'cpu_usage': "top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1",
            # Memory usage
            'memory_usage': "free | grep Mem | awk '{printf \"%.2f\", $3/$2 * 100.0}'",
            # Disk usage
            'disk_usage': "df / | tail -1 | awk '{print $5}' | sed 's/%//'",
        }

        info = {}
        for key, cmd in commands.items():
            value = self._read_metric(cmd)
            if value is not None:
                info[key] = value
        return info

    def log_alert(self, message, level="WARNING"):
        """Log an alert and, if alerts are enabled, append it to the alert file.

        Args:
            message: alert text.
            level: logging level name (e.g. ``"WARNING"``, ``"ERROR"``);
                unknown names are logged as warnings.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        alert_msg = f"[{timestamp}] [{level}] {message}"

        # Honour the requested level instead of always logging a warning.
        log_level = getattr(logging, str(level).upper(), None)
        if not isinstance(log_level, int):
            log_level = logging.WARNING
        logger.log(log_level, alert_msg)

        if self.config['enable_alerts']:
            try:
                with open(self.alert_file, 'a', encoding='utf-8') as f:
                    f.write(alert_msg + '\n')
            except OSError as e:
                logger.error("写入告警文件失败: %s", e)

    def restart_service(self):
        """Restart the service via ``scripts/port_manager.sh`` (stop, then start).

        Returns:
            bool: True when both the stop and the start command succeeded.
        """
        logger.info("尝试重启服务...")

        # Stop the service.  The working directory is passed via ``cwd`` so a
        # project path containing spaces or shell metacharacters is safe.
        success, stdout, stderr = self.run_command(
            "./scripts/port_manager.sh stop", cwd=self.project_dir
        )
        if not success:
            logger.error("停止服务失败: %s", stderr)
            return False

        time.sleep(2)

        # Start the service
        success, stdout, stderr = self.run_command(
            "./scripts/port_manager.sh start", cwd=self.project_dir
        )
        if not success:
            logger.error("启动服务失败: %s", stderr)
            return False

        logger.info("服务重启成功")
        return True

    def generate_report(self):
        """Run every check once and return the results as a dict.

        Returns:
            dict: with keys ``timestamp``, ``port_usage``,
            ``gunicorn_status``, ``service_response``, ``system_info`` and
            ``recommendations``.
        """
        # Check the port
        port_occupied, processes = self.check_port_usage()
        # Check the Gunicorn process
        gunicorn_running, status = self.check_gunicorn_process()
        # Check the service response
        service_ok, response = self.check_service_response()
        # Collect system information
        sys_info = self.get_system_info()

        recommendations = []
        if not port_occupied:
            recommendations.append("端口未被占用,可能需要启动服务")
        if not gunicorn_running:
            recommendations.append("Gunicorn进程未运行,需要重启服务")
        if not service_ok:
            recommendations.append("服务无响应,需要检查服务状态")

        # Check the system resources
        if sys_info.get('cpu_usage', 0) > 80:
            recommendations.append("CPU使用率过高,建议优化或扩容")
        if sys_info.get('memory_usage', 0) > 90:
            recommendations.append("内存使用率过高,建议增加内存或优化应用")
        if sys_info.get('disk_usage', 0) > 85:
            recommendations.append("磁盘使用率过高,建议清理日志或扩容")

        return {
            "timestamp": datetime.now().isoformat(),
            "port_usage": {
                "occupied": port_occupied,
                "processes": processes
            },
            "gunicorn_status": {
                "running": gunicorn_running,
                "status": status
            },
            "service_response": {
                "ok": service_ok,
                "response": response
            },
            "system_info": sys_info,
            "recommendations": recommendations
        }

    def monitor_loop(self):
        """Main monitoring loop: check, restart if needed, sleep, repeat.

        Runs until interrupted with Ctrl-C.  Automatic restarts are limited
        to ``max_restart_attempts`` consecutive attempts, at least
        RESTART_COOLDOWN_SECONDS apart; the attempt counter is reset as soon
        as a check finds the service healthy again.
        """
        logger.info("开始端口监控...")

        restart_count = 0
        last_restart_time = 0

        try:
            while True:
                try:
                    # Generate the report
                    report = self.generate_report()

                    # Decide whether a restart is needed
                    need_restart = False

                    if not report["gunicorn_status"]["running"]:
                        logger.warning("Gunicorn进程未运行")
                        need_restart = True

                    if not report["service_response"]["ok"]:
                        logger.warning("服务无响应")
                        need_restart = True

                    if not need_restart:
                        # Healthy again: give the next incident a fresh
                        # budget instead of a lifetime cap on restarts.
                        restart_count = 0
                    elif self.config['auto_restart']:
                        current_time = time.time()
                        within_limits = (
                            current_time - last_restart_time > RESTART_COOLDOWN_SECONDS
                            and restart_count < self.config['max_restart_attempts']
                        )

                        if within_limits:
                            self.log_alert("检测到服务异常,尝试自动重启")
                            # Count the attempt itself, so a restart that
                            # keeps failing is also subject to the cooldown
                            # and the attempt limit.
                            restart_count += 1
                            last_restart_time = current_time
                            if not self.restart_service():
                                self.log_alert("自动重启失败", "ERROR")
                        else:
                            self.log_alert("达到重启限制,跳过自动重启", "ERROR")

                    # Log the status
                    status_msg = f"端口占用: {report['port_usage']['occupied']}, " \
                                 f"Gunicorn: {report['gunicorn_status']['running']}, " \
                                 f"服务响应: {report['service_response']['ok']}"
                    logger.info(status_msg)

                except Exception as e:
                    # Top-level boundary of the daemon loop: log and go on.
                    logger.error("监控过程中发生错误: %s", e)

                # Wait for the next check
                time.sleep(self.config['check_interval'])
        except KeyboardInterrupt:
            # Covers Ctrl-C during the checks as well as during the sleep.
            logger.info("监控被用户中断")

    def show_status(self):
        """Print a human-readable status report to stdout."""
        report = self.generate_report()
        port_usage = report['port_usage']
        gunicorn = report['gunicorn_status']
        service = report['service_response']
        sys_info = report['system_info']

        print("=== Flask应用监控状态 ===")
        print(f"检查时间: {report['timestamp']}")
        print()

        print("1. 端口占用检查:")
        if port_usage['occupied']:
            print(f" ✅ 端口 {PORT} 被占用")
            print(f" 进程: {', '.join(port_usage['processes'])}")
        else:
            print(f" ❌ 端口 {PORT} 未被占用")
        print()

        print("2. Gunicorn进程检查:")
        if gunicorn['running']:
            print(f" ✅ {gunicorn['status']}")
        else:
            print(f" ❌ {gunicorn['status']}")
        print()

        print("3. 服务响应检查:")
        if service['ok']:
            print(f" ✅ {service['response']}")
        else:
            print(f" ❌ {service['response']}")
        print()

        print("4. 系统资源:")
        print(f" CPU使用率: {sys_info.get('cpu_usage', 'N/A')}%")
        print(f" 内存使用率: {sys_info.get('memory_usage', 'N/A')}%")
        print(f" 磁盘使用率: {sys_info.get('disk_usage', 'N/A')}%")

        if report['recommendations']:
            print()
            print("5. 建议:")
            for i, rec in enumerate(report['recommendations'], 1):
                print(f" {i}. {rec}")


def main():
    """Command-line entry point.

    Usage: ``port_monitor.py [status|report|restart|monitor|config]``;
    without an argument the monitoring loop is started.
    """
    monitor = PortMonitor()

    if len(sys.argv) > 1:
        command = sys.argv[1]

        if command == "status":
            monitor.show_status()
        elif command == "report":
            report = monitor.generate_report()
            print(json.dumps(report, indent=2, ensure_ascii=False))
        elif command == "restart":
            monitor.restart_service()
        elif command == "monitor":
            monitor.monitor_loop()
        elif command == "config":
            print(json.dumps(monitor.config, indent=2, ensure_ascii=False))
        else:
            print("用法: python port_monitor.py [status|report|restart|monitor|config]")
    else:
        monitor.monitor_loop()


if __name__ == "__main__":
    main()