""" 执行管理API """ from fastapi import APIRouter, Depends, HTTPException, status from sqlalchemy import or_ from sqlalchemy.orm import Session from pydantic import BaseModel from typing import List, Optional, Dict, Any, Literal from datetime import datetime from app.core.database import get_db from app.models.execution import Execution from app.models.workflow import Workflow from app.models.agent import Agent from app.api.auth import get_current_user from app.models.user import User from app.tasks.workflow_tasks import execute_workflow_task, resume_workflow_task from app.services.agent_workspace_chat_log import ensure_agent_dialogue_logged_from_db_execution import uuid import logging logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/executions", tags=["executions"]) def _enqueue_workflow_task_safe( db: Session, execution: Execution, workflow_label: str, workflow_data: dict, input_data: dict, ) -> Execution: """ 将工作流执行任务投递到 Celery。若 Redis/Worker 不可用,回写执行记录为 failed 并抛出 503。 """ try: task = execute_workflow_task.delay( str(execution.id), workflow_label, workflow_data, input_data, ) execution.task_id = task.id db.commit() db.refresh(execution) return execution except Exception as e: logger.exception( "Celery 任务入队失败 execution_id=%s workflow_label=%s", execution.id, workflow_label, ) msg = ( "异步任务入队失败:无法连接消息队列或 Celery Worker。" "请确认 Redis 已启动、.env 中 REDIS_URL 正确,并已运行 Celery Worker(如 celery -A app.core.celery_app worker)。" f" 原始错误: {e!s}" ) try: execution.status = "failed" execution.error_message = msg[:2000] db.commit() db.refresh(execution) except Exception as e2: logger.exception("回写执行失败状态时出错: %s", e2) db.rollback() raise HTTPException(status_code=503, detail=msg[:2000]) from e class ExecutionCreate(BaseModel): """执行创建模型""" workflow_id: Optional[str] = None agent_id: Optional[str] = None input_data: Dict[str, Any] parent_execution_id: Optional[str] = None depth: Optional[int] = 0 class ExecutionResponse(BaseModel): """执行响应模型""" id: str workflow_id: Optional[str] agent_id: Optional[str] input_data: Optional[Dict[str, Any]] output_data: Optional[Dict[str, Any]] status: str error_message: Optional[str] execution_time: Optional[int] task_id: Optional[str] parent_execution_id: Optional[str] depth: int pause_state: Optional[Dict[str, Any]] = None created_at: datetime class Config: from_attributes = True def _execution_to_response(ex: Execution) -> ExecutionResponse: return ExecutionResponse( id=str(ex.id), workflow_id=str(ex.workflow_id) if ex.workflow_id is not None else None, agent_id=str(ex.agent_id) if ex.agent_id is not None else None, input_data=ex.input_data, output_data=ex.output_data, status=ex.status, error_message=ex.error_message, execution_time=ex.execution_time, task_id=str(ex.task_id) if ex.task_id is not None else None, parent_execution_id=( str(ex.parent_execution_id) if ex.parent_execution_id is not None else None ), depth=int(ex.depth) if ex.depth is not None else 0, pause_state=ex.pause_state, created_at=ex.created_at, ) class ResumeExecutionBody(BaseModel): """恢复挂起的执行(审批)""" decision: Literal["approved", "rejected"] comment: Optional[str] = None def _can_view_execution( db: Session, current_user: User, execution: Execution ) -> bool: if getattr(current_user, "role", None) == "admin": return True if execution.workflow_id: wf = db.query(Workflow).filter(Workflow.id == execution.workflow_id).first() if wf and wf.user_id == current_user.id: return True if execution.agent_id: ag = db.query(Agent).filter(Agent.id == execution.agent_id).first() if ag and ag.user_id == current_user.id: return True return False @router.post("", response_model=ExecutionResponse, status_code=status.HTTP_201_CREATED) async def create_execution( execution_data: ExecutionCreate, db: Session = Depends(get_db), current_user: User = Depends(get_current_user) ): """创建执行任务""" # 验证工作流或智能体是否存在 if execution_data.workflow_id: workflow = db.query(Workflow).filter( Workflow.id == execution_data.workflow_id, Workflow.user_id == current_user.id ).first() if not workflow: raise HTTPException(status_code=404, detail="工作流不存在") # 创建执行记录 execution = Execution( workflow_id=execution_data.workflow_id, input_data=execution_data.input_data, status="pending", parent_execution_id=execution_data.parent_execution_id, depth=max(0, int(execution_data.depth or 0)), ) db.add(execution) db.commit() db.refresh(execution) # 异步执行工作流 workflow_data = { 'nodes': workflow.nodes, 'edges': workflow.edges } ex = _enqueue_workflow_task_safe( db, execution, execution_data.workflow_id, workflow_data, execution_data.input_data, ) return _execution_to_response(ex) elif execution_data.agent_id: agent = db.query(Agent).filter(Agent.id == execution_data.agent_id).first() if not agent: raise HTTPException(status_code=404, detail="智能体不存在") # 检查权限:只有已发布的Agent可以执行,或者所有者可以测试 if agent.status not in ["published", "running"] and agent.user_id != current_user.id: raise HTTPException(status_code=403, detail="Agent未发布或无权执行") # 验证工作流配置 if not agent.workflow_config or "nodes" not in agent.workflow_config or "edges" not in agent.workflow_config: raise HTTPException(status_code=400, detail="Agent工作流配置无效") # 创建执行记录 execution = Execution( agent_id=execution_data.agent_id, input_data=execution_data.input_data, status="pending", parent_execution_id=execution_data.parent_execution_id, depth=max(0, int(execution_data.depth or 0)), ) db.add(execution) db.commit() db.refresh(execution) # 异步执行Agent工作流 workflow_data = { 'nodes': agent.workflow_config.get('nodes', []), 'edges': agent.workflow_config.get('edges', []) } # 调试:检查节点数据是否包含 api_key logger.debug(f"[rjb] Agent工作流数据: nodes数量={len(workflow_data['nodes'])}") for node in workflow_data['nodes']: if node.get('type') == 'llm': node_data = node.get('data', {}) logger.debug(f"[rjb] LLM节点: node_id={node.get('id')}, data keys={list(node_data.keys())}, api_key={'已配置' if node_data.get('api_key') else '未配置'}") ex = _enqueue_workflow_task_safe( db, execution, f"agent_{agent.id}", workflow_data, execution_data.input_data, ) return _execution_to_response(ex) else: raise HTTPException(status_code=400, detail="必须提供workflow_id或agent_id") @router.get("", response_model=List[ExecutionResponse]) async def get_executions( skip: int = 0, limit: int = 100, workflow_id: Optional[str] = None, agent_id: Optional[str] = None, status: Optional[str] = None, search: Optional[str] = None, db: Session = Depends(get_db), current_user: User = Depends(get_current_user) ): """ 获取执行记录列表(支持分页、筛选、搜索) Args: skip: 跳过记录数(分页) limit: 每页记录数(分页,最大100) workflow_id: 工作流ID筛选 status: 状态筛选(pending, running, completed, failed, awaiting_approval) search: 搜索关键词(搜索执行ID、工作流ID、任务ID) """ # 限制每页最大记录数 limit = min(limit, 100) # 管理员可看全部;普通用户:自己拥有的工作流或 Agent 上的执行记录(含纯 Agent 执行) if getattr(current_user, "role", None) == "admin": query = db.query(Execution) else: query = ( db.query(Execution) .outerjoin(Workflow, Execution.workflow_id == Workflow.id) .outerjoin(Agent, Execution.agent_id == Agent.id) .filter( or_( Workflow.user_id == current_user.id, Agent.user_id == current_user.id, ) ) ) # 工作流ID筛选 if workflow_id: query = query.filter(Execution.workflow_id == workflow_id) if agent_id: query = query.filter(Execution.agent_id == agent_id) # 状态筛选 if status: query = query.filter(Execution.status == status) # 搜索 if search: search_pattern = f"%{search}%" query = query.filter( (Execution.id.like(search_pattern)) | (Execution.workflow_id.like(search_pattern)) | (Execution.agent_id.like(search_pattern)) | (Execution.task_id.like(search_pattern)) ) # 排序和分页 executions = query.order_by(Execution.created_at.desc()).offset(skip).limit(limit).all() return [_execution_to_response(e) for e in executions] @router.get("/{execution_id}/status", response_model=Dict[str, Any]) async def get_execution_status( execution_id: str, db: Session = Depends(get_db), current_user: User = Depends(get_current_user) ): """ 获取执行状态和当前执行的节点信息 用于实时显示执行进度 """ from app.models.execution_log import ExecutionLog from sqlalchemy import desc execution = db.query(Execution).filter(Execution.id == execution_id).first() if not execution: raise HTTPException(status_code=404, detail="执行记录不存在") # 获取最新的节点执行日志 current_node = None executed_nodes = [] failed_nodes = [] # 无论执行状态如何,都获取节点执行日志(包括已完成和失败的执行) # 获取最近执行的节点(正在执行或刚完成的节点) recent_logs = db.query(ExecutionLog).filter( ExecutionLog.execution_id == execution_id, ExecutionLog.node_id.isnot(None) ).order_by(desc(ExecutionLog.timestamp)).limit(50).all() logger.info(f"[rjb] 获取执行 {execution_id} 的日志,状态: {execution.status}, 日志数量: {len(recent_logs)}") # 找出正在执行的节点(有开始日志但没有完成日志的节点) node_status = {} for log in recent_logs: node_id = log.node_id logger.debug(f"[rjb] 日志: node_id={node_id}, node_type={log.node_type}, message={log.message}, level={log.level}, data={log.data}") if node_id not in node_status: node_status[node_id] = { 'node_id': node_id, 'node_type': log.node_type, 'started': False, 'completed': False, 'failed': False, 'duration': None, 'error_message': None, 'error_type': None, 'timestamp': log.timestamp.isoformat() if log.timestamp else None } # 匹配日志消息,支持多种格式 message = log.message or '' if '开始执行' in message or '开始' in message: node_status[node_id]['started'] = True logger.debug(f"[rjb] 节点 {node_id} 标记为已开始") elif '执行完成' in message or '完成' in message: node_status[node_id]['completed'] = True node_status[node_id]['duration'] = log.duration logger.debug(f"[rjb] 节点 {node_id} 标记为已完成") elif '执行失败' in message or '失败' in message or log.level == 'ERROR': node_status[node_id]['failed'] = True # 从日志的 data 字段中提取错误信息 if log.data and isinstance(log.data, dict): if 'error' in log.data: node_status[node_id]['error_message'] = log.data.get('error') if 'error_type' in log.data: node_status[node_id]['error_type'] = log.data.get('error_type') # 如果 data 中没有错误信息,尝试从 message 中提取 if not node_status[node_id]['error_message']: # 尝试从消息中提取错误信息(格式:节点 xxx 执行失败: 错误信息) if '执行失败:' in message: error_msg = message.split('执行失败:')[-1].strip() node_status[node_id]['error_message'] = error_msg elif '失败:' in message: error_msg = message.split('失败:')[-1].strip() node_status[node_id]['error_message'] = error_msg logger.debug(f"[rjb] 节点 {node_id} 标记为失败, 错误信息: {node_status[node_id]['error_message']}") logger.info(f"[rjb] 节点状态统计: {len(node_status)} 个节点") for node_id, status in node_status.items(): logger.debug(f"[rjb] 节点 {node_id}: started={status['started']}, completed={status['completed']}, failed={status['failed']}") # 找出正在执行的节点(已开始但未完成且未失败) for node_id, status in node_status.items(): if status['started'] and not status['completed'] and not status['failed']: current_node = { 'node_id': node_id, 'node_type': status['node_type'], 'status': 'running' } logger.info(f"[rjb] 当前执行节点: {node_id} ({status['node_type']})") break # 已完成的节点 executed_nodes = [ { 'node_id': node_id, 'node_type': status['node_type'], 'status': 'completed', 'duration': status['duration'] } for node_id, status in node_status.items() if status['completed'] ] # 失败的节点(包含错误信息) failed_nodes = [ { 'node_id': node_id, 'node_type': status['node_type'], 'status': 'failed', 'error_message': status.get('error_message'), 'error_type': status.get('error_type') } for node_id, status in node_status.items() if status['failed'] ] logger.info(f"[rjb] 执行状态汇总: current_node={current_node}, executed={len(executed_nodes)}, failed={len(failed_nodes)}") return { 'execution_id': execution.id, 'status': execution.status, 'current_node': current_node, 'executed_nodes': executed_nodes, 'failed_nodes': failed_nodes, 'execution_time': execution.execution_time } @router.post("/{execution_id}/resume", response_model=ExecutionResponse) async def resume_execution( execution_id: str, body: ResumeExecutionBody, db: Session = Depends(get_db), current_user: User = Depends(get_current_user), ): """在审批挂起(awaiting_approval)后恢复执行。""" execution = db.query(Execution).filter(Execution.id == execution_id).first() if not execution: raise HTTPException(status_code=404, detail="执行记录不存在") if not _can_view_execution(db, current_user, execution): raise HTTPException(status_code=403, detail="无权访问") if execution.status != "awaiting_approval": raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=f"当前状态不可恢复: {execution.status}", ) if not execution.pause_state: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail="缺少挂起快照,无法恢复", ) task = resume_workflow_task.delay( str(execution.id), body.decision, body.comment, ) execution.task_id = task.id db.commit() db.refresh(execution) return execution @router.get("/{execution_id}", response_model=ExecutionResponse) async def get_execution( execution_id: str, db: Session = Depends(get_db), current_user: User = Depends(get_current_user) ): """获取执行详情""" execution = db.query(Execution).filter(Execution.id == execution_id).first() if not execution: raise HTTPException(status_code=404, detail="执行记录不存在") if not _can_view_execution(db, current_user, execution): raise HTTPException(status_code=403, detail="无权访问") # 补救:若 Celery Worker 未带对话落盘逻辑,首次拉取已完成执行时由 API 进程写入 dialogue.md(与任务内写入幂等) if execution.status == "completed": try: ensure_agent_dialogue_logged_from_db_execution(db, execution) except Exception: logger.debug("补写智能体对话 MD 跳过", exc_info=True) return _execution_to_response(execution)