From 32011512557ebc3537cbd4bc3a24aca477b0b5e2 Mon Sep 17 00:00:00 2001
From: liushuang
Date: Sat, 11 Oct 2025 09:31:50 +0800
Subject: [PATCH] add

---
 20251011.md | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 20251011.md

diff --git a/20251011.md b/20251011.md
new file mode 100644
index 0000000..99e752d
--- /dev/null
+++ b/20251011.md
@@ -0,0 +1,147 @@
+## 崩溃原因
+
+- 一个请求被发送到 vLLM 服务,该请求包含了结构化输出的意图。
+- vLLM 在处理请求时,进入 structured_output_request.structured_output_key 的逻辑,最终调用 get_structured_output_key(sampling_params) 函数。
+- 该函数遍历 sampling_params 中的各种参数(如 grammar, json_schema, regex 等),但没有找到任何一个有效的结构化输出参数。
+- 因此抛出异常:raise ValueError("No valid structured output parameter found")
+- 这个 ValueError 在用于构建 grammar 的后台线程中抛出,但未被捕获,导致 EngineCore 进程(PID 2738693)崩溃。
+- 主 API 服务(APIServer)检测到 EngineCore 崩溃,抛出 EngineDeadError,最终整个服务终止。
+
+## 解决方案一
+
+确保请求进入 vLLM 时提供有效的结构化输出参数(即 grammar、json_schema、regex 等至少有一项非空),不要发送空的或为 null 的结构化输出字段。例如:
+
+```json
+{
+  "prompt": "生成一个用户信息",
+  "structured_output": {
+    "type": "json",
+    "schema": {
+      "type": "object",
+      "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer"}
+      },
+      "required": ["name", "age"]
+    }
+  }
+}
+```
+
+### 错误示例
+
+```json
+{
+  "prompt": "生成一个JSON",
+  "structured_output": {}
+}
+```
+
+```json
+{
+  "prompt": "生成一个JSON",
+  "json_schema": null
+}
+```
+
+## 解决方案二
+
+升级 vLLM 到最新稳定版:后续版本对结构化输出参数的校验和异常处理做了大量优化和改进,此类无效请求通常会在 API 入口处被直接拒绝并返回错误,而不会在 EngineCore 后台线程中抛出未捕获异常导致整个引擎崩溃。
+
+## 崩溃日志片段
+
+```text
+(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] EngineCore encountered a fatal error. 
+(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] Traceback (most recent call last): +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 701, in run_engine_core +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] engine_core.run_busy_loop() +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 728, in run_busy_loop +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] self._process_engine_step() +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 754, in _process_engine_step +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] outputs, model_executed = self.step_fn() +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 283, in step +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] scheduler_output = self.scheduler.schedule() +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/core/sched/scheduler.py", line 359, in schedule +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] if structured_output_req and structured_output_req.grammar: +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 45, in grammar +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] completed = self._check_grammar_completion() +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 33, in 
_check_grammar_completion +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] self._grammar = self._grammar.result(timeout=0.0001) +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 458, in result +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] return self.__get_result() +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] raise self._exception +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/thread.py", line 58, in run +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] result = self.fn(*self.args, **self.kwargs) +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/__init__.py", line 128, in _async_create_grammar +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] key = request.structured_output_request.structured_output_key # type: ignore[union-attr] +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/functools.py", line 981, in __get__ +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] val = self.func(instance) +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 58, in structured_output_key +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] return get_structured_output_key(self.sampling_params) +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 86, in 
get_structured_output_key +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] raise ValueError("No valid structured output parameter found") +(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] ValueError: No valid structured output parameter found +(EngineCore_DP0 pid=2738693) Process EngineCore_DP0: +(EngineCore_DP0 pid=2738693) Traceback (most recent call last): +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] AsyncLLM output_handler failed. +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] Traceback (most recent call last): +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 439, in output_handler +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] outputs = await engine_core.get_output_async() +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 846, in get_output_async +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] raise self._format_exception(outputs) from None +(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause. 
+(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap +(EngineCore_DP0 pid=2738693) self.run() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/multiprocessing/process.py", line 108, in run +(EngineCore_DP0 pid=2738693) self._target(*self._args, **self._kwargs) +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core +(EngineCore_DP0 pid=2738693) raise e +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 701, in run_engine_core +(EngineCore_DP0 pid=2738693) engine_core.run_busy_loop() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 728, in run_busy_loop +(EngineCore_DP0 pid=2738693) self._process_engine_step() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 754, in _process_engine_step +(EngineCore_DP0 pid=2738693) outputs, model_executed = self.step_fn() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 283, in step +(EngineCore_DP0 pid=2738693) scheduler_output = self.scheduler.schedule() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/core/sched/scheduler.py", line 359, in schedule +(EngineCore_DP0 pid=2738693) if structured_output_req and structured_output_req.grammar: +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 45, in grammar +(EngineCore_DP0 pid=2738693) completed = self._check_grammar_completion() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 33, in _check_grammar_completion 
+(EngineCore_DP0 pid=2738693) self._grammar = self._grammar.result(timeout=0.0001) +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 458, in result +(EngineCore_DP0 pid=2738693) return self.__get_result() +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result +(EngineCore_DP0 pid=2738693) raise self._exception +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/thread.py", line 58, in run +(EngineCore_DP0 pid=2738693) result = self.fn(*self.args, **self.kwargs) +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/__init__.py", line 128, in _async_create_grammar +(EngineCore_DP0 pid=2738693) key = request.structured_output_request.structured_output_key # type: ignore[union-attr] +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/functools.py", line 981, in __get__ +(EngineCore_DP0 pid=2738693) val = self.func(instance) +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 58, in structured_output_key +(EngineCore_DP0 pid=2738693) return get_structured_output_key(self.sampling_params) +(EngineCore_DP0 pid=2738693) File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 86, in get_structured_output_key +(EngineCore_DP0 pid=2738693) raise ValueError("No valid structured output parameter found") +(EngineCore_DP0 pid=2738693) ValueError: No valid structured output parameter found +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] Error in chat completion stream generator. 
+(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] Traceback (most recent call last): +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/entrypoints/openai/serving_chat.py", line 574, in chat_completion_stream_generator +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] async for res in result_generator: +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 387, in generate +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] out = q.get_nowait() or await q.get() +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/output_processor.py", line 59, in get +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] raise output +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 439, in output_handler +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] outputs = await engine_core.get_output_async() +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 846, in get_output_async +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] raise self._format_exception(outputs) from None +(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause. +[rank0]:[W1010 10:43:10.666166970 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +(APIServer pid=2738423) INFO: Shutting down +(APIServer pid=2738423) INFO: Waiting for application shutdown. +(APIServer pid=2738423) INFO: Application shutdown complete. +(APIServer pid=2738423) INFO: Finished server process [2738423] +``` \ No newline at end of file