vllm.envs ¶
CMAKE_BUILD_TYPE module-attribute
¶
VLLM_ALL2ALL_BACKEND module-attribute
¶
VLLM_ALL2ALL_BACKEND: Literal[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
"flashinfer_all2allv",
] = "allgather_reducescatter"
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE module-attribute
¶
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = (
False
)
VLLM_ALLOW_INSECURE_SERIALIZATION module-attribute
¶
VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING module-attribute
¶
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING module-attribute
¶
VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True
VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION module-attribute
¶
VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
VLLM_FLASHINFER_MOE_BACKEND module-attribute
¶
VLLM_FLASHINFER_MOE_BACKEND: Literal[
"throughput", "latency"
] = "throughput"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH module-attribute
¶
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS module-attribute
¶
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
VLLM_KV_CACHE_LAYOUT module-attribute
¶
VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES module-attribute
¶
VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
VLLM_LOGITS_PROCESSOR_THREADS module-attribute
¶
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE module-attribute
¶
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME module-attribute
¶
VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = (
"VLLM_OBJECT_STORAGE_SHM_BUFFER"
)
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16 module-attribute
¶
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB module-attribute
¶
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION module-attribute
¶
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal[
"FP", "INT8", "INT6", "INT4", "NONE"
] = "NONE"
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION module-attribute
¶
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL module-attribute
¶
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS module-attribute
¶
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_TORCH_PROFILER_RECORD_SHAPES module-attribute
¶
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY module-attribute
¶
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 module-attribute
¶
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 module-attribute
¶
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS module-attribute
¶
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE module-attribute
¶
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal[
"auto", "nccl", "shm"
] = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM module-attribute
¶
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION module-attribute
¶
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_WORKER_MULTIPROC_METHOD module-attribute
¶
VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = (
"fork"
)
VLLM_XLA_CACHE_PATH module-attribute
¶
VLLM_XLA_CACHE_PATH: str = join(
VLLM_CACHE_ROOT, "xla_cache"
)
environment_variables module-attribute
¶
environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TARGET_DEVICE": lambda: lower(),
"VLLM_MAIN_CUDA_VERSION": lambda: lower() or "12.8",
"MAX_JOBS": lambda: getenv("MAX_JOBS", None),
"NVCC_THREADS": lambda: getenv("NVCC_THREADS", None),
"VLLM_USE_PRECOMPILED": lambda: lower() in ("1", "true")
or bool(get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
"VLLM_DOCKER_BUILD_CONTEXT": lambda: lower()
in ("1", "true"),
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
int(
getenv(
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL",
"0",
)
)
),
"CMAKE_BUILD_TYPE": env_with_choices(
"CMAKE_BUILD_TYPE",
None,
["Debug", "Release", "RelWithDebInfo"],
),
"VERBOSE": lambda: bool(int(getenv("VERBOSE", "0"))),
"VLLM_CONFIG_ROOT": lambda: expanduser(
getenv(
"VLLM_CONFIG_ROOT",
join(get_default_config_root(), "vllm"),
)
),
"VLLM_CACHE_ROOT": lambda: expanduser(
getenv(
"VLLM_CACHE_ROOT",
join(get_default_cache_root(), "vllm"),
)
),
"VLLM_HOST_IP": lambda: getenv("VLLM_HOST_IP", ""),
"VLLM_PORT": get_vllm_port,
"VLLM_RPC_BASE_PATH": lambda: getenv(
"VLLM_RPC_BASE_PATH", gettempdir()
),
"VLLM_USE_MODELSCOPE": lambda: lower() == "true",
"VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
),
"CUDA_HOME": lambda: get("CUDA_HOME", None),
"VLLM_NCCL_SO_PATH": lambda: get(
"VLLM_NCCL_SO_PATH", None
),
"LD_LIBRARY_PATH": lambda: get("LD_LIBRARY_PATH", None),
"VLLM_USE_TRITON_FLASH_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: lower()
in ("true", "1"),
"VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
get("VLLM_FLASH_ATTN_VERSION", None)
),
"VLLM_USE_STANDALONE_COMPILE": lambda: get(
"VLLM_USE_STANDALONE_COMPILE", "0"
)
== "1",
"VLLM_PATTERN_MATCH_DEBUG": lambda: get(
"VLLM_PATTERN_MATCH_DEBUG", None
),
"VLLM_DEBUG_DUMP_PATH": lambda: get(
"VLLM_DEBUG_DUMP_PATH", None
),
"LOCAL_RANK": lambda: int(get("LOCAL_RANK", "0")),
"CUDA_VISIBLE_DEVICES": lambda: get(
"CUDA_VISIBLE_DEVICES", None
),
"VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
),
"VLLM_API_KEY": lambda: get("VLLM_API_KEY", None),
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: lower()
== "true",
"S3_ACCESS_KEY_ID": lambda: get(
"S3_ACCESS_KEY_ID", None
),
"S3_SECRET_ACCESS_KEY": lambda: get(
"S3_SECRET_ACCESS_KEY", None
),
"S3_ENDPOINT_URL": lambda: get("S3_ENDPOINT_URL", None),
"VLLM_USAGE_STATS_SERVER": lambda: get(
"VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
),
"VLLM_NO_USAGE_STATS": lambda: get(
"VLLM_NO_USAGE_STATS", "0"
)
== "1",
"VLLM_DISABLE_FLASHINFER_PREFILL": lambda: get(
"VLLM_DISABLE_FLASHINFER_PREFILL", "0"
)
== "1",
"VLLM_DO_NOT_TRACK": lambda: (
get("VLLM_DO_NOT_TRACK", None)
or get("DO_NOT_TRACK", None)
or "0"
)
== "1",
"VLLM_USAGE_SOURCE": lambda: get(
"VLLM_USAGE_SOURCE", "production"
),
"VLLM_CONFIGURE_LOGGING": lambda: int(
getenv("VLLM_CONFIGURE_LOGGING", "1")
),
"VLLM_LOGGING_CONFIG_PATH": lambda: getenv(
"VLLM_LOGGING_CONFIG_PATH"
),
"VLLM_LOGGING_LEVEL": lambda: upper(),
"VLLM_LOGGING_STREAM": lambda: getenv(
"VLLM_LOGGING_STREAM", "ext://sys.stdout"
),
"VLLM_LOGGING_PREFIX": lambda: getenv(
"VLLM_LOGGING_PREFIX", ""
),
"VLLM_LOGITS_PROCESSOR_THREADS": lambda: int(
getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")
)
if "VLLM_LOGITS_PROCESSOR_THREADS" in environ
else None,
"VLLM_LOG_STATS_INTERVAL": lambda: val
if (
val := (
float(getenv("VLLM_LOG_STATS_INTERVAL", "10."))
)
)
> 0.0
else 10.0,
"VLLM_TRACE_FUNCTION": lambda: int(
getenv("VLLM_TRACE_FUNCTION", "0")
),
"VLLM_ATTENTION_BACKEND": env_with_choices(
"VLLM_ATTENTION_BACKEND", None, lambda: list(keys())
),
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
int(environ["VLLM_USE_FLASHINFER_SAMPLER"])
)
if "VLLM_USE_FLASHINFER_SAMPLER" in environ
else None,
"VLLM_PP_LAYER_PARTITION": lambda: getenv(
"VLLM_PP_LAYER_PARTITION", None
),
"VLLM_CPU_KVCACHE_SPACE": lambda: int(
getenv("VLLM_CPU_KVCACHE_SPACE", "0")
)
if "VLLM_CPU_KVCACHE_SPACE" in environ
else None,
"VLLM_CPU_OMP_THREADS_BIND": lambda: getenv(
"VLLM_CPU_OMP_THREADS_BIND", "auto"
),
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
)
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in environ
else None,
"VLLM_CPU_MOE_PREPACK": lambda: bool(
int(getenv("VLLM_CPU_MOE_PREPACK", "1"))
),
"VLLM_CPU_SGL_KERNEL": lambda: bool(
int(getenv("VLLM_CPU_SGL_KERNEL", "0"))
),
"VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
int(getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
int(getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": env_with_choices(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE",
"auto",
["auto", "nccl", "shm"],
),
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
int(
getenv(
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM",
"0",
)
)
),
"VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
int(getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
),
"VLLM_WORKER_MULTIPROC_METHOD": env_with_choices(
"VLLM_WORKER_MULTIPROC_METHOD",
"fork",
["spawn", "fork"],
),
"VLLM_ASSETS_CACHE": lambda: expanduser(
getenv(
"VLLM_ASSETS_CACHE",
join(
get_default_cache_root(), "vllm", "assets"
),
)
),
"VLLM_ASSETS_CACHE_MODEL_CLEAN": lambda: bool(
int(getenv("VLLM_ASSETS_CACHE_MODEL_CLEAN", "0"))
),
"VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
),
"VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
),
"VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
),
"VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool(
int(getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))
),
"VLLM_MEDIA_LOADING_THREAD_COUNT": lambda: int(
getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")
),
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
),
"VLLM_VIDEO_LOADER_BACKEND": lambda: getenv(
"VLLM_VIDEO_LOADER_BACKEND", "opencv"
),
"VLLM_MM_INPUT_CACHE_GIB": lambda: int(
getenv("VLLM_MM_INPUT_CACHE_GIB", "4")
),
"VLLM_XLA_CACHE_PATH": lambda: expanduser(
getenv(
"VLLM_XLA_CACHE_PATH",
join(
get_default_cache_root(),
"vllm",
"xla_cache",
),
)
),
"VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
int(getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
),
"VLLM_XLA_USE_SPMD": lambda: bool(
int(getenv("VLLM_XLA_USE_SPMD", "0"))
),
"VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
),
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool(
int(
getenv(
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING",
"1",
)
)
),
"VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
),
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_FP8_MARLIN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_LOAD_FORMAT": lambda: getenv(
"VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
),
"VLLM_RPC_TIMEOUT": lambda: int(
getenv("VLLM_RPC_TIMEOUT", "10000")
),
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
),
"VLLM_PLUGINS": lambda: None
if "VLLM_PLUGINS" not in environ
else split(","),
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: getenv(
"VLLM_LORA_RESOLVER_CACHE_DIR", None
),
"VLLM_TORCH_PROFILER_DIR": lambda: None
if getenv("VLLM_TORCH_PROFILER_DIR", None) is None
else abspath(
expanduser(getenv("VLLM_TORCH_PROFILER_DIR", "."))
),
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0")
!= "0"
),
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
getenv(
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0"
)
!= "0"
),
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
),
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
),
"VLLM_USE_TRITON_AWQ": lambda: bool(
int(getenv("VLLM_USE_TRITON_AWQ", "0"))
),
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: lower()
in ("1", "true"),
"VLLM_SKIP_P2P_CHECK": lambda: getenv(
"VLLM_SKIP_P2P_CHECK", "1"
)
== "1",
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in environ
else split(","),
"VLLM_DISABLE_PYNCCL": lambda: lower() in ("true", "1"),
"VLLM_USE_V1": lambda: bool(
int(getenv("VLLM_USE_V1", "1"))
),
"VLLM_ROCM_USE_AITER": lambda: lower() in ("true", "1"),
"VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_LINEAR": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MOE": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_RMSNORM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MLA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MHA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_FP4_ASM_GEMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_TRITON_ROPE": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_FP8BMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_SKINNY_GEMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_FP8_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_FP8_PADDING", "1"))
),
"VLLM_ROCM_MOE_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_MOE_PADDING", "1"))
),
"VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": env_with_choices(
"VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
"NONE",
["FP", "INT8", "INT6", "INT4", "NONE"],
),
"VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int(
get(
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None
)
),
"Q_SCALE_CONSTANT": lambda: int(
getenv("Q_SCALE_CONSTANT", "200")
),
"K_SCALE_CONSTANT": lambda: int(
getenv("K_SCALE_CONSTANT", "200")
),
"V_SCALE_CONSTANT": lambda: int(
getenv("V_SCALE_CONSTANT", "100")
),
"VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
int(getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
),
"VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
),
"VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
int(getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
),
"VLLM_SERVER_DEV_MODE": lambda: bool(
int(getenv("VLLM_SERVER_DEV_MODE", "0"))
),
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
),
"VLLM_MLA_DISABLE": lambda: bool(
int(getenv("VLLM_MLA_DISABLE", "0"))
),
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(
getenv(
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
"32",
)
),
"VLLM_RAY_PER_WORKER_GPUS": lambda: float(
getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
),
"VLLM_RAY_BUNDLE_INDICES": lambda: getenv(
"VLLM_RAY_BUNDLE_INDICES", ""
),
"VLLM_CUDART_SO_PATH": lambda: getenv(
"VLLM_CUDART_SO_PATH", None
),
"VLLM_DP_RANK": lambda: int(
getenv("VLLM_DP_RANK", "0")
),
"VLLM_DP_RANK_LOCAL": lambda: int(
getenv("VLLM_DP_RANK_LOCAL", VLLM_DP_RANK)
),
"VLLM_DP_SIZE": lambda: int(
getenv("VLLM_DP_SIZE", "1")
),
"VLLM_DP_MASTER_IP": lambda: getenv(
"VLLM_DP_MASTER_IP", "127.0.0.1"
),
"VLLM_DP_MASTER_PORT": lambda: int(
getenv("VLLM_DP_MASTER_PORT", "0")
),
"VLLM_MOE_DP_CHUNK_SIZE": lambda: int(
getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")
),
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: get(
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
)
== "1",
"VLLM_CI_USE_S3": lambda: get("VLLM_CI_USE_S3", "0")
== "1",
"VLLM_MODEL_REDIRECT_PATH": lambda: get(
"VLLM_MODEL_REDIRECT_PATH", None
),
"VLLM_MARLIN_USE_ATOMIC_ADD": lambda: get(
"VLLM_MARLIN_USE_ATOMIC_ADD", "0"
)
== "1",
"VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
get("VLLM_MXFP4_USE_MARLIN", None)
),
"VLLM_V0_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V0_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_V1_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V1_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
environ["VLLM_TPU_BUCKET_PADDING_GAP"]
)
if "VLLM_TPU_BUCKET_PADDING_GAP" in environ
else 0,
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
get("VLLM_TPU_MOST_MODEL_LEN", None)
),
"VLLM_TPU_USING_PATHWAYS": lambda: bool(
"proxy" in lower()
),
"VLLM_USE_DEEP_GEMM": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM", "1"))
),
"VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
),
"VLLM_SKIP_DEEP_GEMM_WARMUP": lambda: bool(
int(getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))
),
"VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool(
int(getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))
),
"VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
),
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
),
"VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
),
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
int(
getenv(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"
)
)
),
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
int(
getenv(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
"0",
)
)
),
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
int(
getenv(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"
)
)
),
"VLLM_XGRAMMAR_CACHE_MB": lambda: int(
getenv("VLLM_XGRAMMAR_CACHE_MB", "512")
),
"VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
),
"VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
int(
getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
)
),
"VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: getenv(
"VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
),
"VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5600")
),
"VLLM_ALL2ALL_BACKEND": env_with_choices(
"VLLM_ALL2ALL_BACKEND",
"allgather_reducescatter",
[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
"flashinfer_all2allv",
],
),
"VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
"VLLM_FLASHINFER_MOE_BACKEND",
"throughput",
["throughput", "latency"],
),
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
getenv(
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840"
)
),
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: loads(
getenv(
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB",
"{}",
)
),
"VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: lower(),
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
),
"VLLM_SLEEP_WHEN_IDLE": lambda: bool(
int(getenv("VLLM_SLEEP_WHEN_IDLE", "0"))
),
"VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
),
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")
),
"VLLM_KV_CACHE_LAYOUT": env_with_choices(
"VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]
),
"VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
int(getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
),
"VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
int(getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
),
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
),
"VLLM_USE_CUDNN_PREFILL": lambda: bool(
int(getenv("VLLM_USE_CUDNN_PREFILL", "0"))
),
"VLLM_USE_TRTLLM_ATTENTION": lambda: None
if "VLLM_USE_TRTLLM_ATTENTION" not in environ
else lower() in ("1", "true"),
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
int(
getenv(
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
"0",
)
)
),
"VLLM_HAS_FLASHINFER_CUBIN": lambda: getenv(
"VLLM_HAS_FLASHINFER_CUBIN", False
),
"VLLM_USE_TRTLLM_FP4_GEMM": lambda: bool(
int(getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))
),
"VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
int(getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
),
"VLLM_DISABLE_PAD_FOR_CUDAGRAPH": lambda: bool(
int(getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))
),
"VLLM_LOOPBACK_IP": lambda: getenv(
"VLLM_LOOPBACK_IP", ""
),
"VLLM_PROCESS_NAME_PREFIX": lambda: getenv(
"VLLM_PROCESS_NAME_PREFIX", "VLLM"
),
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
int(
getenv(
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE",
"0",
)
)
),
"VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(
int(getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))
),
"VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool(
int(getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))
),
"VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
int(getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
),
"VLLM_TUNED_CONFIG_FOLDER": lambda: getenv(
"VLLM_TUNED_CONFIG_FOLDER", None
),
"VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool(
int(
getenv(
"VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS",
"0",
)
)
),
"VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
int(getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
),
"VLLM_NVTX_SCOPES_FOR_PROFILING": lambda: bool(
int(getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0"))
),
"VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool(
int(
getenv(
"VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"
)
)
),
"VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: getenv(
"VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME",
"VLLM_OBJECT_STORAGE_SHM_BUFFER",
),
"VLLM_DEEPEP_BUFFER_SIZE_MB": lambda: int(
getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
),
"VLLM_DBO_COMM_SMS": lambda: int(
getenv("VLLM_DBO_COMM_SMS", "20")
),
"GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_list_with_choices(
"GPT_OSS_SYSTEM_TOOL_MCP_LABELS",
[],
[
"container",
"code_interpreter",
"web_search_preview",
],
),
"VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE": lambda: bool(
int(
getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1")
)
),
"VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool(
int(
getenv(
"VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
"1",
)
)
),
"VLLM_USE_NCCL_SYMM_MEM": lambda: bool(
int(getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))
),
"VLLM_NCCL_INCLUDE_PATH": lambda: get(
"VLLM_NCCL_INCLUDE_PATH", None
),
"VLLM_USE_FBGEMM": lambda: bool(
int(getenv("VLLM_USE_FBGEMM", "0"))
),
"VLLM_GC_DEBUG": lambda: getenv("VLLM_GC_DEBUG", ""),
}
__dir__ ¶
compute_hash ¶
compute_hash() -> str
WARNING: Whenever a new key is added to these environment variables, ensure that it is included in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION will generate different computation graphs, so it is included in the factors list. Environment variables that affect the choice of kernels or attention backends should also be included in the factors list.
Source code in vllm/envs.py
1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 |
|
env_list_with_choices ¶
env_list_with_choices(
env_name: str,
default: list[str],
choices: Union[list[str], Callable[[], list[str]]],
case_sensitive: bool = True,
) -> Callable[[], list[str]]
Create a lambda that validates an environment variable containing comma-separated values against the allowed choices.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
env_name | str | Name of the environment variable | required |
default | list[str] | Default list of values if not set | required |
choices | Union[list[str], Callable[[], list[str]]] | List of valid string options or callable that returns list | required |
case_sensitive | bool | Whether validation should be case sensitive | True |
Returns:
Type | Description |
---|---|
Callable[[], list[str]] | Lambda function for the environment_variables dict that returns a list of strings |
Source code in vllm/envs.py
env_with_choices ¶
env_with_choices(
env_name: str,
default: Optional[str],
choices: Union[list[str], Callable[[], list[str]]],
case_sensitive: bool = True,
) -> Callable[[], Optional[str]]
Create a lambda that validates an environment variable against the allowed choices.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
env_name | str | Name of the environment variable | required |
default | Optional[str] | Default value if not set (can be None) | required |
choices | Union[list[str], Callable[[], list[str]]] | List of valid string options or callable that returns list | required |
case_sensitive | bool | Whether validation should be case sensitive | True |
Returns:
Type | Description |
---|---|
Callable[[], Optional[str]] | Lambda function for environment_variables dict |
Source code in vllm/envs.py
get_default_cache_root ¶
get_default_config_root ¶
get_vllm_port ¶
Get the port from VLLM_PORT environment variable.
Returns:
Type | Description |
---|---|
Optional[int] | The port number as an integer if VLLM_PORT is set, None otherwise. |
Raises:
Type | Description |
---|---|
ValueError | If VLLM_PORT is a URI, which suggests a Kubernetes (k8s) service discovery issue. |