工作常用命令记录--sglang
sglang操作记录

python -m sglang.launch_server \
  --model-path Qwen/Qwen3-8B \
  --speculative-algorithm DFLASH \
  --speculative-draft-model-path z-lab/Qwen3-8B-DFlash-b16 \
  --speculative-num-draft-tokens 16 \
  --tp-size 1 \
  --attention-backend flashinfer \
  --mem-fraction-static 0.75 \
  --trust-remote-code

CUDA_VISIBLE_DEVICES=3 vllm serve /root/models/Qwen3-8B \
  --speculative-config '{"method": "qwen3_next_mtp", "model": "/root/models/Qwen3-8B-DFlash-b16", "num_speculative_tokens": 15}' \
  --max-num-batched-tokens 32768

CUDA_VISIBLE_DEVICES=3 VLLM_USE_V1=0 vllm serve /root/models/Qwen3-8B \
  --port 8188 \
  --served-model-name qwen3 \
  --tool-call-parser hermes \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --reasoning-parser deepseek_r1 \
  --gpu-memory-utilization 0.6 \
  --enable-prefix-caching \
  --kv-cache-dtype fp8 \
  --speculative-config '{"method": "dflash", "model": "/root/models/Qwen3-8B-DFlash-b16", "num_speculative_tokens": 15}'

CUDA_VISIBLE_DEVICES=3 python -m sglang.launch_server --model-path /root/models/Qwen3-8B --reasoning-parser qwen3

CUDA_VISIBLE_DEVICES=3 python -m sglang.launch_server \
  --model-path /root/models/Qwen3-8B \
  --speculative-algorithm DFLASH \
  --speculative-draft-model-path /root/models/Qwen3-8B-DFlash-b16 \
  --tp-size 1 \
  --dtype bfloat16