LICENSE
MANIFEST.in
README.md
pyproject.toml
requirements.txt
setup.py
csrc/activation_kernels.cu
csrc/cache.h
csrc/cache_kernels.cu
csrc/cuda_compat.h
csrc/cuda_utils.h
csrc/cuda_utils_kernels.cu
csrc/custom_all_reduce.cu
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce_test.cu
csrc/dispatch_utils.h
csrc/layernorm_kernels.cu
csrc/moe_align_block_size_kernels.cu
csrc/ops.h
csrc/pos_encoding_kernels.cu
csrc/pybind.cpp
csrc/reduction_utils.cuh
csrc/attention/attention_dtypes.h
csrc/attention/attention_generic.cuh
csrc/attention/attention_kernels.cu
csrc/attention/attention_utils.cuh
csrc/attention/dtype_bfloat16.cuh
csrc/attention/dtype_float16.cuh
csrc/attention/dtype_float32.cuh
csrc/attention/dtype_fp8_e5m2.cuh
csrc/moe/moe_ops.cpp
csrc/moe/moe_ops.h
csrc/moe/topk_softmax_kernels.cu
csrc/punica/LICENSE
csrc/punica/punica_ops.cc
csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu
csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu
csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
csrc/punica/bgmv/bgmv_config.h
csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu
csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu
csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu
csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu
csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
csrc/punica/bgmv/bgmv_impl.cuh
csrc/punica/bgmv/generator.py
csrc/punica/bgmv/vec_dtypes.cuh
csrc/quantization/awq/dequantize.cuh
csrc/quantization/awq/gemm_kernels.cu
csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh
csrc/quantization/gptq/compat.cuh
csrc/quantization/gptq/matrix_view.cuh
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/qdq_4.cuh
csrc/quantization/gptq/qdq_util.cuh
csrc/quantization/squeezellm/quant_cuda_kernel.cu
tests/test_regression.py
tests/test_sampling_params.py
tests/lora/__init__.py
tests/lora/conftest.py
tests/lora/test_layers.py
tests/lora/test_llama.py
tests/lora/test_lora.py
tests/lora/test_lora_manager.py
tests/lora/test_punica.py
tests/lora/test_tokenizer.py
tests/lora/test_utils.py
tests/lora/test_worker.py
tests/lora/utils.py
tests/worker/__init__.py
tests/worker/test_model_runner.py
tests/worker/spec_decode/__init__.py
tests/worker/spec_decode/test_multi_step_worker.py
tests/worker/spec_decode/utils.py
vllm/__init__.py
vllm/block.py
vllm/config.py
vllm/logger.py
vllm/outputs.py
vllm/prefix.py
vllm/py.typed
vllm/sampling_params.py
vllm/sequence.py
vllm/test_utils.py
vllm/utils.py
vllm.egg-info/PKG-INFO
vllm.egg-info/SOURCES.txt
vllm.egg-info/dependency_links.txt
vllm.egg-info/requires.txt
vllm.egg-info/top_level.txt
vllm/core/__init__.py
vllm/core/block_manager.py
vllm/core/policy.py
vllm/core/scheduler.py
vllm/engine/__init__.py
vllm/engine/arg_utils.py
vllm/engine/async_llm_engine.py
vllm/engine/llm_engine.py
vllm/engine/metrics.py
vllm/engine/ray_utils.py
vllm/entrypoints/__init__.py
vllm/entrypoints/api_server.py
vllm/entrypoints/llm.py
vllm/entrypoints/openai/__init__.py
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_engine.py
vllm/lora/__init__.py
vllm/lora/layers.py
vllm/lora/lora.py
vllm/lora/models.py
vllm/lora/punica.py
vllm/lora/request.py
vllm/lora/utils.py
vllm/lora/worker_manager.py
vllm/model_executor/__init__.py
vllm/model_executor/input_metadata.py
vllm/model_executor/model_loader.py
vllm/model_executor/sampling_metadata.py
vllm/model_executor/utils.py
vllm/model_executor/weight_utils.py
vllm/model_executor/layers/__init__.py
vllm/model_executor/layers/activation.py
vllm/model_executor/layers/attention.py
vllm/model_executor/layers/fused_moe.py
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/rejection_sampler.py
vllm/model_executor/layers/rotary_embedding.py
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/base_config.py
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/squeezellm.py
vllm/model_executor/layers/triton_kernel/__init__.py
vllm/model_executor/layers/triton_kernel/prefix_prefill.py
vllm/model_executor/models/__init__.py
vllm/model_executor/models/aquila.py
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/bloom.py
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/decilm.py
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/falcon.py
vllm/model_executor/models/gpt2.py
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/internlm.py
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/llama.py
vllm/model_executor/models/mistral.py
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mpt.py
vllm/model_executor/models/opt.py
vllm/model_executor/models/phi.py
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/yi.py
vllm/model_executor/parallel_utils/__init__.py
vllm/model_executor/parallel_utils/communication_op.py
vllm/model_executor/parallel_utils/custom_all_reduce.py
vllm/model_executor/parallel_utils/parallel_state.py
vllm/model_executor/parallel_utils/utils.py
vllm/transformers_utils/__init__.py
vllm/transformers_utils/config.py
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/aquila.py
vllm/transformers_utils/configs/baichuan.py
vllm/transformers_utils/configs/chatglm.py
vllm/transformers_utils/configs/falcon.py
vllm/transformers_utils/configs/mpt.py
vllm/transformers_utils/configs/qwen.py
vllm/transformers_utils/configs/yi.py
vllm/transformers_utils/tokenizers/__init__.py
vllm/transformers_utils/tokenizers/baichuan.py
vllm/worker/__init__.py
vllm/worker/cache_engine.py
vllm/worker/model_runner.py
vllm/worker/worker.py