Skip to content

Commit

Permalink
Add kv_cache_host_memory_bytes as a configurable runtime setting (#1303)
Browse files Browse the repository at this point in the history

* client side validation for non fp8 kv cache and fp8 context fmha

* enable chunked context as default

* add kv_cache_host_memory_bytes to trt-llm runtime config

* fix test

* bump pyproject version
  • Loading branch information
joostinyi authored Jan 9, 2025
1 parent c07100b commit 5f835d0
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "truss"
version = "0.9.58.post1"
version = "0.9.59.rc0"
description = "A seamless bridge from model development to model delivery"
license = "MIT"
readme = "README.md"
Expand Down
1 change: 1 addition & 0 deletions truss/base/trt_llm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class TrussSpecDecMode(str, Enum):

class TrussTRTLLMRuntimeConfiguration(BaseModel):
kv_cache_free_gpu_mem_fraction: float = 0.9
kv_cache_host_memory_bytes: Optional[int] = None
enable_chunked_context: bool = True
batch_scheduler_policy: TrussTRTLLMBatchSchedulerPolicy = (
TrussTRTLLMBatchSchedulerPolicy.GUARANTEED_NO_EVICT
Expand Down
4 changes: 2 additions & 2 deletions truss/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,8 @@ def modify_handle(h: TrussHandle):
},
"runtime": {
"kv_cache_free_gpu_mem_fraction": 0.9,
"enabled_chunked_context": False,
"num_draft_tokens": None,
"kv_cache_host_memory_bytes": 1000,
"enabled_chunked_context": True,
"batch_scheduler_policy": TrussTRTLLMBatchSchedulerPolicy.GUARANTEED_NO_EVICT.value,
},
}
Expand Down
2 changes: 2 additions & 0 deletions truss/tests/trt_llm/test_trt_llm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def test_trt_llm_configuration_init_and_migrate_deprecated_runtime_fields(
trt_llm_config = TRTLLMConfiguration(**deprecated_trtllm_config["trt_llm"])
assert trt_llm_config.runtime.model_dump() == {
"kv_cache_free_gpu_mem_fraction": 0.1,
"kv_cache_host_memory_bytes": None,
"enable_chunked_context": True,
"batch_scheduler_policy": TrussTRTLLMBatchSchedulerPolicy.MAX_UTILIZATION.value,
"request_default_max_tokens": 10,
Expand All @@ -32,6 +33,7 @@ def test_trt_llm_configuration_init_and_migrate_deprecated_runtime_fields_existi
)
assert trt_llm_config.runtime.model_dump() == {
"kv_cache_free_gpu_mem_fraction": 0.1,
"kv_cache_host_memory_bytes": None,
"enable_chunked_context": True,
"batch_scheduler_policy": TrussTRTLLMBatchSchedulerPolicy.MAX_UTILIZATION.value,
"request_default_max_tokens": 10,
Expand Down

0 comments on commit 5f835d0

Please sign in to comment.