# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
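
# Placeholders of the form ${...} are template variables, not valid pbtxt;
# they must be substituted before the model is loaded. The tensorrtllm_backend
# repo ships tools/fill_template.py for this purpose. For example
# (hypothetical values):
#   python3 tools/fill_template.py -i config.pbtxt \
#       triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True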
name: "tensorrt_llm"
backend: "${triton_backend}"
max_batch_size: ${triton_max_batch_size}
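
# Decoupled mode must be enabled (True) for responses to be streamed back
# token by token; the "streaming" input below only takes effect when the
# transaction policy is decoupled.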
model_transaction_policy {
  decoupled: ${decoupled_mode}
}

dynamic_batching {
  preferred_batch_size: [ ${triton_max_batch_size} ]
  max_queue_delay_microseconds: ${max_queue_delay_microseconds}
  default_queue_policy: { max_queue_size: ${max_queue_size} }
}

input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
    allow_ragged_batch: true
    optional: true
  },
  {
    name: "encoder_input_features"
    data_type: ${encoder_input_features_data_type}
    dims: [ -1, -1 ]
    allow_ragged_batch: true
    optional: true
  },
  {
    name: "encoder_output_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "num_return_sequences"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "draft_input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "decoder_input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "decoder_input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
    reshape: { shape: [ ] }
  },
  {
    name: "draft_logits"
    data_type: ${logits_datatype}
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "draft_acceptance_threshold"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
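  # stop_words_list and bad_words_list typically use the two-row ragged
  # encoding: row 0 holds the concatenated token ids of all words, row 1 holds
  # the cumulative end offset of each word, padded with -1. For example, under
  # this encoding the words [[1, 2], [3]] would be sent as
  # [[1, 2, 3], [2, 3, -1]].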
  {
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "embedding_bias"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p_min"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p_decay"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p_reset_ids"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "early_stopping"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_perf_metrics"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "exclude_input_in_output"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "stop"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "streaming"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "prompt_table_extra_ids"
    data_type: TYPE_UINT64
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  # cross_attention_mask shape `[bs, seq_len, num_images*num_tiles]`
  {
    name: "cross_attention_mask"
    data_type: TYPE_BOOL
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  # M-RoPE parameters, used when mrope (multimodal rotary position embedding) is enabled
  {
    name: "mrope_rotary_cos_sin"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "mrope_position_deltas"
    data_type: TYPE_INT64
    dims: [ 1 ]
    optional: true
  },
  # The unique task ID for the given LoRA.
  # To perform inference with a specific LoRA for the first time, `lora_task_id`,
  # `lora_weights`, and `lora_config` must all be given. The LoRA will be cached,
  # so that subsequent requests for the same task only require `lora_task_id`.
  # If the cache is full, the oldest LoRA will be evicted to make space for new
  # ones. An error is returned if `lora_task_id` is not cached.
  {
    name: "lora_task_id"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  # Weights for a LoRA adapter, shape [ num_lora_modules_layers, D x Hi + Ho x D ],
  # where the last dimension holds the in / out adapter weights for the associated
  # module (e.g. attn_qkv) and model layer.
  # Each of the in / out tensors is first flattened and then concatenated together
  # in the format above. D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
  {
    name: "lora_weights"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  # Module identifier (same size as the first dimension of lora_weights).
  # See LoraModule::ModuleType for the module id mapping:
  #
  # "attn_qkv": 0     # combined qkv adapter
  # "attn_q": 1       # q adapter
  # "attn_k": 2       # k adapter
  # "attn_v": 3       # v adapter
  # "attn_dense": 4   # adapter for the dense layer in attention
  # "mlp_h_to_4h": 5  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: up projection
  # "mlp_4h_to_h": 6  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: down projection
  # "mlp_gate": 7     # for llama2, adapter for the gated mlp layer after attention / RMSNorm: gate
  #
  # The last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ].
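  # For example (hypothetical values), a rank-8 adapter applied to attn_qkv in
  # layers 0 and 1 would be described by lora_config = [ [0, 0, 8], [0, 1, 8] ].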
  {
    name: "lora_config"
    data_type: TYPE_INT32
    dims: [ -1, 3 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "context_phase_params"
    data_type: TYPE_UINT8
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  # skip_cross_attn_blocks shape `[bs, 1]`, only used in mllama
  {
    name: "skip_cross_attn_blocks"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "retention_token_range_starts"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "retention_token_range_ends"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "retention_token_range_priorities"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "retention_token_range_durations_ms"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "retention_decode_priority"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "retention_decode_duration_ms"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "guided_decoding_guide_type"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "guided_decoding_guide"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "lookahead_window_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "lookahead_ngram_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "lookahead_verification_set_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
    allow_ragged_batch: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: ${logits_datatype}
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: ${logits_datatype}
    dims: [ -1, -1, -1 ]
  },
  {
    name: "batch_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "sequence_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "context_phase_params"
    data_type: TYPE_UINT8
    dims: [ -1 ]
  },
  {
    name: "kv_cache_alloc_new_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_reused_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_alloc_total_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "arrival_time_ns"
    data_type: TYPE_INT64
    dims: [ 1 ]
  },
  {
    name: "first_scheduled_time_ns"
    data_type: TYPE_INT64
    dims: [ 1 ]
  },
  {
    name: "first_token_time_ns"
    data_type: TYPE_INT64
    dims: [ 1 ]
  },
  {
    name: "last_token_time_ns"
    data_type: TYPE_INT64
    dims: [ 1 ]
  },
  {
    name: "acceptance_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
  },
  {
    name: "total_accepted_draft_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "total_draft_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]
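# The backend manages GPU placement itself (see the gpu_device_ids parameter
# below), so the model instance is declared as KIND_CPU.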
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
parameters: {
  key: "max_beam_width"
  value: {
    string_value: "${max_beam_width}"
  }
}
parameters: {
  key: "FORCE_CPU_ONLY_INPUT_TENSORS"
  value: {
    string_value: "no"
  }
}
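# ${batching_strategy} is typically "inflight_fused_batching" (in-flight
# batching) or "V1" (static batching).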
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "${batching_strategy}"
  }
}
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "${engine_dir}"
  }
}
parameters: {
  key: "encoder_model_path"
  value: {
    string_value: "${encoder_engine_dir}"
  }
}
parameters: {
  key: "max_tokens_in_paged_kv_cache"
  value: {
    string_value: "${max_tokens_in_paged_kv_cache}"
  }
}
parameters: {
  key: "max_attention_window_size"
  value: {
    string_value: "${max_attention_window_size}"
  }
}
parameters: {
  key: "sink_token_length"
  value: {
    string_value: "${sink_token_length}"
  }
}
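# ${batch_scheduler_policy} is typically "max_utilization" or
# "guaranteed_no_evict".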
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "${batch_scheduler_policy}"
  }
}
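# Fraction of free GPU memory reserved for the KV cache after the engine is
# loaded, e.g. "0.9".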
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "${kv_cache_free_gpu_mem_fraction}"
  }
}
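# Fraction of the KV cache reserved for cross attention; only relevant for
# encoder-decoder models.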
parameters: {
  key: "cross_kv_cache_fraction"
  value: {
    string_value: "${cross_kv_cache_fraction}"
  }
}
parameters: {
  key: "kv_cache_host_memory_bytes"
  value: {
    string_value: "${kv_cache_host_memory_bytes}"
  }
}
# kv_cache_onboard_blocks is for internal implementation.
parameters: {
  key: "kv_cache_onboard_blocks"
  value: {
    string_value: "${kv_cache_onboard_blocks}"
  }
}
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
# parameters: {
#   key: "enable_trt_overlap"
#   value: {
#     string_value: "${enable_trt_overlap}"
#   }
# }
parameters: {
  key: "exclude_input_in_output"
  value: {
    string_value: "${exclude_input_in_output}"
  }
}
parameters: {
  key: "cancellation_check_period_ms"
  value: {
    string_value: "${cancellation_check_period_ms}"
  }
}
parameters: {
  key: "stats_check_period_ms"
  value: {
    string_value: "${stats_check_period_ms}"
  }
}
parameters: {
  key: "iter_stats_max_iterations"
  value: {
    string_value: "${iter_stats_max_iterations}"
  }
}
parameters: {
  key: "request_stats_max_iterations"
  value: {
    string_value: "${request_stats_max_iterations}"
  }
}
parameters: {
  key: "enable_kv_cache_reuse"
  value: {
    string_value: "${enable_kv_cache_reuse}"
  }
}
parameters: {
  key: "normalize_log_probs"
  value: {
    string_value: "${normalize_log_probs}"
  }
}
parameters: {
  key: "enable_chunked_context"
  value: {
    string_value: "${enable_chunked_context}"
  }
}
parameters: {
  key: "gpu_device_ids"
  value: {
    string_value: "${gpu_device_ids}"
  }
}
parameters: {
  key: "participant_ids"
  value: {
    string_value: "${participant_ids}"
  }
}
parameters: {
  key: "lora_cache_optimal_adapter_size"
  value: {
    string_value: "${lora_cache_optimal_adapter_size}"
  }
}
parameters: {
  key: "lora_cache_max_adapter_size"
  value: {
    string_value: "${lora_cache_max_adapter_size}"
  }
}
parameters: {
  key: "lora_cache_gpu_memory_fraction"
  value: {
    string_value: "${lora_cache_gpu_memory_fraction}"
  }
}
parameters: {
  key: "lora_cache_host_memory_bytes"
  value: {
    string_value: "${lora_cache_host_memory_bytes}"
  }
}
parameters: {
  key: "lora_prefetch_dir"
  value: {
    string_value: "${lora_prefetch_dir}"
  }
}
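# ${decoding_mode} selects the sampling algorithm, typically one of "top_k",
# "top_p", "top_k_top_p", "beam_search", or "medusa".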
parameters: {
  key: "decoding_mode"
  value: {
    string_value: "${decoding_mode}"
  }
}
parameters: {
  key: "executor_worker_path"
  value: {
    string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
  }
}
parameters: {
  key: "lookahead_window_size"
  value: {
    string_value: "${lookahead_window_size}"
  }
}
parameters: {
  key: "lookahead_ngram_size"
  value: {
    string_value: "${lookahead_ngram_size}"
  }
}
parameters: {
  key: "lookahead_verification_set_size"
  value: {
    string_value: "${lookahead_verification_set_size}"
  }
}
parameters: {
  key: "medusa_choices"
  value: {
    string_value: "${medusa_choices}"
  }
}
parameters: {
  key: "eagle_choices"
  value: {
    string_value: "${eagle_choices}"
  }
}
parameters: {
  key: "gpu_weights_percent"
  value: {
    string_value: "${gpu_weights_percent}"
  }
}
parameters: {
  key: "enable_context_fmha_fp32_acc"
  value: {
    string_value: "${enable_context_fmha_fp32_acc}"
  }
}
parameters: {
  key: "multi_block_mode"
  value: {
    string_value: "${multi_block_mode}"
  }
}
parameters: {
  key: "cuda_graph_mode"
  value: {
    string_value: "${cuda_graph_mode}"
  }
}
parameters: {
  key: "cuda_graph_cache_size"
  value: {
    string_value: "${cuda_graph_cache_size}"
  }
}
parameters: {
  key: "speculative_decoding_fast_logits"
  value: {
    string_value: "${speculative_decoding_fast_logits}"
  }
}
parameters: {
  key: "tokenizer_dir"
  value: {
    string_value: "${tokenizer_dir}"
  }
}
parameters: {
  key: "guided_decoding_backend"
  value: {
    string_value: "${guided_decoding_backend}"
  }
}
parameters: {
  key: "xgrammar_tokenizer_info_path"
  value: {
    string_value: "${xgrammar_tokenizer_info_path}"
  }
}