TensorRT-LLM Build Engine Process
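The engine build is driven by a YAML configuration like the one below. Keys that are commented out are optional; the values shown for them are defaults or representative examples. The command-line sketches after the listing illustrate how these settings map onto TensorRT-LLM's checkpoint-conversion and trtllm-build steps.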
model:
  model_dir: ./path/to/model # Path to the pretrained model directory
  output_dir: ./path/to/output # Path to save the built engine
  dtype: float16 # Data type for the model (choices: float32, float16, bfloat16)
checkpoint:
  tp_size: 1 # Tensor parallelism size, increase for multi-GPU tensor parallelism
  pp_size: 1 # Pipeline parallelism size, increase for multi-GPU pipeline parallelism
  vocab_size: 32000 # Vocabulary size of the model
  n_positions: 2048 # Maximum number of positions (sequence length)
  n_layer: 32 # Number of layers in the model
  n_head: 32 # Number of attention heads
  n_embd: 4096 # Hidden size of the model
  inter_size: 11008 # Intermediate size of the model's feed-forward layers
  #meta_ckpt_dir: # Path to the meta checkpoint directory
  #n_kv_head: # Number of key-value heads (defaults to n_head if not specified)
  #rms_norm_eps: 1e-6 # Epsilon value for RMS normalization
  #use_weight_only: false # Enable weight-only quantization
  #weight_only_precision: int8 # Precision for weight-only quantization (choices: int8, int4)
  #smoothquant: 0.5 # SmoothQuant alpha parameter for quantization
  #per_channel: false # Enable per-channel quantization
  #per_token: false # Enable per-token quantization
  #int8_kv_cache: false # Enable int8 quantization for the key-value cache
  #ammo_quant_ckpt_path: # Path to the quantized checkpoint file in .npz format
  #per_group: false # Enable per-group quantization for GPTQ/AWQ quantization
  #load_by_shard: false # Load the pretrained model shard-by-shard
  #hidden_act: silu # Activation function used in the model (default: silu)
  #rotary_base: 10000.0 # Base value for rotary positional embeddings
  #group_size: 128 # Group size used in GPTQ quantization
  #dataset_cache_dir: # Path to the calibration dataset cache directory
  #load_model_on_cpu: false # Load the model on CPU
  #use_parallel_embedding: false # Enable embedding parallelism
  #embedding_sharding_dim: 0 # Dimension for embedding sharding (choices: 0, 1)
  #use_embedding_sharing: false # Enable embedding sharing to reduce engine size
  #workers: 1 # Number of workers for parallel checkpoint conversion
  #moe_num_experts: 0 # Number of experts for Mixture-of-Experts (MoE) layers
  #moe_top_k: 0 # Top-k value for MoE layers (defaults to 1 if moe_num_experts is set)
  #moe_tp_mode: 0 # Parallelism mode for distributing MoE experts in tensor parallelism
  #moe_renorm_mode: 1 # Renormalization mode for MoE gate logits
  #save_config_only: false # Only save the model configuration without building the engine
  #disable_weight_only_quant_plugin: false # Disable the weight-only quantization plugin
build:
  max_input_len: 256 # Maximum input sequence length
  max_output_len: 256 # Maximum output sequence length
  max_batch_size: 8 # Maximum batch size
  max_beam_width: 1 # Maximum beam width for beam search
  #max_num_tokens: # Maximum number of batched tokens after padding removal
  #opt_num_tokens: # Number of batched tokens to optimize the engine for
  max_prompt_embedding_table_size: 0 # Maximum size of the prompt embedding table (> 0 enables prompt tuning)
  gather_context_logits: false # Gather context logits during generation
  gather_generation_logits: false # Gather generation logits during generation
  strongly_typed: false # Enable a strongly typed network definition
  #builder_opt: # Builder optimization level
  profiling_verbosity: layer_names_only # Profiling verbosity level (choices: layer_names_only, detailed, none)
  enable_debug_output: false # Enable debug output
  max_draft_len: 0 # Maximum draft length for Medusa-style speculative decoding
  use_refit: false # Enable engine refitting
  #input_timing_cache: # Path to the input timing cache file
  #output_timing_cache: # Path to save the output timing cache file
  lora_config: # Configuration for LoRA (Low-Rank Adaptation)
    #lora_dir: # Path to the LoRA checkpoint directory
    #lora_target_modules: # Target modules for LoRA adaptation
    #lora_ckpt_source: hf # Source of LoRA checkpoints (choices: hf, nemo)
    #max_lora_rank: 4 # Maximum rank for LoRA adaptation
  auto_parallel_config: # Configuration for automatic parallelization
    #enabled: false # Enable automatic parallelization
    #tp_size: 1 # Tensor parallelism size for automatic parallelization
    #pp_size: 1 # Pipeline parallelism size for automatic parallelization
    #max_memory_MB: 80000 # Maximum memory in MB for automatic parallelization
    #max_dram_memory_MB: 30000 # Maximum DRAM memory in MB for automatic parallelization
    #compile_max_memory_MB: 17000 # Maximum memory in MB for compilation during automatic parallelization
    #compile_max_dram_memory_MB: 8000 # Maximum DRAM memory in MB for compilation during automatic parallelization
    #debug_mode: false # Enable debug mode for automatic parallelization
  weight_sparsity: false # Enable weight sparsity
  plugin_config: # Configuration for plugins
    #use_custom_all_reduce: false # Use the custom all-reduce plugin
    #use_fp8_all_reduce: false # Use the FP8 all-reduce plugin
    #use_fp8_cast_plugin: false # Use the FP8 cast plugin
    #use_async_malloc: false # Use asynchronous memory allocation
    #use_paged_context_fmha: false # Use the paged context fused multi-head attention plugin
    #use_fp8_context_fmha: false # Use the FP8 context fused multi-head attention plugin
    #lora_plugin: # Configuration for the LoRA plugin
      #type: # Type of the LoRA plugin
  max_encoder_input_len: 1024 # Maximum encoder input sequence length for encoder-decoder models
  use_fused_mlp: false # Use fused MLP layers
  dry_run: false # Perform a dry run without building the engine
  visualize_network: false # Visualize the network graph
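
The model and checkpoint sections drive the first step of the build: converting the pretrained weights into a TensorRT-LLM checkpoint. As a minimal sketch, assuming the convert_checkpoint.py script that ships with TensorRT-LLM's model examples (e.g. examples/llama) and a hypothetical ./path/to/checkpoint staging directory:

  # Convert the pretrained weights into a TensorRT-LLM checkpoint
  python convert_checkpoint.py \
      --model_dir ./path/to/model \
      --output_dir ./path/to/checkpoint \
      --dtype float16 \
      --tp_size 1 \
      --pp_size 1

Increase --tp_size or --pp_size here to shard the checkpoint for multi-GPU tensor or pipeline parallelism.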
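
Quantization settings from the checkpoint section are likewise applied at conversion time. A hedged variant of the same command enabling int8 weight-only quantization (flag names as exposed by the example conversion scripts):

  # Same conversion, with int8 weight-only quantization enabled
  python convert_checkpoint.py \
      --model_dir ./path/to/model \
      --output_dir ./path/to/checkpoint \
      --dtype float16 \
      --use_weight_only \
      --weight_only_precision int8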
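
The build section corresponds to flags of the trtllm-build CLI tool. A sketch using the values above, reading the converted checkpoint from the hypothetical ./path/to/checkpoint directory:

  # Build the TensorRT engine from the converted checkpoint
  trtllm-build \
      --checkpoint_dir ./path/to/checkpoint \
      --output_dir ./path/to/output \
      --max_input_len 256 \
      --max_output_len 256 \
      --max_batch_size 8 \
      --max_beam_width 1

On success, the output directory holds one serialized engine per rank (e.g. rank0.engine) alongside a config.json that records the build settings.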
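
If the lora_config options are set, they translate into LoRA-related build flags. A hedged sketch, assuming the LoRA flag names exposed by trtllm-build and a placeholder ./path/to/lora adapter directory:

  # Build with LoRA support compiled into the engine
  trtllm-build \
      --checkpoint_dir ./path/to/checkpoint \
      --output_dir ./path/to/output \
      --lora_plugin float16 \
      --lora_dir ./path/to/lora \
      --lora_ckpt_source hf \
      --max_lora_rank 4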