model:
  model_dir: ./llama-2-7b-chat-hf  # Path to the pretrained model directory
  output_dir: ./llama-2-7b-chat-engine  # Path to save the built engine
  dtype: float16  # Data type for the model (choices: float32, float16, bfloat16)

checkpoint:
  checkpoint_dir: ../llama-2-7b-chat-hf-output  # Path to the TensorRT-LLM checkpoint directory
  tp_size: 1  # Tensor parallelism size, increase for multi-GPU tensor parallelism
  pp_size: 1  # Pipeline parallelism size, increase for multi-GPU pipeline parallelism
  vocab_size: 32000  # Vocabulary size of the model
  n_positions: 4096  # Maximum number of positions (sequence length)
  n_layer: 32  # Number of layers in the model
  n_head: 32  # Number of attention heads
  n_embd: 4096  # Hidden size of the model
  inter_size: 11008  # Intermediate size of the model's feed-forward layers
  #meta_ckpt_dir:  # Path to the meta checkpoint directory
  n_kv_head: 32  # Number of key-value heads (defaults to n_head if not specified)
  rms_norm_eps: 1e-5  # Epsilon value for RMS normalization
  bos_token_id: 1  # Beginning of sequence token ID
  eos_token_id: 2  # End of sequence token ID
  tie_word_embeddings: false  # Tie the word embeddings
  use_cache: true  # Enable caching for faster generation
  torch_dtype: float16  # PyTorch data type for the model
  #use_weight_only: false  # Enable weight-only quantization
  #weight_only_precision: int8  # Precision for weight-only quantization (choices: int8, int4)
  #smoothquant: 0.5  # Smoothquant parameter for quantization
  #per_channel: false  # Enable per-channel quantization
  #per_token: false  # Enable per-token quantization
  #int8_kv_cache: false  # Enable int8 quantization for key-value cache
  #ammo_quant_ckpt_path:  # Path to the quantized checkpoint file in .npz format
  #per_group: false  # Enable per-group quantization for GPTQ/AWQ quantization
  #load_by_shard: false  # Load the pretrained model shard-by-shard
  hidden_act: silu  # Activation function used in the model (default: silu)
  #group_size: 128  # Group size used in GPTQ quantization
  #dataset_cache_dir:  # Path to the dataset cache directory
  #load_model_on_cpu: false  # Load the model on CPU
  #use_parallel_embedding: false  # Enable embedding parallelism
  #embedding_sharding_dim: 0  # Dimension for embedding sharding (choices: 0, 1)
  #use_embedding_sharing: false  # Enable embedding sharing to reduce engine size
  #workers: 1  # Number of workers for parallel checkpoint conversion
  #moe_num_experts: 0  # Number of experts for Mixture of Experts (MoE) layers
  #moe_top_k: 0  # Top-k value for MoE layers (defaults to 1 if moe_num_experts is set)
  #moe_tp_mode: 0  # Parallelism mode for distributing MoE experts in tensor parallelism
  #moe_renorm_mode: 1  # Renormalization mode for MoE gate logits
  #save_config_only: false  # Only save the model configuration without building the engine
  #disable_weight_only_quant_plugin: false  # Disable the weight-only quantization plugin

build:
  max_input_len: 256  # Maximum input sequence length
  max_output_len: 256  # Maximum output sequence length
  max_batch_size: 8  # Maximum batch size
  max_beam_width: 1  # Maximum beam width for beam search
  #max_num_tokens:  # Maximum number of tokens to generate
  #opt_num_tokens:  # Optimal number of tokens to generate
  max_prompt_embedding_table_size: 0  # Maximum size of the prompt embedding table
  gather_context_logits: false  # Gather context logits during generation
  gather_generation_logits: false  # Gather generation logits during generation
  strongly_typed: false  # Enable strongly typed network definition
  #builder_opt:  # Builder optimization level
  profiling_verbosity: layer_names_only  # Profiling verbosity level (choices: layer_names_only, detailed, none)
  enable_debug_output: false  # Enable debug output
  max_draft_len: 0  # Maximum draft length for Medusa-style generation
  use_refit: false  # Enable engine refitting
  #input_timing_cache:  # Path to the input timing cache file
  #output_timing_cache:  # Path to save the output timing cache file
  lora_config:  # Configuration for LoRA (Low-Rank Adaptation)
    #lora_dir:  # Path to the LoRA checkpoint directory
    #lora_target_modules:  # Target modules for LoRA adaptation
    #lora_ckpt_source: hf  # Source of LoRA checkpoints (choices: hf, nemo)
    #max_lora_rank: 4  # Maximum rank for LoRA adaptation
  auto_parallel_config:  # Configuration for automatic parallelization
    #enabled: false  # Enable automatic parallelization
    #tp_size: 1  # Tensor parallelism size for automatic parallelization
    #pp_size: 1  # Pipeline parallelism size for automatic parallelization
    #max_memory_MB: 80000  # Maximum memory in MB for automatic parallelization
    #max_dram_memory_MB: 30000  # Maximum DRAM memory in MB for automatic parallelization
    #compile_max_memory_MB: 17000  # Maximum memory in MB for compilation during automatic parallelization
    #compile_max_dram_memory_MB: 8000  # Maximum DRAM memory in MB for compilation during automatic parallelization
    #debug_mode: false  # Enable debug mode for automatic parallelization
  weight_sparsity: false  # Enable weight sparsity
  plugin_config:  # Configuration for plugins
    #use_custom_all_reduce: false  # Use custom all-reduce plugin
    #use_fp8_all_reduce: false  # Use FP8 all-reduce plugin
    #use_fp8_cast_plugin: false  # Use FP8 cast plugin
    #use_async_malloc: false  # Use asynchronous memory allocation plugin
    #use_paged_context_fmha: false  # Use paged context fused multi-head attention plugin
    #use_fp8_context_fmha: false  # Use FP8 context fused multi-head attention plugin
    #lora_plugin:  # Configuration for LoRA plugin
      #type:  # Type of LoRA plugin
  max_encoder_input_len: 1024  # Maximum encoder input sequence length for encoder-decoder models
  use_fused_mlp: false  # Use fused MLP layers
  dry_run: false  # Perform a dry run without building the engine
  visualize_network: false  # Visualize the network graph
model:
  model_dir: ./path/to/model  # Path to the pretrained model directory
  output_dir: ./path/to/output  # Path to save the built engine
  dtype: float16  # Data type for the model (choices: float32, float16, bfloat16)

checkpoint:
  tp_size: 1  # Tensor parallelism size, increase for multi-GPU tensor parallelism
  pp_size: 1  # Pipeline parallelism size, increase for multi-GPU pipeline parallelism
  vocab_size: 32000  # Vocabulary size of the model
  n_positions: 2048  # Maximum number of positions (sequence length)
  n_layer: 32  # Number of layers in the model
  n_head: 32  # Number of attention heads
  n_embd: 4096  # Hidden size of the model
  inter_size: 11008  # Intermediate size of the model's feed-forward layers
  #meta_ckpt_dir:  # Path to the meta checkpoint directory
  #n_kv_head:  # Number of key-value heads (defaults to n_head if not specified)
  #rms_norm_eps: 1e-6  # Epsilon value for RMS normalization
  #use_weight_only: false  # Enable weight-only quantization
  #weight_only_precision: int8  # Precision for weight-only quantization (choices: int8, int4)
  #smoothquant: 0.5  # Smoothquant parameter for quantization
  #per_channel: false  # Enable per-channel quantization
  #per_token: false  # Enable per-token quantization
  #int8_kv_cache: false  # Enable int8 quantization for key-value cache
  #ammo_quant_ckpt_path:  # Path to the quantized checkpoint file in .npz format
  #per_group: false  # Enable per-group quantization for GPTQ/AWQ quantization
  #load_by_shard: false  # Load the pretrained model shard-by-shard
  #hidden_act: silu  # Activation function used in the model (default: silu)
  #rotary_base: 10000.0  # Base value for rotary positional embeddings
  #group_size: 128  # Group size used in GPTQ quantization
  #dataset_cache_dir:  # Path to the dataset cache directory
  #load_model_on_cpu: false  # Load the model on CPU
  #use_parallel_embedding: false  # Enable embedding parallelism
  #embedding_sharding_dim: 0  # Dimension for embedding sharding (choices: 0, 1)
  #use_embedding_sharing: false  # Enable embedding sharing to reduce engine size
  #workers: 1  # Number of workers for parallel checkpoint conversion
  #moe_num_experts: 0  # Number of experts for Mixture of Experts (MoE) layers
  #moe_top_k: 0  # Top-k value for MoE layers (defaults to 1 if moe_num_experts is set)
  #moe_tp_mode: 0  # Parallelism mode for distributing MoE experts in tensor parallelism
  #moe_renorm_mode: 1  # Renormalization mode for MoE gate logits
  #save_config_only: false  # Only save the model configuration without building the engine
  #disable_weight_only_quant_plugin: false  # Disable the weight-only quantization plugin

build:
  max_input_len: 256  # Maximum input sequence length
  max_output_len: 256  # Maximum output sequence length
  max_batch_size: 8  # Maximum batch size
  max_beam_width: 1  # Maximum beam width for beam search
  #max_num_tokens:  # Maximum number of tokens to generate
  #opt_num_tokens:  # Optimal number of tokens to generate
  max_prompt_embedding_table_size: 0  # Maximum size of the prompt embedding table
  gather_context_logits: false  # Gather context logits during generation
  gather_generation_logits: false  # Gather generation logits during generation
  strongly_typed: false  # Enable strongly typed network definition
  #builder_opt:  # Builder optimization level
  profiling_verbosity: layer_names_only  # Profiling verbosity level (choices: layer_names_only, detailed, none)
  enable_debug_output: false  # Enable debug output
  max_draft_len: 0  # Maximum draft length for Medusa-style generation
  use_refit: false  # Enable engine refitting
  #input_timing_cache:  # Path to the input timing cache file
  #output_timing_cache:  # Path to save the output timing cache file
  lora_config:  # Configuration for LoRA (Low-Rank Adaptation)
    #lora_dir:  # Path to the LoRA checkpoint directory
    #lora_target_modules:  # Target modules for LoRA adaptation
    #lora_ckpt_source: hf  # Source of LoRA checkpoints (choices: hf, nemo)
    #max_lora_rank: 4  # Maximum rank for LoRA adaptation
  auto_parallel_config:  # Configuration for automatic parallelization
    #enabled: false  # Enable automatic parallelization
    #tp_size: 1  # Tensor parallelism size for automatic parallelization
    #pp_size: 1  # Pipeline parallelism size for automatic parallelization
    #max_memory_MB: 80000  # Maximum memory in MB for automatic parallelization
    #max_dram_memory_MB: 30000  # Maximum DRAM memory in MB for automatic parallelization
    #compile_max_memory_MB: 17000  # Maximum memory in MB for compilation during automatic parallelization
    #compile_max_dram_memory_MB: 8000  # Maximum DRAM memory in MB for compilation during automatic parallelization
    #debug_mode: false  # Enable debug mode for automatic parallelization
  weight_sparsity: false  # Enable weight sparsity
  plugin_config:  # Configuration for plugins
    #use_custom_all_reduce: false  # Use custom all-reduce plugin
    #use_fp8_all_reduce: false  # Use FP8 all-reduce plugin
    #use_fp8_cast_plugin: false  # Use FP8 cast plugin
    #use_async_malloc: false  # Use asynchronous memory allocation plugin
    #use_paged_context_fmha: false  # Use paged context fused multi-head attention plugin
    #use_fp8_context_fmha: false  # Use FP8 context fused multi-head attention plugin
    #lora_plugin:  # Configuration for LoRA plugin
      #type:  # Type of LoRA plugin
  max_encoder_input_len: 1024  # Maximum encoder input sequence length for encoder-decoder models
  use_fused_mlp: false  # Use fused MLP layers
  dry_run: false  # Perform a dry run without building the engine
  visualize_network: false  # Visualize the network graph
TensorRT-LLM is a framework that optimises large language models (LLMs) for fast inference using NVIDIA's TensorRT library.
The process of converting a pre-trained LLM into a TensorRT engine involves several steps, including model conversion, quantization, and engine building.
In this document, we will focus on the build process using the tensorrt_llm.build API and the trtllm-build CLI tool.
Overview
The tensorrt_llm.build API is a high-level function that simplifies the process of building a TensorRT engine from a TensorRT-LLM model object.
It replaces the older workflow, which required creating a builder, creating a network object, tracing the model to the network, and building TensorRT engines manually.
The trtllm-build CLI tool is a convenient wrapper around the tensorrt_llm.build API, allowing users to build engines from the command line without writing Python code.
Build Process
Create a TensorRT-LLM Model Object
To build a TensorRT engine, you first need to create a TensorRT-LLM model object.
This object represents the pre-trained LLM that you want to optimise. For example, to create a LLaMA model object, you can use the following code:
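A minimal sketch follows; it assumes the model class is LLaMAForCausalLM in tensorrt_llm.models, and the exact loader method name may differ between TensorRT-LLM releases:

from tensorrt_llm.models import LLaMAForCausalLM

# Load the pre-trained Hugging Face weights and create a TensorRT-LLM model object
llama = LLaMAForCausalLM.from_pretrained("./llama-2-7b-chat-hf")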
The from_pretrained method loads the pre-trained weights and initialises the LLaMA model object.
Configure the Build Settings
Next, you need to create a BuildConfig object to specify the build settings. The BuildConfig class has several important arguments that control the optimisation process, such as:
max_batch_size: The maximum batch size for the engine.
max_input_len: The maximum length of the input sequence.
max_output_len: The maximum length of the output sequence.
max_beam_width: The maximum beam width for beam search.
Note that the engine's numerical precision (e.g., float16, float32) is determined by the dtype chosen when the checkpoint is converted, as shown in the configuration above, rather than by BuildConfig.
Here's an example of creating a BuildConfig object:
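This is a minimal sketch; it assumes BuildConfig is importable from the top-level tensorrt_llm package and accepts the field names used throughout this document (argument names may differ slightly between releases):

from tensorrt_llm import BuildConfig

# Keep the limits small so the example engine is quick to build
build_config = BuildConfig(
    max_input_len=256,    # maximum input sequence length
    max_output_len=256,   # maximum output sequence length
    max_batch_size=8,     # maximum batch size
    max_beam_width=1,     # maximum beam width for beam search
)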
Build the TensorRT Engine
With the TensorRT-LLM model object and the build configuration ready, you can now build the TensorRT engine using the tensorrt_llm.build API:
engine = tensorrt_llm.build(llama, build_config)
The build function takes the model object and the build configuration as input and returns a TensorRT engine object.
Internally, it creates a TensorRT builder, a network object, traces the model to the network, and builds the engine based on the provided configuration.
Save the TensorRT Engine
After building the engine, you can save it to disk for later use:
engine.save("path/to/save/engine")
The save method serializes the engine and saves it to the specified directory.
Using the trtllm-build CLI Tool
The trtllm-build CLI tool provides a convenient way to build TensorRT engines from the command line. It is a thin wrapper around the tensorrt_llm.build API, and its flags closely match the fields of the BuildConfig class.
Here's an example of using the trtllm-build CLI tool:
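For instance, reusing the checkpoint and engine paths from the configuration shown earlier (the flag names mirror the BuildConfig fields and may vary slightly between TensorRT-LLM releases):

trtllm-build --checkpoint_dir ../llama-2-7b-chat-hf-output \
             --output_dir ./llama-2-7b-chat-engine \
             --max_batch_size 8 \
             --max_input_len 256 \
             --max_output_len 256 \
             --max_beam_width 1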
The --checkpoint_dir flag specifies the directory containing the TensorRT-LLM model checkpoint, and the --output_dir flag specifies where to save the built engine. The other flags correspond to the fields of the BuildConfig class.
Building from a Checkpoint
If you have previously saved a TensorRT-LLM model checkpoint to disk and want to build an engine from it later, you can use the from_checkpoint API to deserialize the checkpoint:
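A minimal sketch, again assuming the LLaMAForCausalLM class from tensorrt_llm.models:

from tensorrt_llm.models import LLaMAForCausalLM

# Deserialize a previously converted TensorRT-LLM checkpoint from disk
llama = LLaMAForCausalLM.from_checkpoint("../llama-2-7b-chat-hf-output")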
The from_checkpoint method loads the model weights from the specified checkpoint directory and creates a TensorRT-LLM model object.
You can then use this object with the tensorrt_llm.build API to build the engine as described earlier.
Conclusion
The TensorRT-LLM build engine process simplifies the conversion of pre-trained LLMs into optimised TensorRT engines.
By using the tensorrt_llm.build API or the trtllm-build CLI tool, you can easily configure the build settings and generate high-performance engines for fast inference.
The from_checkpoint API allows you to resume the build process from a previously saved checkpoint, providing flexibility in your workflow.