Checkpoint Configuration File
Download the following GitHub repository into your container. It contains the configuration file and the execution script for the convert_checkpoint step; once you are happy with the configuration, the execution script runs the checkpoint conversion for you.
git clone https://github.com/Continuum-Labs-HQ/tensorrt-continuum.git
This will download the following files:
Checkpoint Conversion Configuration YAML File
A run_convert_checkpoint.py script that you execute when you are comfortable with the configuration
YAML Configuration File
Enter your arguments based on the model's Hugging Face configuration files (for example, config.json in the model directory); the values here must stay consistent with them. A sanity-check sketch follows the YAML file below.
model:
  model_dir: ./llama-2-7b-chat-hf
  output_dir: ../llama-2-7b-chat-hf-output
  dtype: float16 # Choices: float32, bfloat16, float16
  # Suggestions:
  # - Use float16 for better performance with minimal accuracy loss
  # - Use bfloat16 for a balance between performance and accuracy
  # - Use float32 for maximum accuracy but slower performance

checkpoint:
  tp_size: 1 # Tensor parallelism size
  pp_size: 1 # Pipeline parallelism size
  # Suggestions:
  # - Increase tp_size and pp_size to shard the model across multiple GPUs
  # - Keep tp_size and pp_size at 1 for single-GPU inference
  vocab_size: 32000
  # Suggestions:
  # - Set vocab_size to match the model's tokenizer (32000 for Llama-2)
  n_positions: 2048
  # Suggestions:
  # - Increase n_positions for longer sequence lengths
  # - Decrease n_positions for shorter sequence lengths to save memory
  n_layer: 32
  # Suggestions:
  # - Set n_layer to the number of transformer layers in the source model
  n_head: 32
  # Suggestions:
  # - Set n_head to the number of attention heads in the source model
  n_embd: 4096
  # Suggestions:
  # - Set n_embd to the hidden size of the source model
  inter_size: 11008
  # Suggestions:
  # - Set inter_size to the intermediate (feed-forward) size of the source model
  # Additional checkpoint arguments
  meta_ckpt_dir: null # ./path/to/meta/checkpoint
  n_kv_head: null # 32
  rms_norm_eps: 1e-6
  use_weight_only: false
  disable_weight_only_quant_plugin: false
  weight_only_precision: int8 # Choices: int8, int4, int4_gptq
  smoothquant: null # 0.5
  per_channel: false
  per_token: false
  int8_kv_cache: false
  ammo_quant_ckpt_path: null # ./path/to/ammo/quant/checkpoint
  per_group: false
  load_by_shard: false
  hidden_act: silu
  rotary_base: 10000.0
  group_size: 128
  dataset_cache_dir: null # ./path/to/dataset/cache
  load_model_on_cpu: false
  use_parallel_embedding: false
  embedding_sharding_dim: 0 # Choices: 0, 1
  use_embedding_sharing: false
  workers: 1
  moe_num_experts: 0
  moe_top_k: 0
  moe_tp_mode: 0
  moe_renorm_mode: 1
  save_config_only: false
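Because the architecture values above must match the source model, it is worth cross-checking them against the Hugging Face config.json before converting. The sketch below is one way to do that; the YAML filename is a placeholder for whatever the cloned repository uses, and the Hugging Face field names are the standard Llama ones (num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, max_position_embeddings, rope_theta).

# Hypothetical sanity check: compare the YAML values against the model's config.json
import json
import yaml  # requires pyyaml

# Map the YAML keys above to the corresponding Hugging Face Llama config fields
FIELD_MAP = {
    "vocab_size": "vocab_size",
    "n_positions": "max_position_embeddings",
    "n_layer": "num_hidden_layers",
    "n_head": "num_attention_heads",
    "n_embd": "hidden_size",
    "inter_size": "intermediate_size",
    "rotary_base": "rope_theta",
}

with open("convert_checkpoint_config.yaml") as f:  # placeholder filename
    cfg = yaml.safe_load(f)

with open(f"{cfg['model']['model_dir']}/config.json") as f:
    hf_cfg = json.load(f)

for yaml_key, hf_key in FIELD_MAP.items():
    yaml_val = cfg["checkpoint"].get(yaml_key)
    hf_val = hf_cfg.get(hf_key)
    status = "OK" if yaml_val == hf_val else "MISMATCH"
    print(f"{status:8} {yaml_key}={yaml_val}  <->  {hf_key}={hf_val}")

If any line reports MISMATCH, correct the YAML (or the model_dir path) before running the conversion.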
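When the configuration looks right, run_convert_checkpoint.py launches the conversion. As a rough illustration of what such a runner does, the sketch below loads the YAML and forwards each non-null value as a command-line flag, assuming the flag names mirror the keys above (as in TensorRT-LLM's examples/llama/convert_checkpoint.py); the script path and YAML filename are assumptions, so check them against the cloned repository rather than treating this as its exact implementation.

# Illustrative runner: build a convert_checkpoint.py call from the YAML above.
# Booleans become bare flags, null values are skipped; paths are placeholders.
import subprocess
import yaml  # requires pyyaml

with open("convert_checkpoint_config.yaml") as f:  # placeholder filename
    cfg = yaml.safe_load(f)

args = {**cfg["model"], **cfg["checkpoint"]}

cmd = ["python", "convert_checkpoint.py"]  # path to the TensorRT-LLM llama example script
for key, value in args.items():
    if value is None:
        continue  # optional argument left unset
    if isinstance(value, bool):
        if value:
            cmd.append(f"--{key}")  # store_true-style flag
        continue
    cmd.extend([f"--{key}", str(value)])

print("Running:", " ".join(cmd))
subprocess.run(cmd, check=True)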