checkpoint configuration file

Clone the following GitHub repository into your container. It contains the configuration scripts and the execution scripts for the convert_checkpoint function.

After having used the helper scripts, you can now run the checkpoint.py command

git clone https://github.com/Continuum-Labs-HQ/tensorrt-continuum.git

This will download the following files:

Checkpoint Conversion Configuration YAML File

A run_convert_checkpoint.py script that you execute when you are comfortable with the configuration

YAML Configuration File

Enter your arguments based on the configuration files that ship with the Hugging Face model - the values in this file have to be consistent with them

model:
  # Input and output locations for the conversion; relative paths.
  model_dir: './llama-2-7b-chat-hf'
  output_dir: '../llama-2-7b-chat-hf-output'

  # Data type selection. Choices: float32, bfloat16, float16
  # Suggestions:
  # - float16: better performance with minimal accuracy loss
  # - bfloat16: a balance between performance and accuracy
  # - float32: maximum accuracy but slower performance
  dtype: 'float16'

checkpoint:
  # Parallelism layout.
  # - Keep tp_size and pp_size at 1 when running on a single GPU.
  # - Increase tp_size and/or pp_size to spread the model across multiple GPUs.
  tp_size: 1  # Tensor parallelism size
  pp_size: 1  # Pipeline parallelism size

  # Model architecture values. These describe the model being converted and
  # have to be consistent with its Hugging Face configuration files; change
  # them only when converting a different model.
  # Vocabulary size; depends on the specific tokenizer and model.
  vocab_size: 32000
  # Number of positions (sequence length).
  # NOTE(review): confirm against the model's Hugging Face config.
  n_positions: 2048
  # Model depth (number of layers).
  n_layer: 32
  # Number of attention heads.
  n_head: 32
  # Hidden size.
  n_embd: 4096
  # Intermediate size in the feed-forward layers.
  inter_size: 11008

  # Additional checkpoint arguments
  meta_ckpt_dir: null  # ./path/to/meta/checkpoint
  n_kv_head: null  # 32
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-6` as the string "1e-6", not as a float.
  rms_norm_eps: 1.0e-6
  use_weight_only: false
  disable_weight_only_quant_plugin: false
  weight_only_precision: int8  # Choices: int8, int4, int4_gptq
  smoothquant: null  # 0.5
  per_channel: false
  per_token: false
  int8_kv_cache: false
  ammo_quant_ckpt_path: null  # ./path/to/ammo/quant/checkpoint
  per_group: false
  load_by_shard: false
  hidden_act: silu
  rotary_base: 10000.0
  group_size: 128
  dataset_cache_dir: null  # ./path/to/dataset/cache
  load_model_on_cpu: false
  use_parallel_embedding: false
  embedding_sharding_dim: 0  # Choices: 0, 1
  use_embedding_sharing: false
  workers: 1
  moe_num_experts: 0
  moe_top_k: 0
  moe_tp_mode: 0
  moe_renorm_mode: 1
  save_config_only: false
# NOTE(review): this repeats the top-level `model` key that already appears
# earlier in this file. If both copies end up in one YAML document the key is
# duplicated and most parsers silently keep only the last one - confirm which
# copy is intended.
model:
  # Input and output locations for the conversion; relative paths.
  model_dir: './llama-2-7b-chat-hf'
  output_dir: '../llama-2-7b-chat-hf-output'

  # Data type selection. Choices: float32, bfloat16, float16
  # Suggestions:
  # - float16: better performance with minimal accuracy loss
  # - bfloat16: a balance between performance and accuracy
  # - float32: maximum accuracy but slower performance
  dtype: 'float16'

# NOTE(review): this repeats the top-level `checkpoint` key that already
# appears earlier in this file. If both copies end up in one YAML document the
# key is duplicated and most parsers silently keep only the last one - confirm
# which copy is intended.
checkpoint:
  # Parallelism layout.
  # - Keep tp_size and pp_size at 1 when running on a single GPU.
  # - Increase tp_size and/or pp_size to spread the model across multiple GPUs.
  tp_size: 1  # Tensor parallelism size
  pp_size: 1  # Pipeline parallelism size

  # Model architecture values. These describe the model being converted and
  # have to be consistent with its Hugging Face configuration files; change
  # them only when converting a different model.
  # Vocabulary size; depends on the specific tokenizer and model.
  vocab_size: 32000
  # Number of positions (sequence length).
  # NOTE(review): confirm against the model's Hugging Face config.
  n_positions: 2048
  # Model depth (number of layers).
  n_layer: 32
  # Number of attention heads.
  n_head: 32
  # Hidden size.
  n_embd: 4096
  # Intermediate size in the feed-forward layers.
  inter_size: 11008

  # Additional checkpoint arguments
  meta_ckpt_dir: null  # ./path/to/meta/checkpoint
  n_kv_head: null  # 32
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-6` as the string "1e-6", not as a float.
  rms_norm_eps: 1.0e-6
  use_weight_only: false
  disable_weight_only_quant_plugin: false
  weight_only_precision: int8  # Choices: int8, int4, int4_gptq
  smoothquant: null  # 0.5
  per_channel: false
  per_token: false
  int8_kv_cache: false
  ammo_quant_ckpt_path: null  # ./path/to/ammo/quant/checkpoint
  per_group: false
  load_by_shard: false
  hidden_act: silu
  rotary_base: 10000.0
  group_size: 128
  dataset_cache_dir: null  # ./path/to/dataset/cache
  load_model_on_cpu: false
  use_parallel_embedding: false
  embedding_sharding_dim: 0  # Choices: 0, 1
  use_embedding_sharing: false
  workers: 1
  moe_num_experts: 0
  moe_top_k: 0
  moe_tp_mode: 0
  moe_renorm_mode: 1
  save_config_only: false

Last updated