Using the models - running LLaMA
To run a TensorRT-LLM LLaMA model using the engines generated by build.py, use the following commands:
# With fp16 inference
python3 ../run.py --max_output_len=50 \
--tokenizer_dir ./tmp/llama/7B/ \
--engine_dir=./tmp/llama/7B/trt_engines/fp16/1-gpu/
# With bf16 inference
python3 ../run.py --max_output_len=50 \
--tokenizer_dir ./tmp/llama/7B/ \
--engine_dir=./tmp/llama/7B/trt_engines/bf16/1-gpu/
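Both invocations above generate text for the script's default prompt. To try your own prompt, run.py also takes an input-text argument; the --input_text flag shown here is assumed to match the version of run.py in your checkout.
# With fp16 inference and a custom prompt (--input_text assumed available)
python3 ../run.py --max_output_len=50 \
--input_text "What is the capital of France?" \
--tokenizer_dir ./tmp/llama/7B/ \
--engine_dir=./tmp/llama/7B/trt_engines/fp16/1-gpu/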
Summarization using the LLaMA model
# Run summarization using the LLaMA 7B model in FP16.
python ../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/fp16/1-gpu/
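To sanity-check the TensorRT-LLM results against a Hugging Face baseline, summarize.py can also score the HF model directly; the --test_hf flag below is assumed to be available in your copy of the script.
# Run summarization with the Hugging Face FP16 baseline for comparison (--test_hf assumed)
python ../summarize.py --test_hf \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16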
# Run summarization using the LLaMA 7B model quantized to INT8.
python ../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/weight_only/1-gpu/
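The weight_only engine directory used above must have been produced by a build with INT8 weight-only quantization enabled. A minimal sketch of such a build, assuming build.py exposes --use_weight_only and --weight_only_precision (check build.py --help for the exact flags in your version):
# Build an INT8 weight-only engine (flag names assumed)
python build.py --model_dir ./tmp/llama/7B/ \
--dtype float16 \
--use_weight_only \
--weight_only_precision int8 \
--output_dir ./tmp/llama/7B/trt_engines/weight_only/1-gpu/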
# Run summarization using the LLaMA 7B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/fp16/2-gpu/
# Run summarization using the LLaMA 30B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/30B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/30B/trt_engines/fp16/2-gpu/
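Multi-GPU runs launch one MPI rank per GPU, so the engine must have been built with a matching tensor-parallel configuration. An illustrative two-way tensor-parallel build, assuming build.py accepts --world_size and --tp_size:
# Build a 2-way tensor-parallel FP16 engine for the 30B model (flag names assumed)
python build.py --model_dir ./tmp/llama/30B/ \
--dtype float16 \
--world_size 2 \
--tp_size 2 \
--output_dir ./tmp/llama/30B/trt_engines/fp16/2-gpu/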