recipes/configs/generation.yaml

# Config for running the InferenceRecipe in generate.py to generate output
# from Llama2 7B model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --ignore-patterns "*.safetensors" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
#    tune run generate --config generation

output_dir: ./ # Not needed

# Model arguments
model:
  _component_: torchtune.models.llama2.llama2_7b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-7b-hf/
  checkpoint_files: [
    pytorch_model-00001-of-00002.bin,
    pytorch_model-00002-of-00002.bin,
  ]
  output_dir: ${output_dir}
  model_type: LLAMA2

device: cuda
dtype: bf16

seed: 1234

# Tokenizer arguments
tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: /tmp/Llama-2-7b-hf/tokenizer.model
  max_seq_len: null
  prompt_template: null

# Generation arguments; defaults taken from gpt-fast
prompt:
  system: null
  user: "Tell me a joke."
max_new_tokens: 300
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300

enable_kv_cache: True

quantizer: null