From eefb0ef3651f686b1217eb91ed5e7333dd388a77 Mon Sep 17 00:00:00 2001 From: Xingfu Yi <54938415+Xingfu-Yi@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:13:05 +0800 Subject: [PATCH] Remove the fixed `eot_token` mechanism for SFT (#927) Not all pretrained LLMs use `<|endoftext|>` as the `eot_token`; therefore it's inappropriate to hard-code it. Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .../training/step1_supervised_finetuning/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index d9527af54..aa505a25d 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -191,7 +191,13 @@ def parse_args(): parser.add_argument( "--add_eot_token", action='store_true', - help="Add <|endoftext|> as additional special token to tokenizer") + help="Add `eot_token` as additional special token to tokenizer") + parser.add_argument( + "--eot_token", + type=str, + default="<|endoftext|>", + help="Specify the format of the `eot_token`", + ) ## Print loss parser.add_argument('--print_loss', action='store_true', @@ -234,8 +240,7 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - args.end_of_conversation_token = "<|endoftext|>" - additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + additional_special_tokens = args.eot_token if args.add_eot_token else None tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True, add_special_tokens=additional_special_tokens)