LLM Fine-tuning Hands-on Project - Part 5

Contents

  1. Data for SFT
  2. Data for Eval ( + DPO )
  3. Running SFT


1. Data for SFT

from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd


Dataset 1: the Insurance dataset (generated earlier)

df = pd.read_excel('final_evolving_insurance.xlsx')  # .xlsx file, so read with read_excel
df_new = df[['instruction', 'response']].copy()
df_new = df_new.rename(columns={'response': 'output'})  # match the 'output' column name the training script expects
df_new['input'] = ''  # alpaca-style instruction / input / output format
N = len(df_new)


Dataset 2: an existing dataset

data = load_dataset('DopeorNope/Ko-Optimize_Dataset_train')
data_df = pd.DataFrame(data['train'])
total_N = 1000                           # target size of the final SFT set
sample = data_df.sample(int(total_N - N))  # fill the remainder with randomly sampled rows
sample = sample.reset_index(drop=True)


Final SFT dataset (1,000 examples)

  • Dataset 1 + Dataset 2
insurance_train = pd.concat([sample, df_new], axis=0)
insurance_train = insurance_train.reset_index(drop=True)
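
An optional sanity check before uploading: the merged frame should contain exactly total_N rows and consistent instruction / input / output columns.

# Optional sanity check on the merged SFT frame.
assert len(insurance_train) == total_N, f"expected {total_N} rows, got {len(insurance_train)}"
print(insurance_train.columns.tolist())   # expect instruction / input / output (order may differ)
print(insurance_train.isna().sum())       # NaNs here usually mean the two sources used different column names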


Upload the final dataset

insurance_data = DatasetDict({
  'train': Dataset.from_pandas(insurance_train)
})
insurance_data.push_to_hub('DopeorNope/insurance_train')
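
To double-check the upload, the dataset can be loaded straight back from the Hub:

check = load_dataset('DopeorNope/insurance_train')
print(check)               # expect a single 'train' split with 1,000 examples
print(check['train'][0])   # inspect one example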


2. Data for Eval ( + DPO )

Generated by running the Evolving step one more time!

(process omitted)

DPO dataset

insurance_data = DatasetDict({
  'train': Dataset.from_pandas(df2),
  'validation': Dataset.from_pandas(df1)
})

insurance_data.push_to_hub('DopeorNope/insurance_DPO')
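
The construction of df1 and df2 happens in the omitted Evolving step. For reference, DPO training (e.g., with TRL's DPOTrainer) expects preference pairs; the row below is purely illustrative, and the prompt / chosen / rejected column names are an assumption about that format, not the actual course data.

# Purely illustrative DPO-style preference pair (column names are assumptions;
# the real df1/df2 are built in the omitted Evolving step).
dpo_example = pd.DataFrame([{
    'prompt': '실손의료보험의 보장 범위를 설명해 주세요.',
    'chosen': '실손의료보험은 가입자가 실제 부담한 의료비를 한도 내에서 보장하는 상품으로 ...',  # preferred response
    'rejected': '보험은 좋은 것입니다.',                                                        # dispreferred response
}])
print(dpo_example)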


Validation dataset

insurance_data = DatasetDict({
  'validation': Dataset.from_pandas(df1)
})

insurance_data.push_to_hub('DopeorNope/insurance_eval')


Add a validation split to the SFT train set (created earlier)

data_train = load_dataset('DopeorNope/insurance_train')
data_train['validation'] = Dataset.from_pandas(df1)
data_train.push_to_hub('DopeorNope/insurance_train_final')


3. Running SFT

We will run full fine-tuning with MP (model parallelism), not FSDP!
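
With model parallelism via device_map="auto", the model's layers are partitioned across the visible GPUs and a forward pass flows through them sequentially, whereas FSDP shards the parameters and keeps every GPU computing in parallel. A minimal sketch of how to inspect that layer-to-GPU placement (illustrative only, separate from the training script below):

import torch
from transformers import AutoModelForCausalLM

# Load the base model split across GPUs by layer (naive model parallelism).
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print(model.hf_device_map)   # shows which GPU each block of layers was placed on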


#-m torch.distributed.launch --nproc_per_node=8
export TOKENIZERS_PARALLELISM=true

#CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

python fullfinetuning.py \
    --base_model google/gemma-2-9b-it \
    --data-path DopeorNope/insurance_train_final \
    --output_dir gemma2-fast_campus-insurance \
    --batch_size 32 \
    --micro_batch_size 2 \
    --num_epochs 10 \
    --learning_rate 5e-5 \
    --cutoff_len 2048 \
    --train_on_inputs True \
    --add_eos_token False \
    --group_by_length False \
    --prompt_template_name alpaca1 \
    --lr_scheduler 'cosine' \
    --warmup_steps 0 \
    --wandb_project 'fast_campus_insurance_sft' \
    --wandb_run_name 'fast_campus_insurance_sft_FFT'
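
The command above points --prompt_template_name at alpaca1, which utils/prompter.py loads and fills in inside the training script below. The actual alpaca1 template file is not shown in this post; the following is only a minimal sketch of an alpaca-lora-style Prompter, assuming the standard JSON template layout.

import json
import os.path as osp

class Prompter:
    """Minimal sketch of utils/prompter.py (assumed alpaca-lora style, not the actual course file)."""

    def __init__(self, template_name: str = "alpaca"):
        with open(osp.join("templates", f"{template_name}.json")) as f:
            self.template = json.load(f)   # expects keys: prompt_input, prompt_no_input, response_split

    def generate_prompt(self, instruction: str, input: str = None, label: str = None) -> str:
        # Pick the template variant depending on whether an 'input' field is present.
        if input:
            res = self.template["prompt_input"].format(instruction=instruction, input=input)
        else:
            res = self.template["prompt_no_input"].format(instruction=instruction)
        if label:
            res = f"{res}{label}"          # append the target response for SFT
        return res

The full fullfinetuning.py listing follows.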


import os
import sys
from typing import List
import torch.nn as nn 
import fire
import torch
import transformers

from datasets import load_dataset
from accelerate import Accelerator
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers import CONFIG_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,AutoConfig
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from torch.nn import functional as F

from peft import set_peft_model_state_dict

from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.prompter import Prompter
import warnings
warnings.filterwarnings("ignore", message="torch.utils.checkpoint: please pass in use_reentrant")

class SavePeftModelCallback(TrainerCallback):
    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        kwargs["model"].save_pretrained(checkpoint_folder)

        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        torch.save({}, pytorch_model_path)
        return control


class LoadBestPeftModelCallback(TrainerCallback):
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        print(f"Loading best peft model from {state.best_model_checkpoint} (score: {state.best_metric}).")
        best_model_path = os.path.join(state.best_model_checkpoint, "adapter_model.bin")
        adapters_weights = torch.load(best_model_path)
        model = kwargs["model"]
        set_peft_model_state_dict(model, adapters_weights)
        return control


def train(
    base_model: str = "", 
    data_path: str = "",
    output_dir: str = "",
    batch_size: int = 128,
    micro_batch_size: int = 8,
    num_epochs: int = 1,
    learning_rate: float = 3e-4,
    cutoff_len: int = 4096,
    lr_scheduler: str = "cosine",
    warmup_steps: int = 100, 
    train_on_inputs: bool = False,  # if False, masks out inputs in loss
    add_eos_token: bool = False,
    group_by_length: bool = False,  # faster, but produces an odd training loss curve
    wandb_run_name: str = "",
    resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
    prompt_template_name: str = "alpaca",

):
    if int(os.environ.get("LOCAL_RANK", 0)) == 0:
        print(
            f"Params using prompt template {prompt_template_name}:\n"
            f"base_model: {base_model}\n"
            f"data_path: {data_path}\n"
            f"output_dir: {output_dir}\n"
            f"batch_size: {batch_size}\n"
            f"micro_batch_size: {micro_batch_size}\n"
            f"num_epochs: {num_epochs}\n"
            f"learning_rate: {learning_rate}\n"
            f"cutoff_len: {cutoff_len}\n"
            f"lr_scheduler: {lr_scheduler}\n"
            f"warmup_steps: {warmup_steps}\n"
            f"train_on_inputs: {train_on_inputs}\n"
            f"add_eos_token: {add_eos_token}\n"
            f"group_by_length: {group_by_length}\n"
            f"wandb_run_name: {wandb_run_name}\n"
            f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
        )
    assert (
        base_model
    ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"

    from huggingface_hub import login

    # Read the Hugging Face token from the environment instead of hard-coding a secret in the script.
    login(token=os.environ.get("HF_TOKEN"))

    prompter = Prompter(prompt_template_name)
    device_map = "auto"

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype = torch.bfloat16,
        attn_implementation = "flash_attention_2", #eager
        device_map=device_map,
        max_memory= {0: "3GB", 1: "10GB", 2: "10GB", 3: "10GB"}
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    print(type(model))
    print(model)
    print("length of tokenizer:",len(tokenizer))

    bos = tokenizer.bos_token_id
    eos = tokenizer.eos_token_id
    
    #tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
    pad = tokenizer.pad_token_id
    #model.config.pad_token_id = tokenizer.pad_token_id 
    tokenizer.padding_side = "right"


    def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"])
        
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs: 
            user_prompt = prompter.generate_prompt(
                data_point["instruction"], data_point["input"])
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=add_eos_token)            
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]  # TODO: Speed up?
        return tokenized_full_prompt


    if data_path.endswith(".json") or data_path.endswith(".jsonl"):
        data = load_dataset("json", data_files=data_path)
    else:
        print("================== private dataset")
        data = load_dataset(data_path, token=True)

    if resume_from_checkpoint:
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "pytorch_model.bin"
        )  
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            ) 
            resume_from_checkpoint = (
                True 
            ) 
            
        # The two files above have a different name depending on how they were saved, but are actually the same.
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            set_peft_model_state_dict(model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")

    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = data['validation'].shuffle().map(generate_and_tokenize_prompt)
    

    train_data = train_data.remove_columns(data["train"].column_names)    
    if val_data is not None:
        val_data = val_data.remove_columns(data["validation"].column_names)

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=batch_size // micro_batch_size,  # accumulate so the effective batch size equals batch_size
            warmup_ratio=0.06,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            remove_unused_columns=True,
            dataloader_num_workers=4,
            bf16=True,
            logging_steps=1,
            optim='lomo',  # LOMO: low-memory optimizer that fuses gradient computation and the update step (alternative: "paged_adamw_8bit")
            #max_grad_norm = 1,
            weight_decay = 0.01,
            evaluation_strategy="steps", 
            save_strategy="steps",
            eval_steps = 10,
            save_steps = 10, # original: 1000
            lr_scheduler_type=lr_scheduler,
            output_dir=output_dir,
            gradient_checkpointing=True,
            load_best_model_at_end=True,
            group_by_length=group_by_length,
            report_to = "wandb",
            run_name=wandb_run_name
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
        # callbacks=[SavePeftModelCallback, LoadBestPeftModelCallback], # ONLY USE LoadBestPeftModelCallback if val_set_size > 0
    )
    model.config.use_cache = False

    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)
    
    print('##########################################################################################')
    print('##################################### Training Start #####################################')
    print('##########################################################################################')
    
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    print('##########################################################################################')
    print('##################################### Saving now.... #####################################')
    print('##########################################################################################')
    #model.save_pretrained(output_dir)
    
    tokenizer.save_pretrained(output_dir)
    trainer.save_model()
    model.base_model.save_pretrained(output_dir)
    # Write an empty state dict as pytorch_model.bin; the actual weights were already saved above.
    pytorch_model_path = os.path.join(output_dir, "pytorch_model.bin")
    torch.save({}, pytorch_model_path)


if __name__ == "__main__":
    torch.cuda.empty_cache() 
    fire.Fire(train)
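
After training, the fine-tuned model can be loaded from output_dir for a quick smoke test. A minimal sketch (the path and generation settings are illustrative; adjust them to your environment):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned checkpoint saved to --output_dir above (path is an assumption).
model_dir = "gemma2-fast_campus-insurance"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16, device_map="auto")

# In practice, wrap the question with the same alpaca-style prompt template used during training.
prompt = "실손의료보험이란 무엇인가요?"   # example insurance question
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))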

Reference

https://fastcampus.co.kr/data_online_gpu
