LLM Fine-tuning Hands-on Project - Part 6

Contents

  1. Import Packages
  2. Load Dataset & Models
  3. Sampling Preparation
    1. Sampling hyperparameters
    2. Prompts
    3. Using the SFT data
  4. Sampling
  5. DPO Dataset
    1. Building Train & Validation
    2. Chosen & Rejected
  6. Uploading the Data
  7. DPO
    1. Import Packages
    2. Receiving Arguments from the sh File
    3. DPO Dataset Processing Function
    4. Running DPO


1. Import Packages

from datasets import load_dataset
import vllm
from vllm import LLM, SamplingParams
import pandas as pd
from typing import List
from vllm.outputs import RequestOutput


2. Load Dataset & Models

Dataset

data = load_dataset('DopeorNope/insurance_DPO')

df1 = pd.DataFrame(data['train'])
df2 = pd.DataFrame(data['validation'])
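
Before building the prompts, it is worth a quick look at the columns: the prompt templates below read the 'instruction' column, and section 5 uses 'output'. A small check I added for that purpose:

# Quick sanity check (my addition): confirm the expected columns and sizes.
print(df1.columns.tolist(), len(df1))
print(df2.columns.tolist(), len(df2))
print(df1.iloc[0]['instruction'][:100])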


Models

base_model = 'google/gemma-2-9b-it'  # path to the SFT model
gpu_num = 4
max_token = 4096

model = LLM(model=base_model, tensor_parallel_size=gpu_num, max_model_len=max_token, gpu_memory_utilization=0.6)


3. Sampling Preparation

(1) Sampling hyperparameters

sampling_params = SamplingParams(top_k=5, top_p=0.8, max_tokens=max_token, temperature=0.001,
                                 stop=['<|endoftext|>', '</s>', '<|im_end|>', '<|end_of_text|>'])


(2) Prompts

  • prompts1: for Train
  • prompts2: for Validation
prompts1 = list(map(lambda x: 
    f"아래는 문제를 설명하는 지시사항입니다. 이 요청에 대해 적절하게 답변해주세요.\n\n###지시사항:\n{x['instruction']}\n\n###답변:\n", 
    df1.to_dict('records')))
    
prompts2 = list(map(lambda x: 
    f"아래는 문제를 설명하는 지시사항입니다. 이 요청에 대해 적절하게 답변해주세요.\n\n###지시사항:\n{x['instruction']}\n\n###답변:\n", 
    df2.to_dict('records')))    


(3) Using the SFT data

  • prompts3: uses (part of) the SFT data for Train
df_sft = pd.read_excel('final_evolving_insurance.xlsx')
df_sft = df_sft[['instruction', 'response']]
df_sft.columns = ['instruction', 'output']
prompts3 = list(map(lambda x: 
    f"아래는 문제를 설명하는 지시사항입니다. 이 요청에 대해 적절하게 답변해주세요.\n\n###지시사항:\n{x['instruction']}\n\n###답변:\n", 
    df_sft.to_dict('records')))


4. Sampling

Goal: generate the dispreferred responses that will later be rejected

  • outputs1: for Train
  • outputs2: for Validation
  • outputs3: for Train
outputs1: List[RequestOutput] = model.generate(prompts1, sampling_params)
outputs2: List[RequestOutput] = model.generate(prompts2, sampling_params)
outputs3: List[RequestOutput] = model.generate(prompts3, sampling_params)
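
Each element of these lists is a vLLM RequestOutput, which carries both the prompt it was generated from and the generated completions; the extraction in section 5 relies on exactly these two fields. A minimal inspection sketch (my addition):

# RequestOutput keeps the original prompt and a list of completions; only the first completion is used here.
sample = outputs1[0]
print(sample.prompt[:120])
print(sample.outputs[0].text[:200])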


5. DPO Dataset

(1) Building Train & Validation

extracted_prompts1 = list(map(lambda x: x.prompt, outputs1)) # DPO_train
extracted_prompts2 = list(map(lambda x: x.prompt, outputs2)) # DPO_valid
extracted_prompts3 = list(map(lambda x: x.prompt, outputs3)) # SFT_train

extracted_generated_texts1 = list(map(lambda x: x.outputs[0].text, outputs1))
extracted_generated_texts2 = list(map(lambda x: x.outputs[0].text, outputs2))
extracted_generated_texts3 = list(map(lambda x: x.outputs[0].text, outputs3))


train_question = prompts1 + prompts3 
valid_question = prompts2
print(len(train_question))
print(len(valid_question))
924
100


(2) Chosen & Rejected

  • preferred (chosen) vs. dispreferred (rejected)
train_chosen = list(df1['output']) + list(df_sft['output'])
valid_chosen = list(df2['output'])

train_reject = extracted_generated_texts1 + extracted_generated_texts3
valid_reject = extracted_generated_texts2
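
Because the DPO pairs are matched positionally, the questions, chosen responses, and rejected responses must line up one-to-one. A small alignment check (my addition) before building the DataFrames:

# Every question needs exactly one chosen and one rejected response.
assert len(train_question) == len(train_chosen) == len(train_reject)
assert len(valid_question) == len(valid_chosen) == len(valid_reject)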


dpo_train_df = pd.DataFrame(
    {'question': train_question,
     'response_j': train_chosen,
     'response_k': train_reject}
)

dpo_valid_df = pd.DataFrame(
    {'question': valid_question,
     'response_j': valid_chosen,
     'response_k': valid_reject}
)


6. Uploading the Data

from datasets import load_dataset, DatasetDict, Dataset

insurance_data_DPO = DatasetDict({
    'train': Dataset.from_pandas(dpo_train_df),
    'validation': Dataset.from_pandas(dpo_valid_df)
})

insurance_data_DPO.push_to_hub('DopeorNope/insurance_DPO_final')
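
push_to_hub assumes you are already authenticated to the Hugging Face Hub. If not, one option is a token login beforehand along these lines (a sketch; reading the token from an HF_TOKEN environment variable is my assumption):

import os
from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])  # assumes the token is stored in the HF_TOKEN environment variable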


7. DPO

(Reference: https://github.com/DopeorNope-Lee/Easy_DPO)

(1) Import Packages

import os
from dataclasses import dataclass, field
from typing import Dict, Optional
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments, BitsAndBytesConfig
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from trl import DPOTrainer
from huggingface_hub import login
import argparse


(2) Receiving Arguments from the sh File

def get_args():
    """
		DPO로 훈련시키기 위해, 중요한 인수인자를 sh 파일로 부터 받아오기
    """
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--num_epochs", type=int, default=1)
    parser.add_argument("--beta", type=float, default=0.1)
    parser.add_argument("--datapath", type=str, default="")
    parser.add_argument("--model_name_or_path", type=str, default="")
    parser.add_argument("--learning_rate", type=float, default=1e-5)
    parser.add_argument("--lr_scheduler_type", type=str, default="linear")
    parser.add_argument("--warmup_steps",type=int,default=100)
    parser.add_argument("--token_id", type =str, default='',help='please enter yout huggingface token ID')
    parser.add_argument("--weight_decay",type=float,default=0.05)
    parser.add_argument("--optimizer_type", type=str, default="paged_adamw_32bit")
    parser.add_argument("--per_device_train_batch_size", type=int, default=32)
    parser.add_argument("--per_device_eval_batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
    parser.add_argument("--gradient_checkpointing", type=bool, default=True)
    parser.add_argument("--lora_alpha", type=float, default=16)
    parser.add_argument("--lora_dropout", type=float, default=0.05)
    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--max_prompt_length", type=int, default=4096)
    parser.add_argument("--max_length", type=int, default=4096)
    parser.add_argument("--max_step", type=int, default=1000)
    parser.add_argument("--logging_steps", type=int, default=10)
    parser.add_argument("--save_steps", type=int, default=100)
    parser.add_argument("--eval_steps", type=int, default=100)
    parser.add_argument("--output_dir", type=str, default="./results")
    parser.add_argument("--log_freq", type=int, default=1)
    parser.add_argument("--sanity_check", type=bool, default=False)
    parser.add_argument("--report_to", type=str, default="wandb")
    parser.add_argument("--ignore_bias_buffers", type=bool, default=False)
    parser.add_argument("--lora_target_modules",type=list, default = ['embed_tokens', 'q_proj', 'k_proj', 'v_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head'] )
    
    return parser.parse_args()
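
For illustration, the sh file ultimately expands into a command line whose flags land in this parser; the sketch below (hypothetical script name and flag values) shows that any argument not passed simply keeps its default:

# Hypothetical example of what the sh file passes to the parser.
import sys
sys.argv = ["dpo_train.py",
            "--datapath", "DopeorNope/insurance_DPO_final",
            "--model_name_or_path", "google/gemma-2-9b-it",
            "--num_epochs", "1"]
args = get_args()
print(args.datapath, args.learning_rate, args.beta)  # flags not passed keep their defaults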


(3) DPO Dataset Processing Function

  • Key property of a DPO dataset: pairs (chosen & rejected)

  • The function that builds them: paired_data_preparation

def paired_data_preparation(
    data_dir: str = "", #default
    sanity_check: bool = False,
    cache_dir: str = None,
    split_criteria: str = "train",
    num_proc: int=24,
) -> Dataset:
    """
    이 데이터셋은 이후 딕셔너리 형태로 변환되며  다음과 같은 형태로 prompt, chosen, reject로 담기게 됩니다.
    {
        'prompt': List[str],
        'chosen': List[str],
        'rejected': List[str],
    }

    Prompt의 구조는 다음과 같이 담기게 됩니다(알파카 프롬프트):  
      "###질문: " + <prompt> + "\n\n###답변: "
    """

    dataset = load_dataset(data_dir, split=split_criteria ,cache_dir=cache_dir)    
    original_columns = dataset.column_names

    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 1000)))

    def return_prompt_and_responses(samples) -> Dict[str, str]:
        return {
            "prompt": ["###질문:\n" + question + "\n\n###답변:\n" for question in samples["question"]],
            "chosen": samples["response_j"],
            "rejected": samples["response_k"],
        }

    return dataset.map(
        return_prompt_and_responses,
        batched=True,
        num_proc=num_proc,
        remove_columns=original_columns,
    )
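
A quick way to verify the mapping (my addition, using the dataset pushed in section 6):

# The mapped dataset should expose the 'prompt', 'chosen' and 'rejected' columns.
ds = paired_data_preparation(data_dir="DopeorNope/insurance_DPO_final",
                             split_criteria="train", sanity_check=True)
print(ds.column_names)
print(ds[0]["prompt"][:80])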


(4) Running DPO

def main():
    args = get_args()
    print(
            f"################################ Training Arguments ################################"
            f"model_name_or_path: {args.model_name_or_path}\n"
            f"datapath: {args.datapath}\n"
            f"output_dir: {args.output_dir}\n"
            f"per_device_train_batch_size: {args.per_device_train_batch_size}\n"
            f"per_device_eval_batch_size: {args.per_device_eval_batch_size}\n"
            f"num_epochs: {args.num_epochs}\n"
            f"max_step: {args.max_step}\n"
            f"learning_rate: {args.learning_rate}\n"
            f"cutoff_len(max_length): {args.max_length}\n"
            f"lora_r: {args.lora_r}\n"
            f"lora_alpha: {args.lora_alpha}\n"
            f"lora_dropout: {args.lora_dropout}\n"
            f"lora_target_modules: {args.lora_target_modules}\n"
            f"max_prompt_length: {args.max_prompt_length}\n"
            f"##################################################################################\n"
            
    )

    login(token=args.token_id)

    # SFT MODEL
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,      
        device_map="auto")

    model.config.use_cache = False
    model.is_parallelizable = True
    model.model_parallel = True
    model.config.max_position_embeddings= args.max_prompt_length

    print("model's max_position_embeddings :",model.config.max_position_embeddings)
    
    if args.ignore_bias_buffers:
        # torch distributed hack
        model._ddp_params_and_buffers_to_ignore = [
            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
        ]
    
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token
    #model.resize_token_embeddings(len(tokenizer))
    
    train_dataset = paired_data_preparation(data_dir= args.datapath, split_criteria= "train", sanity_check=args.sanity_check)
    train_dataset = train_dataset.filter(
        lambda x: len(x["prompt"]) + len(x["chosen"]) <= args.max_length
        and len(x["prompt"]) + len(x["rejected"]) <= args.max_length)
    

    eval_dataset = paired_data_preparation(data_dir= args.datapath, split_criteria= "validation", sanity_check=True)
    eval_dataset = eval_dataset.filter(
       lambda x: len(x["prompt"]) + len(x["chosen"]) <= args.max_length
       and len(x["prompt"]) + len(x["rejected"]) <= args.max_length)
    
    training_args = TrainingArguments(
        num_train_epochs= args.num_epochs,
        per_device_train_batch_size = args.per_device_train_batch_size,
        per_device_eval_batch_size = args.per_device_eval_batch_size,
       # max_steps = args.max_step,
        logging_steps = args.logging_steps,
        save_steps = args.save_steps,
        gradient_accumulation_steps = args.gradient_accumulation_steps,
        gradient_checkpointing = args.gradient_checkpointing,
        learning_rate = args.learning_rate,
        evaluation_strategy="steps",
        eval_steps=args.eval_steps,
        output_dir = args.output_dir,
        #report_to = args.report_to,
        lr_scheduler_type = args.lr_scheduler_type,
        warmup_steps = args.warmup_steps,
        optim = args.optimizer_type,
        bf16 = True,
        remove_unused_columns = False,
        run_name = "dpo_llama2",
    )

    peft_config = LoraConfig(
        r = args.lora_r,
        lora_alpha = args.lora_alpha,
        lora_dropout = args.lora_dropout,
        target_modules = args.lora_target_modules,
        bias = "none",
        task_type = "CAUSAL_LM")
    
        
    print("###################################################################################")    
    print("############################  MODEL was Lodaded in GPU ############################")
    print("###################################################################################")    

    dpo_trainer = DPOTrainer(
        model,
        ref_model = None,   # With ref_model=None, DPOTrainer detaches the adapter from the SFT+adapter model and uses it as the reference for the policy optimization, so a second model does not have to be loaded and memory is saved.
        args = training_args,
        beta = args.beta,
        train_dataset= train_dataset,
        eval_dataset = eval_dataset,
        tokenizer = tokenizer,
        peft_config = peft_config,
        max_prompt_length = args.max_prompt_length,
        max_length = args.max_length,
    )

    print("###################################################################################")    
    print("########################  Trainin Process is preparing now  #######################")
    print("###################################################################################")    

    dpo_trainer.train()
    dpo_trainer.save_model(args.output_dir)

    output_dir = os.path.join(args.output_dir, "final_checkpoint")
    dpo_trainer.model.save_pretrained(output_dir)


if __name__ == "__main__" :
    main()
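
save_pretrained on final_checkpoint stores only the LoRA adapter weights. If a standalone model is needed for inference, a common follow-up is to merge the adapter back into the base weights with PEFT; a minimal sketch, assuming the default ./results/final_checkpoint output path:

# Merge sketch (my addition): load the saved DPO adapter and fold it into the base weights.
import torch
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained("./results/final_checkpoint", torch_dtype=torch.float16)
merged = model.merge_and_unload()
merged.save_pretrained("./results/merged_dpo_model")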
