[ Recommender System ]

22. Improving RS Performance with Review Data

( Reference : Fastcampus Recommender System course )


1. Dataset Overview

  • Data used : the 'Software' category of the Amazon Review data (2018)
  • Software.csv & Software_5.json
import os, json
import pandas as pd
import re
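
The code below assumes a path variable pointing to the directory that holds the downloaded files; a minimal placeholder (adjust to your own setup):

path = './data'  # assumed location of Software.csv and Software_5.json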

2. Data Preprocessing

(1) Software.csv

ratings_df = pd.read_csv(os.path.join(path, 'Software.csv'), header=None, names=['item','user','rating','timestamp'], encoding='utf-8')

(2) Software_5.json

data = []
with open(os.path.join(path, 'Software_5.json'), 'r', encoding='utf-8') as f:
    for l in f:
        data.append(json.loads(l.strip()))
all_df = pd.DataFrame.from_dict(data)
all_df = all_df[['reviewerID', 'asin', 'vote', 'reviewText', 'overall']]

Apply the following preprocessing steps:

  • 1) Remove unnecessary characters
  • 2) Keep only the first 30 words of each review
  • 3) Drop reviews that are 10 words or shorter
removal_list =  "‘, ’, ◇, ‘, ”,  ’, ', ·, \“, ·, △, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, .,?, !,【,】, …, ◆,%"

def preprocess_sent(sentence):
    # 1) remove punctuation and quote characters
    sentence = re.sub("[.,'\"’‘”“!?]", "", sentence)
    # 2) replace any remaining non-alphanumeric character with a space
    sentence = re.sub(r"[^0-9a-zA-Z\s]", " ", sentence)
    # 3) collapse repeated whitespace
    sentence = re.sub(r"\s+", " ", sentence)

    # 4) map every character listed in removal_list to a space, then trim
    sentence = sentence.translate(str.maketrans(removal_list, ' '*len(removal_list)))
    sentence = sentence.strip()

    return sentence
all_df = all_df[['reviewText','overall']]
all_df.dropna(how='any', inplace=True)
all_df['reviewText'] = all_df['reviewText'].apply(lambda x: preprocess_sent(str(x).replace('\t',' ').replace('\n',' ')))
all_df['reviewText'] = all_df['reviewText'].apply(lambda x: ' '.join(x.split(' ')[:30]))

all_df['num_lengths'] = all_df['reviewText'].apply(lambda x: len(x.split(' ')))
all_df = all_df[all_df.num_lengths > 10]
all_df.drop('num_lengths', axis=1, inplace=True)
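
As a quick illustration, running preprocess_sent on a made-up review string shows the effect of the substitutions and the character mapping above:

print(preprocess_sent("It's great!! 100% recommended, A++"))
# -> Its great 100 recommended A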

Save the final data

all_df.to_csv(os.path.join(path, 'Software_reviewText.csv'), sep='\t', encoding='utf-8')

3. Loading the Preprocessed Data

Torchtext overview : it makes the following steps straightforward

  • 1) Tokenization
  • 2) Build Vocabulary
  • 3) Numericalize all tokens
  • 4) Create Data Loader
import torch
from torchtext import data      # legacy API; in torchtext >= 0.9 this lives under torchtext.legacy
from torchtext import datasets
import random

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Create the following two Field objects.

Functions : data.Field, data.LabelField

  • 1) TEXT : field that holds the review text
  • 2) LABEL : field that holds the review rating
TEXT = data.Field(tokenize = 'spacy')   # requires a spaCy English model (e.g. python -m spacy download en)
LABEL = data.LabelField()

Load the data based on the two fields defined above.

Function : data.TabularDataset

fields = [(None, None), ('text', TEXT), ('label', LABEL)]

training_data = data.TabularDataset(path=os.path.join(path,'Software_reviewText.csv'),
                                  format ='tsv', fields = fields, skip_header = True)

As an example, one sample looks like this:

print(vars(training_data.examples[0]))
{'text': ['I', 've', 'been', 'using', 'Dreamweaver', 'and', 'its', 'predecessor', 'Macromedias', 'UltraDev', 'for', 'many', 'years', 'For', 'someone', 'who', 'is', 'an', 'experienced', 'web', 'designer', 'this', 'course', 'is', 'a', 'high', 'level', 'review', 'of', 'the', 'CS5'], 'label': '4.0'}

Build the vocabulary

Function : TEXT.build_vocab

MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(training_data, 
                 max_size = MAX_VOCAB_SIZE)

LABEL.build_vocab(training_data)
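
To check how large the resulting vocabulary is (the exact count depends on the data, so no number is claimed here):

print(f'Tokens in TEXT vocabulary: {len(TEXT.vocab)}')   # capped at MAX_VOCAB_SIZE plus <unk>/<pad>
print(f'Number of rating classes: {len(LABEL.vocab)}')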

As an example, check how each rating (1 to 5) was mapped to a label index:

print(LABEL.vocab.stoi)
defaultdict(<function _default_unk_index at 0x7f33c4bac6a8>, {'5.0': 0, '4.0': 1, '3.0': 2, '1.0': 3, '2.0': 4})

Create the data loaders.

Function : data.BucketIterator.splits

BATCH_SIZE = 64

train_data, valid_data = training_data.split(split_ratio=0.8, random_state = random.seed(1234))

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    sort_key = lambda x: len(x.text),
    sort_within_batch=False,
    batch_size = BATCH_SIZE
    )
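
Optionally, pulling one batch from the iterator confirms the tensor layout the model below expects (legacy torchtext returns text as [sent len, batch size]):

sample_batch = next(iter(train_iterator))
print(sample_batch.text.shape)    # [sent len, batch size] -- token indices
print(sample_batch.label.shape)   # [batch size]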

4. Model Architecture

- PyTorch Text Classification

- Yoon Kim, 2014, Convolutional Neural Networks for Sentence Classification (paper link)

[figure2 : CNN model architecture diagram]

Components :

  • 1) Embedding layer
  • 2) Convolutional layer
  • 3) Fully Connected layer
  • 4) Dropout
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        ## 1) Permutation : [sent len, batch size] -> [batch size, sent len]
        text = text.permute(1, 0)        
        
        ## 2) Embedding : [batch size, sent len] -> [batch size, sent len, emb dim]
        embedded = self.embedding(text)
        
        ## 3) Unsqueeze : [batch size, sent len, emb dim] -> [batch size, 1, sent len, emb dim]
        embedded = embedded.unsqueeze(1)       
       
        ## 4) Convolutional layer : 
        ### before ) [batch size, 1, sent len, emb dim] 
        ### after ) [batch size, n_filters, sent len - filter_sizes[n] + 1]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        ## 5) Maxpooling 1d : [batch size, n_filters]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        ## 6) Concatenate & dropout : [batch size, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat(pooled, dim = 1))           
        return self.fc(cat)

Create the model

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

Set the embedding weights of the unknown (<unk>) and padding (<pad>) tokens to zero.

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
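
As a quick sanity check (a sketch, not part of the original pipeline), a dummy batch of random token indices can be passed through the untrained model; the output shape should be [batch size, OUTPUT_DIM]:

dummy = torch.randint(0, INPUT_DIM, (50, 4))   # [sent len = 50, batch size = 4]
with torch.no_grad():
    print(model(dummy).shape)                  # torch.Size([4, 5]) : one logit per rating class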

5. Training the Model

  • Optimizer : Adam optimizer
  • Loss function : Cross Entropy
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
  • Use GPU if available (otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)
  • Accuracy per batch
def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])
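
A tiny worked example with made-up logits: two of the three predictions match their labels, so the accuracy is 2/3:

example_preds = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
example_labels = torch.tensor([1, 0, 0])
print(categorical_accuracy(example_preds, example_labels))   # tensor([0.6667])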

Train function

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0    
    model.train()
    
    for batch in iterator:        
        optimizer.zero_grad()        
        predictions = model(batch.text)        
        loss = criterion(predictions, batch.label)        
        acc = categorical_accuracy(predictions, batch.label)
        loss.backward()        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Evaluate function

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0    
    model.eval()
    
    with torch.no_grad():    
        for batch in iterator:
            predictions = model(batch.text)            
            loss = criterion(predictions, batch.label)            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

epoch_time : a helper function used to report how long each epoch takes

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
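
For example, an elapsed time of 125 seconds is reported as 2 minutes 5 seconds:

print(epoch_time(0, 125))   # (2, 5)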

< Training results >

N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'cnn-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 1.498 | Train Acc: 39.40%
	 Val. Loss: 1.354 |  Val. Acc: 43.87%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 1.333 | Train Acc: 45.24%
	 Val. Loss: 1.285 |  Val. Acc: 45.54%
Epoch: 03 | Epoch Time: 0m 11s
	Train Loss: 1.254 | Train Acc: 48.85%
	 Val. Loss: 1.253 |  Val. Acc: 46.23%
Epoch: 04 | Epoch Time: 0m 11s
	Train Loss: 1.188 | Train Acc: 51.71%
	 Val. Loss: 1.225 |  Val. Acc: 47.24%
Epoch: 05 | Epoch Time: 0m 11s
	Train Loss: 1.113 | Train Acc: 55.51%
	 Val. Loss: 1.202 |  Val. Acc: 49.88%
Epoch: 06 | Epoch Time: 0m 10s
	Train Loss: 1.041 | Train Acc: 58.82%
	 Val. Loss: 1.208 |  Val. Acc: 49.07%
Epoch: 07 | Epoch Time: 0m 10s
	Train Loss: 0.963 | Train Acc: 61.46%
	 Val. Loss: 1.180 |  Val. Acc: 51.23%
Epoch: 08 | Epoch Time: 0m 10s
	Train Loss: 0.884 | Train Acc: 65.54%
	 Val. Loss: 1.193 |  Val. Acc: 51.65%
Epoch: 09 | Epoch Time: 0m 10s
	Train Loss: 0.802 | Train Acc: 69.40%
	 Val. Loss: 1.187 |  Val. Acc: 51.83%
Epoch: 10 | Epoch Time: 0m 10s
	Train Loss: 0.711 | Train Acc: 73.29%
	 Val. Loss: 1.186 |  Val. Acc: 51.74%

6. Evaluating the Model

model.load_state_dict(torch.load('cnn-model.pt'))

import spacy
nlp = spacy.load('en')   # the 'en' shortcut is for spaCy 2.x; on spaCy 3.x use spacy.load('en_core_web_sm')

def predict_class(model, sentence, min_len = 4):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    preds = model(tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

Example) Given the positive review "best item", the model predicts a rating of 5.0.

pred_class = predict_class(model, "best item")
print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')

Predicted class is: 0 = 5.0

7. BERT Embedding

import nltk
import numpy as np   # used below in get_embedding
nltk.download('punkt')
from nltk import tokenize
from transformers import BertModel, BertTokenizer

Embed the reviews using BERT, a well-known pre-trained model.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')   # kept separate from the CNN classifier above

def get_embedding(sentence):
    sent_vectors = []
    for sent in tokenize.sent_tokenize(sentence):
        text = "[CLS] " + sent + " [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segments_ids])

        with torch.no_grad():
            outputs = bert_model(tokens_tensor, segments_tensor)
            encoded_layers = outputs[0] # last hidden state          
            sentence_embedding = torch.mean(encoded_layers[0], dim=0)
            sent_vectors.append(sentence_embedding.detach().numpy())

    return np.array(sent_vectors).mean(axis=0)
bert_embedding = get_embedding('hello, today is thursday')
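
The result is a single vector of BERT-base's hidden size (768), averaged over tokens and then over sentences:

print(bert_embedding.shape)   # (768,)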
