[ Recommender System ]

10. [code] Neural Collaborative Filtering

(Reference: Fastcampus Recommender System course)

Paper: Neural Collaborative Filtering (He et al., 2017) (https://arxiv.org/abs/1708.05031)


Contents

  • 1) Loading the data
  • 2) Modeling
  • 3) Training

1. Loading the Data

(1) DataLoader

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm


class DataLoader:
    def __init__(self, data_path):
        # read_data returns (train_df, val_df), each with user / movie / rate columns
        self.train_df, val_temp_df = read_data(data_path)

        # rating range, used later to rescale the model's sigmoid output
        self.min_rate = min(self.train_df.rate)
        self.max_rate = max(self.train_df.rate)

        # map raw user ids to contiguous indices for the embedding layer
        self.users = self.train_df.user.unique()
        self.num_users = len(self.users)
        self.user2idx = {user_name: idx for idx, user_name in enumerate(self.users)}

        # map raw movie ids to contiguous indices for the embedding layer
        self.movies = self.train_df.movie.unique()
        self.num_movies = len(self.movies)
        self.movie2idx = {movie_name: idx for idx, movie_name in enumerate(self.movies)}

        # keep only validation rows whose user and movie both appear in the training set
        self.val_df = val_temp_df[val_temp_df.user.isin(self.users) & val_temp_df.movie.isin(self.movies)]

    def train_data(self):
        X_train = pd.DataFrame({'user': self.train_df.user.map(self.user2idx),
                                'movie': self.train_df.movie.map(self.movie2idx)})
        y_train = self.train_df['rate'].astype(np.float32)
        return X_train, y_train

    def val_data(self):
        X_val = pd.DataFrame({'user': self.val_df.user.map(self.user2idx),
                              'movie': self.val_df.movie.map(self.movie2idx)})
        y_val = self.val_df['rate'].astype(np.float32)
        return X_val, y_val
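
The read_data helper comes from the course materials and is not shown here. A minimal sketch of what it might look like, assuming a single ratings file with user / movie / rate columns and a random 8:2 split (the column names and split ratio are assumptions):

def read_data(data_path):
    # Hypothetical helper, not the course's actual implementation.
    df = pd.read_csv(data_path)                    # columns: user, movie, rate
    val_df = df.sample(frac=0.2, random_state=42)  # hold out 20% for validation
    train_df = df.drop(val_df.index)
    return train_df, val_df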

(2) Batch Iterator

  • batch_size defaults to 32

class BatchIterator:
    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)

        # shuffle X and y together with a single permutation
        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # true division + ceil so the final partial batch is not dropped
        self.num_batches = int(np.ceil(X.shape[0] / batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        if self._current >= self.num_batches:
            raise StopIteration()
        i = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[i * bs:(i + 1) * bs], self.y[i * bs:(i + 1) * bs]

def batches(X, y, batch_size=32, shuffle=True):
    # wrap BatchIterator in a generator that yields PyTorch tensors
    for x_batch, y_batch in BatchIterator(X, y, batch_size, shuffle):
        x_batch = torch.LongTensor(x_batch)
        y_batch = torch.FloatTensor(y_batch)
        yield x_batch, y_batch.view(-1, 1)  # targets as a (batch_size, 1) column
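
A quick sanity check of the iterator, using hypothetical toy arrays (not from the original):

X_toy = np.arange(10).reshape(5, 2)   # 5 samples of (user_idx, movie_idx)
y_toy = np.ones(5, dtype=np.float32)
for xb, yb in batches(X_toy, y_toy, batch_size=2, shuffle=False):
    print(xb.shape, yb.shape)         # the last batch holds only 1 sample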
dataset = DataLoader(data_path)

config = {
  "z_dim": 16,
  "hidden_layers": [64, 32, 16],
  "embedding_dropout": 0.05,
  "dropouts": [0.3, 0.3, 0.3],
  "learning_rate": 1e-3,
  "weight_decay": 1e-5,
  "batch_size": 8,
  "num_epochs": 3,
  "total_patience": 30,
  "save_path": "params.data"
}

2. Modeling

We write a neural network that performs the embedding.

  • Key idea: concatenate the user embedding and the movie embedding, then feed the result through the network!
  • Variables
    • num_users : the number of users, i.e., the dimensionality of X1 (the user input)
    • num_movies : the number of movies, i.e., the dimensionality of X2 (the movie input)
    • dropouts : the dropout rate applied to each hidden layer
    • embedding_dropout : the dropout rate for the embedding layer (see the sketch after the class below)

class EmbeddingNN(nn.Module):
    def __init__(self, num_users, num_movies, hidden, dropouts, z_dim):
        super().__init__()
        # one z_dim-dimensional embedding per user and per movie
        self.embed_user = nn.Embedding(num_users, z_dim)
        self.embed_movie = nn.Embedding(num_movies, z_dim)
        # the MLP input is the concatenation of both embeddings, hence z_dim*2
        self.hidden_layers = nn.Sequential(*list(self.generate_layers(z_dim*2, hidden, dropouts)))
        self.fc = nn.Linear(hidden[-1], 1)

    def generate_layers(self, z_dim, hidden, dropouts):
        # yield a Linear -> ReLU -> Dropout block for each hidden layer size
        assert len(dropouts) == len(hidden)
        idx = 0
        while idx < len(hidden):
            if idx == 0:
                yield nn.Linear(z_dim, hidden[idx])
            else:
                yield nn.Linear(hidden[idx-1], hidden[idx])
            yield nn.ReLU()
            yield nn.Dropout(dropouts[idx])
            idx += 1

    def forward(self, users, movies, min_rate=0.5, max_rate=5):
        # concatenate user and movie embeddings along the feature dimension
        embed_concat = torch.cat([self.embed_user(users), self.embed_movie(movies)], dim=1)
        x = F.relu(self.hidden_layers(embed_concat))
        # squash to (0, 1) with a sigmoid, then rescale to [min_rate, max_rate]
        out = torch.sigmoid(self.fc(x))
        rate = (out * (max_rate - min_rate)) + min_rate
        return rate
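
Note that the embedding_dropout value in config is not actually used by EmbeddingNN. One way to wire it in (a hypothetical variant, not part of the original code) is to apply dropout to the concatenated embedding before the hidden layers:

class EmbeddingNNWithEmbedDropout(EmbeddingNN):
    def __init__(self, num_users, num_movies, hidden, dropouts, z_dim, embedding_dropout=0.05):
        super().__init__(num_users, num_movies, hidden, dropouts, z_dim)
        self.embed_drop = nn.Dropout(embedding_dropout)

    def forward(self, users, movies, min_rate=0.5, max_rate=5):
        embed_concat = torch.cat([self.embed_user(users), self.embed_movie(movies)], dim=1)
        embed_concat = self.embed_drop(embed_concat)  # dropout on the embedding vector
        x = F.relu(self.hidden_layers(embed_concat))
        out = torch.sigmoid(self.fc(x))
        rate = (out * (max_rate - min_rate)) + min_rate
        return rate
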
model = EmbeddingNN(
    num_users=dataset.num_users, num_movies=dataset.num_movies,
    z_dim=config['z_dim'], hidden=config['hidden_layers'],
    dropouts=config['dropouts']
)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

3. Training

def train(model, dataset, config):
    X_train, y_train = dataset.train_data()
    X_valid, y_valid = dataset.val_data()
    model.to(device)
    batch_size = config['batch_size']
    num_epochs = config['num_epochs']
    max_patience = config['total_patience']
    num_patience = 0
    best_loss = np.inf

    loss_fn = nn.MSELoss()
    loss_fn.to(device)
    opt = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    result = dict()
    for epoch in tqdm(range(num_epochs)):
        model.train()  # enable dropout for training
        training_loss = 0.0
        for batch in batches(X_train, y_train, shuffle=True, batch_size=batch_size):
            x_batch, y_batch = [b.to(device) for b in batch]
            opt.zero_grad()
            outputs = model(x_batch[:, 0], x_batch[:, 1], dataset.min_rate, dataset.max_rate)
            loss = loss_fn(outputs, y_batch)
            loss.backward()
            opt.step()
            # loss is a per-batch mean, so weight it by the batch size
            training_loss += loss.item() * len(x_batch)
        result['train'] = training_loss / len(X_train)  # mean per-sample MSE

        model.eval()  # disable dropout for validation
        with torch.no_grad():
            val_outputs = model(torch.LongTensor(X_valid.user.values).to(device),
                                torch.LongTensor(X_valid.movie.values).to(device),
                                dataset.min_rate, dataset.max_rate)
            val_loss = loss_fn(val_outputs, torch.FloatTensor(y_valid.values).view(-1, 1).to(device))
        result['val'] = val_loss.item()  # MSELoss already averages over samples

        if val_loss < best_loss:
            print('Save new model on epoch: %d' % (epoch + 1))
            best_loss = val_loss
            result['best_loss'] = val_loss.item()
            torch.save(model.state_dict(), config['save_path'])
            num_patience = 0
        else:
            num_patience += 1

        print(f'[epoch: {epoch+1}] train: {result["train"]} - val: {result["val"]}')

        if num_patience >= max_patience:
            print(f"Early Stopped after epoch {epoch+1}")
            break

    return result
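
Putting it together (a usage sketch; data_path is assumed to point at the ratings file):

result = train(model, dataset, config)
print('best validation loss:', result['best_loss'])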

def model_valid(user_id_list, movie_id_list, data_path):
    dataset = DataLoader(data_path)
    val_df = pd.DataFrame({
        'user_id': [dataset.user2idx[x] for x in user_id_list],
        'movie_id': [dataset.movie2idx[x] for x in movie_id_list]
    })
    model = EmbeddingNN(dataset.num_users, dataset.num_movies,
                        config['hidden_layers'], config['dropouts'], config['z_dim'])
    model.load_state_dict(torch.load('params.data'))
    model.eval()  # inference mode: disable dropout
    with torch.no_grad():
        val_output = model(users=torch.LongTensor(val_df.user_id.values),
                           movies=torch.LongTensor(val_df.movie_id.values))
    return val_output
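
For example, scoring a few (user, movie) pairs (the ids below are placeholders and must exist in the training data):

preds = model_valid(user_id_list=[1, 1, 2],
                    movie_id_list=[31, 1029, 1129],
                    data_path=data_path)
print(preds.view(-1))  # predicted ratings, rescaled into the rating range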
