[ Recommender System ]
10.[code] Neural Collaborative Filtering
( Reference : Fastcampus Recommender System course )
paper : Neural Collaborative Filtering ( He, et al., 2017 ) (https://arxiv.org/abs/1708.05031)
Contents
- 1) Loading the data
- 2) Modeling
- 3) Training
1. Loading the Data
(1) DataLoader
import numpy as np
import pandas as pd

class DataLoader:
    def __init__(self, data_path):
        # read_data is assumed to return (train_df, val_df) DataFrames
        # with columns: user, movie, rate
        self.train_df, val_temp_df = read_data(data_path)
        self.min_rate = min(self.train_df.rate)
        self.max_rate = max(self.train_df.rate)

        # user/movie id -> contiguous index mappings, built from the training set
        self.users = self.train_df.user.unique()
        self.num_users = len(self.users)
        self.user2idx = {user_name: idx for idx, user_name in enumerate(self.users)}

        self.movies = self.train_df.movie.unique()
        self.num_movies = len(self.movies)
        self.movie2idx = {movie_name: idx for idx, movie_name in enumerate(self.movies)}

        # keep only validation rows whose user and movie both appear in the training set
        self.val_df = val_temp_df[val_temp_df.user.isin(self.users) & val_temp_df.movie.isin(self.movies)]

    def train_data(self):
        X_train = pd.DataFrame({'user': self.train_df.user.map(self.user2idx),
                                'movie': self.train_df.movie.map(self.movie2idx)})
        y_train = self.train_df['rate'].astype(np.float32)
        return X_train, y_train

    def val_data(self):
        X_val = pd.DataFrame({'user': self.val_df.user.map(self.user2idx),
                              'movie': self.val_df.movie.map(self.movie2idx)})
        y_val = self.val_df['rate'].astype(np.float32)
        return X_val, y_val
(2) Batch Iterator
- batch_size defaults to 32
import torch

class BatchIterator:
    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)
        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # ceil so the last, smaller batch is not dropped
        self.num_batches = int(np.ceil(X.shape[0] / batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        if self._current >= self.num_batches:
            raise StopIteration()
        i = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[i * bs:(i + 1) * bs], self.y[i * bs:(i + 1) * bs]

def batches(X, y, batch_size=32, shuffle=True):
    # wrap BatchIterator and convert each numpy batch to tensors
    for x_batch, y_batch in BatchIterator(X, y, batch_size, shuffle):
        x_batch = torch.LongTensor(x_batch)
        y_batch = torch.FloatTensor(y_batch)
        yield x_batch, y_batch.view(-1, 1)
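A quick usage sketch of batches (the toy arrays below are made up purely for illustration):

X_toy = np.array([[0, 1], [1, 0], [0, 2], [1, 2], [0, 0]])
y_toy = np.array([3.0, 4.0, 5.0, 2.0, 1.0], dtype=np.float32)

for x_b, y_b in batches(X_toy, y_toy, batch_size=2, shuffle=False):
    print(x_b.shape, y_b.shape)  # e.g. torch.Size([2, 2]) torch.Size([2, 1])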
dataset = DataLoader(data_path)
config = {
"z_dim": 16,
"hidden_layers": [64, 32, 16],
"embedding_dropout": 0.05,
"dropouts": [0.3, 0.3, 0.3],
"learning_rate": 1e-3,
"weight_decay": 1e-5,
"batch_size": 8,
"num_epochs": 3,
"total_patience": 30,
"save_path": "params.data"
}
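Note that batch_size is set to 8 here, which overrides the batch_size=32 default of batches() once it is passed through in the training loop below. Also, embedding_dropout appears in the config but is not consumed by the EmbeddingNN model defined next.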
2. Modeling
We write the neural network that sits on top of the embeddings.
- Key point: concatenate the user embedding and the movie embedding, then push the result through the network!
- Variables
- num_users : the number of users, i.e. the dimension of X1 (the user input)
- num_movies : the number of movies, i.e. the dimension of X2 (the movie input)
- dropouts : the dropout rate applied to each hidden layer
- embedding_dropout : the dropout rate applied at the embedding layer
import torch.nn as nn
import torch.nn.functional as F

class EmbeddingNN(nn.Module):
    def __init__(self, num_users, num_movies, hidden, dropouts, z_dim):
        super().__init__()
        self.embed_user = nn.Embedding(num_users, z_dim)
        self.embed_movie = nn.Embedding(num_movies, z_dim)
        # MLP on top of the concatenated user/movie embeddings (hence z_dim * 2)
        self.hidden_layers = nn.Sequential(*list(self.generate_layers(z_dim * 2, hidden, dropouts)))
        self.fc = nn.Linear(hidden[-1], 1)

    def generate_layers(self, z_dim, hidden, dropouts):
        assert len(dropouts) == len(hidden)
        idx = 0
        while idx < len(hidden):
            if idx == 0:
                yield nn.Linear(z_dim, hidden[idx])
            else:
                yield nn.Linear(hidden[idx - 1], hidden[idx])
            yield nn.ReLU()
            yield nn.Dropout(dropouts[idx])
            idx += 1

    def forward(self, users, movies, min_rate=0.5, max_rate=5):
        # concatenate the two embeddings and run them through the MLP
        embed_concat = torch.cat([self.embed_user(users), self.embed_movie(movies)], dim=1)
        x = F.relu(self.hidden_layers(embed_concat))
        # squash to (0, 1), then rescale into the rating range
        out = torch.sigmoid(self.fc(x))
        rate = (out * (max_rate - min_rate)) + min_rate
        return rate
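Since the sigmoid output lies in (0, 1), the final prediction is squeezed into (min_rate, max_rate). For example, a sigmoid output of 0.5 with min_rate=0.5 and max_rate=5 gives 0.5 * (5 - 0.5) + 0.5 = 2.75.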
model = EmbeddingNN(
    num_users=dataset.num_users, num_movies=dataset.num_movies,
    z_dim=config['z_dim'], hidden=config['hidden_layers'],
    dropouts=config['dropouts']
)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
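A quick sanity check before training; the dummy index tensors below are made up and only assume the dataset contains at least three users and three movies:

dummy_users = torch.LongTensor([0, 1, 2])
dummy_movies = torch.LongTensor([0, 1, 2])
with torch.no_grad():
    preds = model(dummy_users, dummy_movies, dataset.min_rate, dataset.max_rate)
print(preds.shape)  # torch.Size([3, 1]), values inside (min_rate, max_rate)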
3. Training
import torch.optim as optim
from tqdm import tqdm

def train(model, dataset, config):
    X_train, y_train = dataset.train_data()
    X_valid, y_valid = dataset.val_data()
    model.to(device)

    batch_size = config['batch_size']
    num_epochs = config['num_epochs']
    max_patience = config['total_patience']
    num_patience = 0
    best_loss = np.inf

    loss_fn = nn.MSELoss()
    loss_fn.to(device)
    opt = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    result = dict()
    for epoch in tqdm(range(num_epochs)):
        # --- training ---
        model.train()
        training_loss = 0.0
        num_batches = 0
        for batch in batches(X_train, y_train, shuffle=True, batch_size=batch_size):
            x_batch, y_batch = [b.to(device) for b in batch]
            opt.zero_grad()
            outputs = model(x_batch[:, 0], x_batch[:, 1], dataset.min_rate, dataset.max_rate)
            loss = loss_fn(outputs, y_batch)
            loss.backward()
            opt.step()
            training_loss += loss.item()
            num_batches += 1
        # average of the per-batch MSE losses
        result['train'] = training_loss / num_batches

        # --- validation ---
        model.eval()
        with torch.no_grad():
            val_outputs = model(torch.LongTensor(X_valid.user.values).to(device),
                                torch.LongTensor(X_valid.movie.values).to(device),
                                dataset.min_rate, dataset.max_rate)
            # MSELoss already averages over the validation samples
            val_loss = loss_fn(val_outputs, torch.FloatTensor(y_valid.values).view(-1, 1).to(device))
        result['val'] = val_loss.item()

        if val_loss < best_loss:
            print('Save new model on epoch: %d' % (epoch + 1))
            best_loss = val_loss
            result['best_loss'] = val_loss.item()
            torch.save(model.state_dict(), config['save_path'])
            num_patience = 0
        else:
            num_patience += 1

        print(f'[epoch: {epoch+1}] train: {result["train"]} - val: {result["val"]}')
        if num_patience >= max_patience:
            print(f"Early Stopped after epoch {epoch+1}")
            break
    return result
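Putting it together, assuming dataset, model, config, and device are defined as above:

result = train(model, dataset, config)
print(result)  # {'train': ..., 'val': ..., 'best_loss': ...}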
def model_valid(user_id_list, movie_id_list, data_path):
    dataset = DataLoader(data_path)
    # map raw user/movie ids to the indices the model was trained on
    val_df = pd.DataFrame({
        'user_id': [dataset.user2idx[x] for x in user_id_list],
        'movie_id': [dataset.movie2idx[x] for x in movie_id_list]
    })
    model = EmbeddingNN(dataset.num_users, dataset.num_movies,
                        config['hidden_layers'], config['dropouts'], config['z_dim'])
    model.load_state_dict(torch.load('params.data'))
    model.eval()
    with torch.no_grad():
        val_output = model(users=torch.LongTensor(val_df.user_id.values),
                           movies=torch.LongTensor(val_df.movie_id.values))
    return val_output
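For example (the ids below are placeholders; they must be raw user/movie ids that actually appear in the training data):

preds = model_valid(user_id_list=[1, 1, 2],
                    movie_id_list=[31, 1029, 1061],
                    data_path=data_path)
print(preds)  # predicted ratings, one row per (user, movie) pair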