[ Recommender System ]
19. [code] AutoEncoder meets Collaborative Filtering
( Reference : Fastcampus Recommender System lecture )
paper : AutoRec: Autoencoders Meet Collaborative Filtering ( Sedhain et al., 2015 )
( https://users.cecs.anu.edu.au/~akmenon/papers/autorec/autorec-paper.pdf )
1. Import Data
Build a Dataset class for the KMRD dataset.
import torch
from torch.utils.data import Dataset, DataLoader

class KMRDdataset(Dataset):
    def __init__(self, df, user2idx, movie2idx, item_based=True):
        self.min_rate = min(df.rate)
        self.max_rate = max(df.rate)
        self.user = [user2idx[u] for u in df.user.values]
        self.movie = [movie2idx[m] for m in df.movie.values]
        self.rate = df.rate.values

        if item_based:
            # Each sample is one movie's rating vector over all users.
            input_tensor = torch.LongTensor([self.movie, self.user])
            self.data = torch.sparse_coo_tensor(input_tensor, torch.FloatTensor(self.rate),
                                                torch.Size([len(movie2idx), len(user2idx)])).to_dense()
        else:
            # Each sample is one user's rating vector over all movies.
            input_tensor = torch.LongTensor([self.user, self.movie])
            self.data = torch.sparse_coo_tensor(input_tensor, torch.FloatTensor(self.rate),
                                                torch.Size([len(user2idx), len(movie2idx)])).to_dense()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
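To see what the dataset stores, here is a minimal sketch with toy numbers (the ratings and the 2 x 3 shape are made up for illustration):

# Hypothetical example: 3 ratings laid out item-based, 2 movies x 3 users
indices = torch.LongTensor([[0, 0, 1],    # movie indices
                            [0, 2, 1]])   # user indices
values = torch.FloatTensor([8.0, 10.0, 6.0])
dense = torch.sparse_coo_tensor(indices, values, torch.Size([2, 3])).to_dense()
# dense = [[8., 0., 10.],
#          [0., 6., 0.]]  -> unrated cells are 0 and get masked out in the loss later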
Create the train & validation datasets ( + user/movie <-> index dictionaries ).
train_df, val_df, user2idx, movie2idx = read_data(data_path=data_path)
train_dataset = KMRDdataset(train_df, user2idx, movie2idx)
val_dataset = KMRDdataset(val_df, user2idx, movie2idx)
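read_data comes from the lecture code and is not shown here; below is a minimal sketch of what it is assumed to do (the rates.csv layout, the column names, and the 9:1 split are assumptions, not the lecture's exact implementation):

import os
import pandas as pd
from sklearn.model_selection import train_test_split

def read_data(data_path):
    # Assumed layout: KMRD's rates.csv with (user, movie, rate, time) columns.
    df = pd.read_csv(os.path.join(data_path, 'rates.csv'))
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=1234)
    # Map raw ids to contiguous indices usable as tensor coordinates;
    # built from the full df so both splits share the same dictionaries.
    user2idx = {u: i for i, u in enumerate(df.user.unique())}
    movie2idx = {m: i for i, m in enumerate(df.movie.unique())}
    return train_df, val_df, user2idx, movie2idx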
Create DataLoaders: batch size 64 for training and 16 for validation.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)
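As a quick sanity check on one batch (shapes assume the default item_based=True, so each sample is one movie's rating vector over all users):

batch = next(iter(train_dataloader))
print(batch.shape)  # torch.Size([64, len(user2idx)])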
2. Deep Autoencoder
Build an autoencoder with a deep NN structure.
import torch.nn as nn

class DeepAutoEncoder(nn.Module):
    def __init__(self, hidden_layers, dropout=0.0):
        super(DeepAutoEncoder, self).__init__()
        self.encoder, self.decoder = self.make_ENC_DEC(hidden_layers, dropout)

    def forward(self, x):
        return self.decoder(self.encoder(x))

    def make_ENC_DEC(self, hidden_layers, dropout=0.0):
        # Encoder: Linear + Sigmoid per layer, with dropout between hidden
        # layers (none right before the bottleneck output).
        enc_layers = []
        dec_layers = []
        for idx, num_hidden in enumerate(hidden_layers):
            if idx < len(hidden_layers) - 1:
                enc_layers.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
                enc_layers.append(nn.Sigmoid())
                if idx != len(hidden_layers) - 2 and dropout > 0:
                    enc_layers.append(nn.Dropout(dropout))

        # Decoder mirrors the encoder with identity (linear) activations.
        hidden_layers = list(reversed(hidden_layers))
        for idx, num_hidden in enumerate(hidden_layers):
            if idx < len(hidden_layers) - 1:
                dec_layers.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
                dec_layers.append(nn.Identity())
                if idx != len(hidden_layers) - 2 and dropout > 0:
                    dec_layers.append(nn.Dropout(dropout))

        encoder = nn.Sequential(*enc_layers)
        decoder = nn.Sequential(*dec_layers)
        return encoder, decoder
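To check what make_ENC_DEC builds, print a small instance (the 10-dim input below is just a placeholder for num_users):

toy_model = DeepAutoEncoder(hidden_layers=[10, 30, 40, 50])
print(toy_model.encoder)  # Linear(10->30), Sigmoid, Linear(30->40), Sigmoid, Linear(40->50), Sigmoid
print(toy_model.decoder)  # Linear(50->40), Identity, Linear(40->30), Identity, Linear(30->10), Identity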
3. Set optimizer & loss function
num_users = len(user2idx.keys())
num_movies = len(movie2idx.keys())
- Build the model so that the AE's hidden layer configuration is [30-40-50-40-30]. Since the dataset is item-based, each input is a rating vector of length num_users, so num_users is prepended to hidden_layers (otherwise the first Linear layer would not match the input dimension).
import torch.optim as optim

model = DeepAutoEncoder(hidden_layers=[num_users, 30, 40, 50], dropout=0.0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
- Initialize weights with Xavier initialization and biases with zeros.
def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.zeros_(m.bias)

# apply() visits every submodule, so each nn.Linear gets initialized;
# calling weights_init(model) directly would do nothing, since the model itself is not nn.Linear.
model.apply(weights_init)
- Compute the masked MSE (MMSE) loss: entries without a rating (0 in the dense matrix) are excluded.
# Masked MSE, following NVIDIA's DeepRecommender implementation
def MSEloss(inputs, targets, size_average=False):
    mask = (targets != 0).float()  # 1 where a rating is observed, 0 elsewhere
    num_rates = torch.sum(mask)
    criterion = nn.MSELoss(reduction='mean' if size_average else 'sum')
    # Masked predictions match the zeros in targets, so unrated cells add no error.
    loss = criterion(inputs * mask, targets)
    return loss, torch.Tensor([1.0]) if size_average else num_rates
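A quick check that the mask behaves as intended (toy numbers, made up for illustration):

pred = torch.FloatTensor([[7.0, 3.0, 9.0]])
target = torch.FloatTensor([[8.0, 0.0, 10.0]])  # middle rating unobserved
loss, num_rates = MSEloss(pred, target)
print(loss.item(), num_rates.item())  # (7-8)^2 + (9-10)^2 = 2.0, over num_rates = 2.0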
4. Train
(1) Train
model.train()
train_loss = 0
for idx, batch in enumerate(train_dataloader):
    optimizer.zero_grad()
    pred = model(batch)
    loss, num_rates = MSEloss(pred, batch)
    loss = torch.sqrt(loss / num_rates)  # RMSE over observed ratings only
    loss.backward()
    train_loss += loss.item()
    optimizer.step()

print(train_loss / (idx + 1))  # average per-batch RMSE for the epoch
(2) Validation
model.eval()
val_loss = 0
with torch.no_grad():
    for idx, batch in enumerate(val_dataloader):
        pred = model(batch)
        loss, num_ratings = MSEloss(pred, batch)
        loss = torch.sqrt(loss / num_ratings)
        val_loss += loss.item()

print(val_loss / (idx + 1))
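After training, predictions come from reconstructing a rating vector; below is a hypothetical inference sketch (movie index 0 is an arbitrary pick) that clamps outputs to the observed rating range:

# Hypothetical: reconstruct the rating vector of the movie at index 0.
model.eval()
with torch.no_grad():
    movie_vector = train_dataset.data[0]   # its observed ratings over all users
    pred_ratings = model(movie_vector)     # reconstructed (predicted) ratings
    pred_ratings = torch.clamp(pred_ratings, train_dataset.min_rate, train_dataset.max_rate)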