[ Recommender System ]

19.[code] AutoEncoder meets Collaborative Filtering

( 참고 : Fastcampus 추천시스템 강의 )

paper : AutoRec : Autoencoders Meet Collaborative Filtering ( Sedhain et al., 2015 )

( https://users.cecs.anu.edu.au/~akmenon/papers/autorec/autorec-paper.pdf )


1. Import Data

KMRD dataset class를 만들어준다.

class KMRDdataset(Dataset):
    """Dense rating matrix built from a long-format ratings DataFrame.

    Each dataset item is one row of the dense matrix; unobserved
    (user, movie) pairs are filled with 0.

    Args:
        df: DataFrame exposing ``user``, ``movie`` and ``rate`` columns.
        user2idx: mapping from raw user id to a contiguous row/column index.
        movie2idx: mapping from raw movie id to a contiguous row/column index.
        item_based: if True the matrix is (movies x users), so one item is a
            movie's rating vector over all users; otherwise it is
            (users x movies).
    """

    def __init__(self, df, user2idx, movie2idx, item_based=True):
        self.min_rate = min(df.rate)
        self.max_rate = max(df.rate)

        self.user = [user2idx[u] for u in df.user.values]
        self.movie = [movie2idx[m] for m in df.movie.values]
        self.rate = df.rate.values

        # Only the orientation differs between the two modes.
        if item_based:
            rows, cols = self.movie, self.user
            shape = (len(movie2idx), len(user2idx))
        else:
            rows, cols = self.user, self.movie
            shape = (len(user2idx), len(movie2idx))

        # torch.sparse_coo_tensor replaces the deprecated
        # torch.sparse.FloatTensor constructor; the explicit float32
        # conversion makes this independent of the rating column's dtype.
        indices = torch.tensor([rows, cols], dtype=torch.long)
        values = torch.as_tensor(self.rate, dtype=torch.float32)
        self.data = torch.sparse_coo_tensor(indices, values, shape).to_dense()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


train & validation dataset ( + user/movie <-> index dictionary )를 생성한다.

# Load both splits together with the id -> index dictionaries.
train_df, val_df, user2idx, movie2idx = read_data(data_path=data_path)

# Both datasets use the same dictionaries and the item-based layout.
train_dataset = KMRDdataset(train_df, user2idx, movie2idx, item_based=True)
val_dataset = KMRDdataset(val_df, user2idx, movie2idx, item_based=True)


train은 batch size=64, validation은 batch size=16인 dataloader를 만들어준다.

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Validation only measures loss, so a fixed order keeps the per-batch log
# reproducible.
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)


2. Deep Autoencoder

Deep NN 구조의 autoencoder를 생성한다.

class DeepAutoEncoder(nn.Module):
    """Symmetric fully-connected autoencoder.

    Args:
        hidden_layers: layer widths of the encoder, input width first
            (e.g. ``[30, 40, 50]``). The decoder mirrors them in reverse.
        dropout: dropout probability applied between layers. ``False`` is
            accepted for backward compatibility and means ``0.0``.
    """

    def __init__(self, hidden_layers, dropout=False):
        super().__init__()
        self.encoder, self.decoder = self.make_ENC_DEC(hidden_layers, dropout)

    def forward(self, x):
        """Encode ``x`` and reconstruct it with the decoder."""
        return self.decoder(self.encoder(x))

    def make_ENC_DEC(self, hidden_layers, dropout=0.2):
        """Build the encoder and decoder ``nn.Sequential`` stacks.

        The encoder uses Sigmoid activations, the decoder uses Identity.
        Dropout is inserted after every layer except the last one of each
        stack.

        Returns:
            Tuple ``(encoder, decoder)``.
        """
        # nn.Dropout expects a probability; coerce a bool flag such as
        # dropout=False into the float 0.0 instead of passing the bool on.
        drop_p = float(dropout)
        sizes = list(hidden_layers)

        # The encoder is created first so parameter-initialisation order
        # (and therefore RNG consumption) is encoder, then decoder.
        encoder = nn.Sequential(*self._stack(sizes, nn.Sigmoid, drop_p))
        decoder = nn.Sequential(*self._stack(sizes[::-1], nn.Identity, drop_p))
        return encoder, decoder

    @staticmethod
    def _stack(sizes, activation, drop_p):
        """Return [Linear, activation, Dropout]* layers for consecutive sizes."""
        layers = []
        last = len(sizes) - 2  # index of the final Linear layer in the stack
        for idx, (n_in, n_out) in enumerate(zip(sizes, sizes[1:])):
            layers.append(nn.Linear(n_in, n_out, bias=True))
            layers.append(activation())
            if idx != last:
                layers.append(nn.Dropout(drop_p))
        return layers


3. Set optimizer & loss function

# Sizes of the user and movie vocabularies.
num_users, num_movies = len(user2idx), len(movie2idx)
  • AE의 hidden layer 구성이 [30-40-50-40-30]이 되도록한다
# NOTE(review): the first width (30) is the model's input size, so it must
# equal the width of each dataset row passed to model(batch) - confirm.
# num_users / num_movies are computed above but not used here.
layer_sizes = [30, 40, 50]
model = DeepAutoEncoder(hidden_layers=layer_sizes, dropout=False)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


  • weight는 xavier 초기값으로, bias는 0으로 초기값을 지정해준다.
def weights_init(m):
    """Initialise one module in place: Xavier-uniform weight, zero bias.

    Only ``nn.Linear`` modules are touched; any other module is left as is,
    so the function can be passed to ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight)
    # A Linear layer created with bias=False has bias set to None.
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)

# Module.apply visits every submodule recursively. Calling
# weights_init(model) directly would only test the top-level module, which
# is not an nn.Linear, and therefore initialise nothing.
model.apply(weights_init)


  • Masked MSE loss를 구한다 ( rating이 없는 항목은 loss 계산에서 제외 )
# NVIDIA Recommender System
def MSEloss(inputs, targets, size_average=False):
    """Masked squared error: entries whose target is 0 are ignored.

    Predictions at positions where ``targets == 0`` are zeroed before the
    loss is computed, so those positions contribute no error.

    Args:
        inputs: predicted ratings, same shape as ``targets``.
        targets: true ratings, with 0 marking "no rating".
        size_average: if False (default) the squared errors are summed and
            the number of observed ratings is returned as the normaliser.
            If True, ``nn.MSELoss`` averages over all elements (masked ones
            included) and the normaliser is the constant tensor ``[1.0]``.

    Returns:
        Tuple ``(loss, normaliser)``; callers divide ``loss`` by
        ``normaliser``.
    """
    mask = (targets != 0).float()
    num_rates = mask.sum()
    criterion = nn.MSELoss(reduction='mean' if size_average else 'sum')
    loss = criterion(inputs * mask, targets)
    # A plain tensor replaces the deprecated autograd.Variable wrapper.
    normaliser = torch.Tensor([1.0]) if size_average else num_rates
    return loss, normaliser


4. Train

(1) Train

model.train()
train_loss = 0.0
num_batches = 0  # batches that contained at least one observed rating
for batch in train_dataloader:
    optimizer.zero_grad()

    pred = model(batch)
    loss, num_rates = MSEloss(pred, batch)
    if num_rates == 0:
        # No observed ratings in this batch: loss / num_rates would be
        # 0 / 0 = NaN, and backward() would push NaN into the weights.
        continue

    # Per-batch RMSE over the observed ratings.
    loss = torch.sqrt(loss / num_rates)
    loss.backward()
    optimizer.step()

    train_loss += loss.item()
    num_batches += 1

    # Running mean of the per-batch RMSE.
    print(train_loss / num_batches)


(2) Validation

model.eval()
val_loss = 0.0
num_batches = 0  # batches that contained at least one observed rating
with torch.no_grad():
    for batch in val_dataloader:
        pred = model(batch)
        loss, num_ratings = MSEloss(pred, batch)
        if num_ratings == 0:
            # No observed ratings in this batch: 0 / 0 would give NaN and
            # permanently poison the running average.
            continue

        # Per-batch RMSE over the observed ratings.
        val_loss += torch.sqrt(loss / num_ratings).item()
        num_batches += 1

        # Running mean of the per-batch RMSE.
        print(val_loss / num_batches)

Categories:

Updated: