[ CS224W - Colab 2 ]

( Reference : CS224W: Machine Learning with Graphs )

( Reference : https://github.com/luciusssss/CS224W-Colab/blob/main/CS224W-Colab%202.ipynb )


PyTorch Geometric provides two core modules

torch_geometric.datasets

  • a variety of common graph datasets

torch_geometric.data

  • provides the data handling of graphs in PyTorch tensors.
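
For a concrete picture of what torch_geometric.data handles, here is a minimal sketch (my own example, not from the Colab) of a hand-built graph:

import torch
from torch_geometric.data import Data

# a triangle graph; each undirected edge is stored twice ( once per direction )
edge_index = torch.tensor([[0, 1, 1, 2, 2, 0],
                           [1, 0, 2, 1, 0, 2]])
x = torch.randn(3, 4)  # 3 nodes, 4 features each

data = Data(x=x, edge_index=edge_index)
print(data.num_nodes, data.num_edges)  # 3 6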

[ PyG Datasets ]

torch_geometric.datasets

from torch_geometric.datasets import TUDataset

root = './enzymes'
name = 'ENZYMES'

pyg_dataset = TUDataset(root, name)


1. Introduction to the ENZYMES dataset

  • number of classes
  • number of features
number_of_classes = pyg_dataset.num_classes # 6
number_of_features = pyg_dataset.num_features # 3


2. Label of a Graph

  • What is the label of the graph at a given index?
  • Result : the label of graph 100 is 4
idx = 100
pyg_dataset[idx].y[0] # 4


3. Number of Edges

  • How many edges does the graph at a given index have?
idx = 200
# each undirected edge appears twice in edge_index, so divide by 2
pyg_dataset[idx].edge_index.shape[1] // 2
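
Printing a single graph shows its tensor layout directly (illustrative output; the exact sizes depend on the graph):

print(pyg_dataset[200])
# e.g. Data(edge_index=[2, ...], x=[..., 3], y=[1])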


[ Open Graph Benchmark (OGB) ]

A collection of realistic, large-scale, and diverse benchmark datasets for machine learning on graphs.

( OGB also supports loading its datasets as PyG Dataset and Data objects )


Downloading a dataset

  • the ogbn-arxiv dataset from OGB
import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset
dataset_name = 'ogbn-arxiv'
dataset = PygNodePropPredDataset(name=dataset_name,
                                 transform=T.ToSparseTensor())

print(len(dataset)) # 1 ... this dataset contains a single graph

data = dataset[0]


4. Number of Features in ogbn-arxiv

data.num_features
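
For context, ogbn-arxiv is a citation graph; per the OGB documentation it has about 169K nodes, 1.17M edges, 128-dimensional node features, and 40 subject-area classes:

print(data.num_features)    # 128
print(data.num_nodes)       # 169343
print(dataset.num_classes)  # 40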


[ GNN : Node Property Prediction ]

We will build a GNN using PyTorch Geometric

( for node classification )

  • we will use PyG's GCNConv


(1) Import Packages

import torch
import torch.nn.functional as F

from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

device = 'cuda' if torch.cuda.is_available() else 'cpu'


(2) Load & Preprocess Dataset

  • Load data
dataset_name = 'ogbn-arxiv'
dataset = PygNodePropPredDataset(name=dataset_name,
                                 transform=T.ToSparseTensor())
data = dataset[0]
  • Preprocess data
# Make the adjacency matrix symmetric
# ( ogbn-arxiv is a directed citation graph, but GCN assumes undirected edges )
data.adj_t = data.adj_t.to_symmetric()
data = data.to(device)

# Get the train / valid / test index split
split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)
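
get_idx_split returns a dict with one index tensor per split:

print(split_idx.keys())  # dict_keys(['train', 'valid', 'test'])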


(3) Modeling ( GCN )

Model architecture : a stack of [ GCNConv → BatchNorm1d → ReLU → Dropout ] blocks, followed by a final GCNConv and a log-softmax.


return_embeds

  • True : return the node embeddings themselves
  • False : return the log-softmax output
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds=False):
        super(GCN, self).__init__()

        # (1) GCN Conv layers
        self.convs = torch.nn.ModuleList(
            [GCNConv(in_channels=input_dim, out_channels=hidden_dim)] +
            [GCNConv(in_channels=hidden_dim, out_channels=hidden_dim)
                for i in range(num_layers-2)] +
            [GCNConv(in_channels=hidden_dim, out_channels=output_dim)])

        # (2) Batch Norm layers
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(num_features=hidden_dim)
                for i in range(num_layers-1)])

        self.softmax = torch.nn.LogSoftmax(dim=-1)
        self.dropout = dropout
        self.return_embeds = return_embeds

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t):
        for conv, bn in zip(self.convs[:-1], self.bns):
            x = F.relu(bn(conv(x, adj_t)))
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, adj_t)

        if self.return_embeds:
            return x
        else:
            return self.softmax(x)
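
A quick shape check on a toy graph (my own sketch; GCNConv also accepts a plain edge_index in place of a SparseTensor):

toy_model = GCN(input_dim=4, hidden_dim=8, output_dim=3, num_layers=3, dropout=0.5)
x = torch.randn(5, 4)
edge_index = torch.tensor([[0, 1, 2, 3],
                           [1, 2, 3, 4]])
print(toy_model(x, edge_index).shape)  # torch.Size([5, 3])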


(4) Train function

def train(model, data, train_idx, optimizer, loss_fn):
    # (1) train mode
    model.train()

    # (2) reset gradients
    optimizer.zero_grad()

    # (3) compute output ( on all nodes )
    y_hat = model(data.x, data.adj_t)

    # (4) compute loss ( on train nodes only )
    loss = loss_fn(y_hat[train_idx], data.y[train_idx].reshape(-1))

    # (5) back-propagation
    loss.backward()
    optimizer.step()

    return loss.item()


(5) Test function

@torch.no_grad()
def test(model, data, split_idx, evaluator):
    # (1) eval mode
    model.eval()

    # (2) compute output ( predictions & predicted classes )
    y_hat = model(data.x, data.adj_t)
    y_pred = y_hat.argmax(dim=-1, keepdim=True)

    # (3) accuracy on the train / valid / test splits
    train_acc = evaluator.eval({
        'y_true': data.y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']]})['acc']
    valid_acc = evaluator.eval({
        'y_true': data.y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']]})['acc']
    test_acc = evaluator.eval({
        'y_true': data.y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']]})['acc']

    return train_acc, valid_acc, test_acc


(6) Arguments

args = {
    'device': device,
    'num_layers': 3,
    'hidden_dim': 256,
    'dropout': 0.5,
    'lr': 0.01,
    'epochs': 100,
}


(7) Build Model & Evaluator

model = GCN(data.num_features, args['hidden_dim'],
            dataset.num_classes, args['num_layers'],
            args['dropout']).to(device)

evaluator = Evaluator(name='ogbn-arxiv')


(8) Train model

import copy

# (1) reset parameters
model.reset_parameters()

# (2) set optimizer & loss function
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = F.nll_loss  # NLL loss pairs with the model's LogSoftmax output

# (3) keep track of the best model / accuracy
best_model = None
best_valid_acc = 0

# (4) training loop
for epoch in range(1, 1 + args["epochs"]):
  ### Train
  loss = train(model, data, train_idx, optimizer, loss_fn)
    
  ### Test
  result = test(model, data, split_idx, evaluator)
    
  ### Record results
  train_acc, valid_acc, test_acc = result
    
  if valid_acc > best_valid_acc:
      best_valid_acc = valid_acc
      best_model = copy.deepcopy(model)
  print(f'Epoch: {epoch:02d}, '
        f'Loss: {loss:.4f}, '
        f'Train: {100 * train_acc:.2f}%, '
        f'Valid: {100 * valid_acc:.2f}%, '
        f'Test: {100 * test_acc:.2f}%')


Re-evaluate on all splits using the model that achieved the best validation accuracy

best_result = test(best_model, data, split_idx, evaluator)

train_acc, valid_acc, test_acc = best_result
print(f'Best model: '
      f'Train: {100 * train_acc:.2f}%, '
      f'Valid: {100 * valid_acc:.2f}%, '
      f'Test: {100 * test_acc:.2f}%')


[ GNN : Graph Property Prediction ]

This time, we use the ogbg-molhiv dataset.

(1) Load data

from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
from torch_geometric.loader import DataLoader  # lived in torch_geometric.data in older PyG versions
from tqdm.notebook import tqdm
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
split_idx = dataset.get_idx_split()
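
For context, ogbg-molhiv is a molecular property-prediction benchmark with one binary task per graph ( does the molecule inhibit HIV replication? ); per the OGB documentation it has 41,127 graphs:

print(len(dataset))       # 41127
print(dataset.num_tasks)  # 1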


(2) Create Data Loaders

train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, 
                          shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, 
                          shuffle=False, num_workers=0)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, 
                         shuffle=False, num_workers=0)
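
Each batch from these loaders is one large PyG graph containing up to 32 molecules; its batch vector ( used later for pooling ) maps each node to the molecule it belongs to. A quick illustrative check:

batch = next(iter(valid_loader))
print(batch.num_graphs)  # 32
print(batch.batch)       # e.g. tensor([ 0,  0,  0,  ..., 31, 31, 31])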


(3) Arguments

args = {
    'device': device,
    'num_layers': 5,
    'hidden_dim': 256,
    'dropout': 0.5,
    'lr': 0.001,
    'epochs': 30,
}


(4) Graph Prediction Model

from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import global_add_pool, global_mean_pool
class GCN_Graph(torch.nn.Module):
    def __init__(self, hidden_dim, output_dim, num_layers, dropout):
        super(GCN_Graph, self).__init__()
        self.node_encoder = AtomEncoder(hidden_dim)
        self.gnn_node = GCN(hidden_dim, hidden_dim,
            hidden_dim, num_layers, dropout, return_embeds=True)
        self.pool = global_mean_pool
        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def reset_parameters(self):
        self.gnn_node.reset_parameters()
        self.linear.reset_parameters()

    def forward(self, batched_data):
        x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch

        # (1) embed the raw node (atom) features into hidden_dim
        embed = self.node_encoder(x)

        # (2) pass the embedded nodes through the GCN
        embed = self.gnn_node(embed, edge_index)

        # (3) pooling ( global mean pooling )
        features = self.pool(embed, batch)

        # (4) linear layer
        out = self.linear(features)

        return out
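
A toy illustration of global_mean_pool ( already imported above ), which averages node embeddings per graph according to the batch vector:

x = torch.tensor([[1.], [3.], [10.]])
batch = torch.tensor([0, 0, 1])    # nodes 0-1 belong to graph 0, node 2 to graph 1
print(global_mean_pool(x, batch))  # tensor([[ 2.], [10.]])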


(5) Train function

def train(model, device, data_loader, optimizer, loss_fn):
    # (1) train mode
    model.train()

    # (2) iterate over mini-batches
    for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
        batch = batch.to(device)

        # skip trivial batches ( a single node, or a single graph )
        if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
            continue

        # NaN labels mark unlabeled entries ( NaN != NaN )
        is_labeled = batch.y == batch.y

        # (3) reset gradients
        optimizer.zero_grad()

        # (4) compute output
        y_hat = model(batch)

        # (5) compute loss ( on labeled entries only )
        loss = loss_fn(y_hat[is_labeled], batch.y[is_labeled].float())

        # (6) backpropagation
        loss.backward()
        optimizer.step()

    return loss.item()
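
The is_labeled mask works because NaN is the one value that does not compare equal to itself, so unlabeled ( NaN ) entries come out False:

y = torch.tensor([0., float('nan'), 1.])
print(y == y)  # tensor([ True, False,  True])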


(6) Evaluation 함수

def eval(model, device, loader, evaluator):
    model.eval()
    y_true_list = []
    y_pred_list = []

    for step, batch in enumerate(tqdm(loader, desc="Iteration")):
        batch = batch.to(device)

        if batch.x.shape[0] == 1:
            pass
        else:
            with torch.no_grad():
                y_hat = model(batch)

            y_true_list.append(batch.y.view(y_hat.shape).detach().cpu())
            y_pred_list.append(y_hat.detach().cpu())

    y_true_list = torch.cat(y_true_list, dim = 0).numpy()
    y_pred_list = torch.cat(y_pred_list, dim = 0).numpy()

    input_dict = {"y_true": y_true_list, "y_pred": y_pred_list}
    eval_result = evaluator.eval(input_dict)
    return eval_result


(7) Model

model = GCN_Graph(args['hidden_dim'],
            dataset.num_tasks, args['num_layers'],
            args['dropout']).to(device)

evaluator = Evaluator(name='ogbg-molhiv')
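
The notes end here, but training follows the same pattern as the node-level task. A minimal sketch under two assumptions: BCEWithLogitsLoss as the loss ( the linear head outputs raw logits ) and ROC-AUC, the ogbg-molhiv metric, for model selection:

import copy

model.reset_parameters()
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = torch.nn.BCEWithLogitsLoss()

best_model, best_valid_rocauc = None, 0
for epoch in range(1, 1 + args['epochs']):
    loss = train(model, device, train_loader, optimizer, loss_fn)
    valid_rocauc = eval(model, device, valid_loader, evaluator)['rocauc']  # assumes the 'rocauc' key
    if valid_rocauc > best_valid_rocauc:
        best_valid_rocauc = valid_rocauc
        best_model = copy.deepcopy(model)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Valid ROC-AUC: {valid_rocauc:.4f}')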
