LINE implementation with Negative Sampling (1st order proximity)

  • 1st order proximity를 이용해서 LINE을 구현해볼 것이다.

png

1. Import Dataset & Libraries

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from random import shuffle
from copy import copy

%matplotlib inline
edge = pd.read_csv('karate_club.edgelist', sep=' ', names=['x','y','w'])
graph = nx.Graph()
for i in range(edge.shape[0]):
    graph.add_node(node_for_adding = edge['x'][i])
    graph.add_node(node_for_adding = edge['y'][i])
    graph.add_edge(edge['x'][i], edge['y'][i])

1) adjacent matrix & one-hot encoded matrix

A = nx.to_numpy_matrix(graph, nodelist=sorted(graph.nodes()))
OH = np.identity(A.shape[0])
OH = np.identity(A.shape[0])

2) edges

  • only needed in case of “different weights among edges”
edges = edge[['x','y']]
edges['w'] = 1
edges = np.array(edges)



2. Define Functions

1) first_prox

  • 2개 node의 vector를 input하면 first-order proximity 계산
def first_prox(node_vec1,node_vec2):
    mul = np.dot(node_vec1.T,node_vec2)
    return 1/(1+np.exp(-mul))



3. First-order Proximity with Negative Sampling

def FirstOrder_neg_sampling(num_neg,epoch,lr):
    low_dim = np.random.rand(OH.shape[0],2)
    
    for _ in range(epoch):
        embed_vec =np.matmul(OH,low_dim)        
        
        for node in range(embed_vec.shape[0]):
            ######################### find Neighbours & Non-Neighbours ##########################
            
            neighbours_index = np.nonzero(A[node])[1]
            not_neighbours_index = np.setdiff1d(np.arange(embed_vec.shape[0]), np.nonzero(A[node])[1])
            sampled_not_neighbours_index = np.random.choice(not_neighbours_index,num_neg)
            
            ########## find vectors of (1) main node, (2) neighbours, (3) non-neighbours ##########
            
            node_vec = embed_vec[node]  
            
            for positive in neighbours_index:
                pos_vec = embed_vec[positive]
                neg_vecs = embed_vec[sampled_not_neighbours_index]
                
            ############################# UPDATE ###############################################
                embed_vec[node] -= lr*(first_prox(node_vec,pos_vec)-1)*pos_vec
        
                for neg_vec in neg_vecs:
                    embed_vec[node] -= lr*(first_prox(node_vec,neg_vec))*neg_vec
                    
    return embed_vec
Emb = FirstOrder_neg_sampling(num_neg=10,epoch=100,lr=0.1)



4. Visualization

Emb_df = pd.DataFrame({'X':Emb[:,0], 'Y':Emb[:,1],'Label':range(1,35)})

blue = [1,2,3,4,5,6,7,8,9,11,12,13,14,17,18,20,22]
red = list(set(range(0,35))-set(blue))

Emb_df.loc[Emb_df.Label.isin(blue),'Color']=1
Emb_df.loc[Emb_df.Label.isin(red),'Color']=0
plt.scatter(Emb_df['X'], Emb_df['Y'], c=Emb_df['Color'])

for i,txt in enumerate(Emb_df['Label']):
    plt.annotate(txt, (Emb_df['X'][i], Emb_df['Y'][i]))

png