In [1]:
from companykg import CompanyKG

Initialize CompanyKG¶

In [2]:
DATA_ROOT_FOLDER = "./data"

# This may take a long time if the data files have not yet been downloaded.
comkg = CompanyKG(nodes_feature_type="msbert", load_edges_weights=True, data_root_folder=DATA_ROOT_FOLDER)
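On the first run the constructor downloads the data files into DATA_ROOT_FOLDER. A quick, minimal way to see what ended up there, using only the standard library (nothing CompanyKG-specific):

from pathlib import Path

# List the files now present in the data folder
sorted(p.name for p in Path(DATA_ROOT_FOLDER).iterdir())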
In [3]:
# Obtain the list of node IDs using nodes_id
len(comkg.nodes_id)
Out[3]:
1169931
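The node IDs are ordered consistently with the feature and edge tensors inspected below. A minimal sketch of a reverse lookup from node ID to row index (id_to_row is a hypothetical helper, not part of the CompanyKG API):

# Map each node ID to its row position (hypothetical helper, not part of CompanyKG)
id_to_row = {node_id: row for row, node_id in enumerate(comkg.nodes_id)}
id_to_row[comkg.nodes_id[0]]  # row index of the first node, i.e. 0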

Inspect CompanyKG¶

In [4]:
# Show key information about the loaded data
comkg.describe()
data_root_folder=./data
n_nodes=1169931, n_edges=50815503
nodes_feature_type=msbert
nodes_feature_dimension=512
edges_weight_dimension=15
sp: 3219 samples
sr: 1856 samples
cr: 400 samples
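These figures can be cross-checked against the shapes of the tensors inspected in the next few cells; the expected shapes below follow from the outputs shown in this notebook:

# Cross-check the reported sizes against the tensor shapes
print(comkg.nodes_feature.shape)   # expected: (n_nodes, nodes_feature_dimension)
print(comkg.edges.shape)           # expected: (n_edges, 2)
print(comkg.edges_weight.shape)    # expected: (n_edges, edges_weight_dimension)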
In [5]:
# Node features, ordered by comkg.nodes_id
comkg.nodes_feature
Out[5]:
tensor([[ 0.0642,  0.0100, -0.0427,  ...,  0.0580,  0.0156, -0.0645],
        [ 0.0563, -0.0086, -0.0114,  ..., -0.0381, -0.0020, -0.1615],
        [ 0.0668, -0.0803,  0.0296,  ...,  0.0003,  0.0872, -0.0662],
        ...,
        [-0.0072,  0.0593, -0.0401,  ...,  0.0861,  0.0681, -0.0302],
        [-0.0445, -0.1243,  0.0048,  ...,  0.0149, -0.0466, -0.0800],
        [-0.0396, -0.0113, -0.0121,  ..., -0.0240, -0.0342, -0.0039]])
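Each row is a 512-dimensional msbert embedding, so two companies can be compared directly with cosine similarity. A minimal sketch (rows 0 and 1 are arbitrary examples):

import torch.nn.functional as F

# Cosine similarity between two node embeddings (rows 0 and 1 chosen arbitrarily)
vec_a = comkg.nodes_feature[0].unsqueeze(0)
vec_b = comkg.nodes_feature[1].unsqueeze(0)
F.cosine_similarity(vec_a, vec_b).item()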
In [6]:
# Edges
comkg.edges
Out[6]:
tensor([[ 113091,  412357],
        [ 560244, 1164306],
        [ 388246, 1121544],
        ...,
        [  84160,  837013],
        [ 179090,  917143],
        [ 179090,  226260]])
In [7]:
# Edge weights
comkg.edges_weight
Out[7]:
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.7918],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.7918],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.7918],
        ...,
        [1.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])
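A minimal sketch that derives a simple per-edge signal and per-node degrees, assuming comkg.edges_weight is aligned row-for-row with comkg.edges, that each of the 15 columns corresponds to one weight type, and that edge endpoints index into the node arrays:

import torch

# Number of non-zero weight components per edge (assumption: one column per weight type)
active_types = (comkg.edges_weight > 0).sum(dim=1)
print(active_types[:5])

# Node degrees, treating edge endpoints as row indices (assumption)
degrees = torch.bincount(comkg.edges.flatten(), minlength=len(comkg.nodes_id))
print(degrees[:5])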

Inspect Evaluation Data¶

In [8]:
# SP (Similarity Prediction) samples
comkg.eval_tasks['sp'][1]
Out[8]:
node_id0 node_id1 label
0 121769 136631 0
1 151107 336852 0
2 333601 363822 0
3 2481 419571 0
4 37253 437144 0
... ... ... ...
3214 361222 883235 1
3215 536154 883235 1
3216 330058 1088487 1
3217 494594 1088487 1
3218 1624 1088487 1

3219 rows × 3 columns
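Each SP row pairs two node IDs with a binary similarity label. The sketch below scores such pairs with cosine similarity and ROC-AUC; it illustrates the task rather than reproducing how comkg.evaluate() computes its SP score, and it assumes that node IDs can be used directly as row indices into comkg.nodes_feature and that scikit-learn is installed:

import torch.nn.functional as F
from sklearn.metrics import roc_auc_score  # assumes scikit-learn is available

sp_df = comkg.eval_tasks["sp"][1]

# Assumption: node IDs index rows of comkg.nodes_feature directly
emb0 = comkg.nodes_feature[sp_df["node_id0"].to_numpy()]
emb1 = comkg.nodes_feature[sp_df["node_id1"].to_numpy()]
scores = F.cosine_similarity(emb0, emb1).numpy()

roc_auc_score(sp_df["label"].to_numpy(), scores)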

In [9]:
# SR (Similarity Ranking) samples
comkg.eval_tasks['sr'][1]
Out[9]:
target_node_id candidate0_node_id candidate1_node_id label split
0 201389 198435 797652 0 test
1 450703 618486 624384 0 test
2 1097415 297978 386584 0 validation
3 81000 244410 1016534 0 test
4 861572 1155658 1115208 0 test
... ... ... ... ... ...
1851 522257 669089 607981 1 test
1852 1083662 203070 482478 1 validation
1853 354276 551887 865995 0 test
1854 830672 504882 1046882 1 test
1855 804707 991173 91744 1 test

1856 rows × 5 columns
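Each SR row asks which of two candidate companies is more similar to the target, with a split column marking validation and test samples. A minimal sketch that ranks by cosine similarity, under the same node-ID-as-row-index assumption plus the additional assumption that label 1 means candidate1 is the closer candidate:

import torch.nn.functional as F

sr_df = comkg.eval_tasks["sr"][1]
sr_test = sr_df[sr_df["split"] == "test"]

target = comkg.nodes_feature[sr_test["target_node_id"].to_numpy()]
cand0 = comkg.nodes_feature[sr_test["candidate0_node_id"].to_numpy()]
cand1 = comkg.nodes_feature[sr_test["candidate1_node_id"].to_numpy()]

# Assumption: label 1 means candidate1 is more similar to the target than candidate0
pred = (F.cosine_similarity(target, cand1) > F.cosine_similarity(target, cand0)).long().numpy()
(pred == sr_test["label"].to_numpy()).mean()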

In [10]:
# CR (Competitor Retrieval) samples
comkg.eval_tasks['cr'][1]
Out[10]:
target_node_id competitor_node_id
0 3843 34994
1 3843 263332
2 3843 1034500
3 4981 45823
4 4981 288480
... ... ...
395 1144634 1004440
396 1144634 1077443
397 1163522 172921
398 1163522 268689
399 1163522 1149354

400 rows × 2 columns
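Each CR row pairs a target company with one of its known competitors; the task is to retrieve competitors among the target's nearest neighbours. A minimal sketch of a top-K hit rate over all pairs using cosine similarity (illustrative only: K=50 is an example, the node-ID-as-row-index assumption applies again, and scanning every node per query is slow but keeps the sketch simple):

import torch
import torch.nn.functional as F

cr_df = comkg.eval_tasks["cr"][1]
K = 50

feats = F.normalize(comkg.nodes_feature, dim=1)
hits = 0
for target_id, competitor_id in zip(cr_df["target_node_id"], cr_df["competitor_node_id"]):
    sims = feats @ feats[target_id]            # cosine similarity of the target to every node
    topk = torch.topk(sims, K + 1).indices     # K + 1 because the target itself ranks first
    hits += int(competitor_id in topk.tolist())

hits / len(cr_df)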

Evaluate Node Feature¶

In [11]:
# Run all evaluation tasks on the loaded node features
eval_results = comkg.evaluate()
Evaluate Node Features msbert:
Evaluate SP ...
SP AUC: 0.8059550091101482
Evaluate SR ...
SR Validation ACC: 0.6956521739130435 SR Test ACC: 0.6713709677419355
Evaluate CR with top-K hit rate (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]) ...
CR Hit Rates: [0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
In [12]:
# Show AUC score for SP task
eval_results["sp_auc"]
Out[12]:
0.8059550091101482
In [13]:
# Show test accuracy for SR task
eval_results["sr_test_acc"]
Out[13]:
0.6713709677419355
In [14]:
# Show validation accuracy for SR task
eval_results["sr_validation_acc"]
Out[14]:
0.6956521739130435
In [15]:
# Show Top-K Hit Rate for CR task
eval_results["cr_topk_hit_rate"]
Out[15]:
[0.12955922001974632,
 0.18240535049745576,
 0.23030967570441258,
 0.31102329687856,
 0.4143004291030607,
 0.47711466165413524,
 0.5583993126756285,
 0.6349049707602339]

Evaluate Saved Embedding¶

In [16]:
# Run all evaluation tasks on embeddings saved to disk in torch.Tensor format

EMBEDDINGS_FILE = "./data/nodes_feature_msbert.pt"

eval_results = comkg.evaluate(embeddings_file=EMBEDDINGS_FILE)
Evaluate Node Embeddings ./data/nodes_feature_msbert.pt:
Evaluate SP ...
SP AUC: 0.8059550091101482
Evaluate SR ...
SR Validation ACC: 0.6956521739130435 SR Test ACC: 0.6713709677419355
Evaluate CR with top-K hit rate (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]) ...
CR Hit Rates: [0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
In [17]:
# Show AUC score for SP task
eval_results["sp_auc"]
Out[17]:
0.8059550091101482
In [18]:
# Show test accuracy for SR task
eval_results["sr_test_acc"]
Out[18]:
0.6713709677419355
In [19]:
# Show validation accuracy for SR task
eval_results["sr_validation_acc"]
Out[19]:
0.6956521739130435
In [20]:
# Show Top-K Hit Rate for CR task
eval_results["cr_topk_hit_rate"]
Out[20]:
[0.12955922001974632,
 0.18240535049745576,
 0.23030967570441258,
 0.31102329687856,
 0.4143004291030607,
 0.47711466165413524,
 0.5583993126756285,
 0.6349049707602339]
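Any embedding matrix with one row per node, ordered like comkg.nodes_id, can be evaluated the same way once saved as a torch.Tensor. A minimal sketch using random embeddings as a stand-in (the dimensionality and file path are arbitrary examples, and it assumes evaluate() accepts any embedding dimension):

import torch

# Stand-in embeddings: one random 64-dimensional vector per node (illustration only)
my_embeddings = torch.randn(len(comkg.nodes_id), 64)
torch.save(my_embeddings, "./data/my_embeddings.pt")  # hypothetical file path

eval_results = comkg.evaluate(embeddings_file="./data/my_embeddings.pt")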

Create DGL Graph¶

In [21]:
# This takes about 15 minutes; the resulting graph is saved to work_folder
g = comkg.get_dgl_graph(work_folder="./experiments")
g
Out[21]:
[Graph(num_nodes=1169931, num_edges=50815503,
       ndata_schemes={'feat': Scheme(shape=(512,), dtype=torch.float32)}
       edata_schemes={'weight': Scheme(shape=(15,), dtype=torch.float32)})]
In [22]:
# When the same function is called again, the graph is loaded directly from file
g = comkg.get_dgl_graph(work_folder="./experiments")
g
Out[22]:
[Graph(num_nodes=1169931, num_edges=50815503,
       ndata_schemes={'feat': Scheme(shape=(512,), dtype=torch.float32)}
       edata_schemes={'weight': Scheme(shape=(15,), dtype=torch.float32)})]
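The returned object is a list holding a single DGL graph whose node features and edge weights mirror the tensors loaded earlier. A minimal sketch of accessing them with standard DGL attributes:

dgl_graph = g[0]

print(dgl_graph.num_nodes(), dgl_graph.num_edges())
print(dgl_graph.ndata["feat"].shape)     # per-node feature vectors
print(dgl_graph.edata["weight"].shape)   # per-edge 15-dimensional weight vectors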

Create iGraph¶

In [24]:
g = comkg.to_igraph()
g
Out[24]:
<igraph.Graph at 0x7fa0d7ec9e40>
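The igraph object can be fed straight into python-igraph's algorithms. A minimal sketch using standard python-igraph calls:

print(g.vcount(), g.ecount())   # number of vertices and edges
print(g.degree(0))              # degree of vertex 0
print(g.neighbors(0)[:5])       # a few neighbours of vertex 0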