In [1]:
from companykg import CompanyKG
Initialize CompanyKG
In [2]:
# Root folder for the CompanyKG data files
DATA_ROOT_FOLDER = "./data"
# This may take a long time if the data files have not been downloaded yet.
comkg = CompanyKG(
    nodes_feature_type="msbert",
    load_edges_weights=True,
    data_root_folder=DATA_ROOT_FOLDER,
)
In [3]:
# nodes_id is the list of node IDs; its length is the number of nodes in the graph
len(comkg.nodes_id)
Out[3]:
1169931
Inspect CompanyKG
In [4]:
# Print a summary of the loaded data: data folder, node/edge counts,
# feature and edge-weight dimensions, and the sample count of each evaluation task (sp, sr, cr)
comkg.describe()
data_root_folder=./data n_nodes=1169931, n_edges=50815503 nodes_feature_type=msbert nodes_feature_dimension=512 edges_weight_dimension=15 sp: 3219 samples sr: 1856 samples cr: 400 samples
In [5]:
# Node feature tensor, with rows ordered by comkg.nodes_id
# (describe() reports nodes_feature_dimension=512 for the msbert features)
comkg.nodes_feature
Out[5]:
tensor([[ 0.0642, 0.0100, -0.0427, ..., 0.0580, 0.0156, -0.0645], [ 0.0563, -0.0086, -0.0114, ..., -0.0381, -0.0020, -0.1615], [ 0.0668, -0.0803, 0.0296, ..., 0.0003, 0.0872, -0.0662], ..., [-0.0072, 0.0593, -0.0401, ..., 0.0861, 0.0681, -0.0302], [-0.0445, -0.1243, 0.0048, ..., 0.0149, -0.0466, -0.0800], [-0.0396, -0.0113, -0.0121, ..., -0.0240, -0.0342, -0.0039]])
In [6]:
# Edge tensor: each row holds the two endpoints of one edge
# NOTE(review): the values look like node IDs (or indices into comkg.nodes_id) — confirm
comkg.edges
Out[6]:
tensor([[ 113091, 412357], [ 560244, 1164306], [ 388246, 1121544], ..., [ 84160, 837013], [ 179090, 917143], [ 179090, 226260]])
In [7]:
# Edge weight tensor (describe() reports edges_weight_dimension=15)
# NOTE(review): presumably row i holds the weights of row i in comkg.edges — confirm
comkg.edges_weight
Out[7]:
tensor([[0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.7918], [0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.7918], [0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.7918], ..., [1.0000, 0.0000, 1.0000, ..., 0.0000, 0.0000, 0.0000], [1.0000, 1.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000], [1.0000, 1.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]])
Inspect Evaluation Data
In [8]:
# SP task samples: each row is a pair of node IDs (node_id0, node_id1) with a label.
# Index [1] of the eval_tasks entry selects the samples table displayed below.
comkg.eval_tasks['sp'][1]
Out[8]:
node_id0 | node_id1 | label | |
---|---|---|---|
0 | 121769 | 136631 | 0 |
1 | 151107 | 336852 | 0 |
2 | 333601 | 363822 | 0 |
3 | 2481 | 419571 | 0 |
4 | 37253 | 437144 | 0 |
... | ... | ... | ... |
3214 | 361222 | 883235 | 1 |
3215 | 536154 | 883235 | 1 |
3216 | 330058 | 1088487 | 1 |
3217 | 494594 | 1088487 | 1 |
3218 | 1624 | 1088487 | 1 |
3219 rows × 3 columns
In [9]:
# SR task samples: each row has a target node, two candidate nodes, a label,
# and the split it belongs to (the rows shown are "test" or "validation").
# Index [1] of the eval_tasks entry selects the samples table displayed below.
comkg.eval_tasks['sr'][1]
Out[9]:
target_node_id | candidate0_node_id | candidate1_node_id | label | split | |
---|---|---|---|---|---|
0 | 201389 | 198435 | 797652 | 0 | test |
1 | 450703 | 618486 | 624384 | 0 | test |
2 | 1097415 | 297978 | 386584 | 0 | validation |
3 | 81000 | 244410 | 1016534 | 0 | test |
4 | 861572 | 1155658 | 1115208 | 0 | test |
... | ... | ... | ... | ... | ... |
1851 | 522257 | 669089 | 607981 | 1 | test |
1852 | 1083662 | 203070 | 482478 | 1 | validation |
1853 | 354276 | 551887 | 865995 | 0 | test |
1854 | 830672 | 504882 | 1046882 | 1 | test |
1855 | 804707 | 991173 | 91744 | 1 | test |
1856 rows × 5 columns
In [10]:
# CR task samples: each row pairs a target node with one competitor node
# (a target node can appear in several rows).
# Index [1] of the eval_tasks entry selects the samples table displayed below.
comkg.eval_tasks['cr'][1]
Out[10]:
target_node_id | competitor_node_id | |
---|---|---|
0 | 3843 | 34994 |
1 | 3843 | 263332 |
2 | 3843 | 1034500 |
3 | 4981 | 45823 |
4 | 4981 | 288480 |
... | ... | ... |
395 | 1144634 | 1004440 |
396 | 1144634 | 1077443 |
397 | 1163522 | 172921 |
398 | 1163522 | 268689 |
399 | 1163522 | 1149354 |
400 rows × 2 columns
Evaluate Node Feature
In [11]:
# Run all evaluation tasks (SP, SR, CR) on the loaded node features.
# The returned metrics are looked up by key in the following cells.
eval_results = comkg.evaluate()
Evaluate Node Features msbert: Evaluate SP ... SP AUC: 0.8059550091101482 Evaluate SR ... SR Validation ACC: 0.6956521739130435 SR Test ACC: 0.6713709677419355 Evaluate CR with top-K hit rate (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]) ... CR Hit Rates: [0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
In [12]:
# SP task: AUC score (the "SP AUC" value printed by evaluate())
eval_results["sp_auc"]
Out[12]:
0.8059550091101482
In [13]:
# SR task: accuracy on the test split (the "SR Test ACC" value printed by evaluate())
eval_results["sr_test_acc"]
Out[13]:
0.6713709677419355
In [14]:
# SR task: accuracy on the validation split (the "SR Validation ACC" value printed by evaluate())
eval_results["sr_validation_acc"]
Out[14]:
0.6956521739130435
In [15]:
# CR task: list of top-K hit rates, one per K value printed by evaluate()
# (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]; presumably in the same order — confirm)
eval_results["cr_topk_hit_rate"]
Out[15]:
[0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
Evaluate Saved Embedding
In [16]:
# Run all evaluation tasks on embeddings saved to disk in torch.Tensor format.
# The path is built from DATA_ROOT_FOLDER instead of repeating "./data", so it
# keeps pointing into the data folder if that constant is changed.
EMBEDDINGS_FILE = f"{DATA_ROOT_FOLDER}/nodes_feature_msbert.pt"
# Note: this rebinds eval_results, replacing the metrics from the previous evaluate() call.
eval_results = comkg.evaluate(embeddings_file=EMBEDDINGS_FILE)
Evaluate Node Embeddings ./data/nodes_feature_msbert.pt: Evaluate SP ... SP AUC: 0.8059550091101482 Evaluate SR ... SR Validation ACC: 0.6956521739130435 SR Test ACC: 0.6713709677419355 Evaluate CR with top-K hit rate (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]) ... CR Hit Rates: [0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
In [17]:
# SP task: AUC score for the embeddings loaded from EMBEDDINGS_FILE
eval_results["sp_auc"]
Out[17]:
0.8059550091101482
In [18]:
# SR task: test-split accuracy for the embeddings loaded from EMBEDDINGS_FILE
eval_results["sr_test_acc"]
Out[18]:
0.6713709677419355
In [19]:
# SR task: validation-split accuracy for the embeddings loaded from EMBEDDINGS_FILE
eval_results["sr_validation_acc"]
Out[19]:
0.6956521739130435
In [20]:
# CR task: list of top-K hit rates for the embeddings loaded from EMBEDDINGS_FILE,
# one per K value printed by evaluate()
eval_results["cr_topk_hit_rate"]
Out[20]:
[0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
Create DGL Graph
In [21]:
# Build the DGL graph. Takes about 15 mins; the graph will be saved to work_folder.
# Note: the value displayed below is a list containing one Graph, not a bare Graph.
g = comkg.get_dgl_graph(work_folder="./experiments")
g
Out[21]:
[Graph(num_nodes=1169931, num_edges=50815503, ndata_schemes={'feat': Scheme(shape=(512,), dtype=torch.float32)} edata_schemes={'weight': Scheme(shape=(15,), dtype=torch.float32)})]
In [22]:
# Calling the same function again loads the graph directly from the saved file
g = comkg.get_dgl_graph(work_folder="./experiments")
g
Out[22]:
[Graph(num_nodes=1169931, num_edges=50815503, ndata_schemes={'feat': Scheme(shape=(512,), dtype=torch.float32)} edata_schemes={'weight': Scheme(shape=(15,), dtype=torch.float32)})]
Create iGraph
In [24]:
# Convert CompanyKG to an igraph.Graph object
# NOTE(review): this rebinds `g`, which held the DGL graph list in the earlier cells;
# consider a distinct name so both objects stay available
g = comkg.to_igraph()
g
Out[24]:
<igraph.Graph at 0x7fa0d7ec9e40>