# File: preprocess.py
# Description: This script generates input graph files and stream files for experiments.

import os
import networkx as nx
import sgtl.random


def _is_edge_line(line):
    """Return True if *line* starts with two integer tokens (an ``u v`` edge)."""
    tokens = line.split()
    if len(tokens) < 2:
        return False
    try:
        int(tokens[0])
        int(tokens[1])
    except ValueError:
        return False
    return True


def generate_streams(input_graph_file, streaming_file):
    """
    Generate a stream file from an input graph file.

    The output file starts with the number of nodes, followed by one line
    ``u v +`` or ``u v -`` for every pair ``0 <= u < v < n``, encoding whether
    the edge is positive (+, present in the graph) or negative (-, absent).

    If the first line of the input is not a single integer, the input is
    treated as a raw edge list (space- or comma-separated, optionally preceded
    by a header line). It is normalised to a space-separated edge list that is
    written to ``edges.txt`` in the same directory as the input, and the graph
    is read from that file. Otherwise the input file is read directly.

    Pairs are enumerated over ``range(number_of_nodes)``, so node labels are
    assumed to be exactly ``0 .. n-1``.

    Args:
        input_graph_file: Path of the edge list to read.
        streaming_file: Path of the stream file to write.
    """
    # Read the whole input up front: the normalised copy may have the very
    # same path as the input (e.g. ".../edges.txt"), and opening that path
    # for writing while still reading from it would truncate the data.
    with open(input_graph_file, "r") as f:
        raw_lines = f.readlines()

    first_line = raw_lines[0].strip() if raw_lines else ""
    if not first_line.isdigit():
        normalized = [line.strip().replace(",", " ") for line in raw_lines]
        # Only discard the first line when it is a header (e.g. "node_1,node_2");
        # if it already is an edge it must be kept.
        if normalized and not _is_edge_line(normalized[0]):
            normalized = normalized[1:]

        graph_file = os.path.join(os.path.dirname(input_graph_file), "edges.txt")
        with open(graph_file, "w") as f1:
            f1.writelines(line + "\n" for line in normalized)
        graph = nx.read_edgelist(graph_file, nodetype=int, create_using=nx.Graph())
        print(f"Number of nodes: {graph.number_of_nodes()}")
    else:
        graph = nx.read_edgelist(input_graph_file, nodetype=int, create_using=nx.Graph())

    num_nodes = graph.number_of_nodes()
    with open(streaming_file, "w") as f:
        f.write(f"{num_nodes}\n")
        for u in range(num_nodes):
            for v in range(u + 1, num_nodes):
                sign = "+" if graph.has_edge(u, v) else "-"
                f.write(f"{u} {v} {sign}\n")


def generate_graph_from_SBM(num_nodes, p, edge_file):
    """
    Generate a graph from the SBM model and save as edge list.

    The graph has two clusters and is sampled with ``sgtl.random.sbm`` using
    the 2x2 probability matrix ``[[p, 1 - p], [1 - p, p]]``. Only edges are
    written (one ``u v`` pair per line), so isolated nodes do not appear in
    the output file.

    Args:
        num_nodes: Total number of nodes, split over the two clusters.
        p: Diagonal entry of the probability matrix; the off-diagonal
            entries are ``1 - p``.
        edge_file: Path of the edge list file to write.
    """
    # Split as evenly as possible; for odd num_nodes the second cluster gets
    # the extra node so that the total is still exactly num_nodes.
    first_cluster = num_nodes // 2
    cluster_sizes = [first_cluster, num_nodes - first_cluster]
    prob_matrix = [[p, 1 - p], [1 - p, p]]

    graph = sgtl.random.sbm(cluster_sizes, prob_matrix).to_networkx()

    with open(edge_file, "w") as f:
        f.writelines(f"{u} {v}\n" for u, v in graph.edges())


def generate_synthetic_datasets(data_dir, num_nodes_list, p_list):
    """
    Build every SBM dataset of the (num_nodes, p) grid together with its stream.

    For each combination a folder ``<data_dir>/nodes_<num_nodes>/prob_<p>`` is
    created (if missing) and filled with ``edges.txt`` (the sampled graph) and
    ``streaming.txt`` (the stream derived from it).

    Args:
        data_dir: Root directory that receives the dataset folders.
        num_nodes_list: Graph sizes to generate.
        p_list: Probability values to generate for each size.
    """
    for size in num_nodes_list:
        for prob in p_list:
            target_dir = os.path.join(data_dir, f"nodes_{size}/prob_{prob}")
            os.makedirs(target_dir, exist_ok=True)

            edges_path = f"{target_dir}/edges.txt"
            stream_path = f"{target_dir}/streaming.txt"

            # The stream is derived from the edge list, so sample the graph first.
            generate_graph_from_SBM(size, prob, edges_path)
            generate_streams(edges_path, stream_path)


def relabel_subgraph(raw_edge_file, relabeled_edge_file):
    """
    Rewrite an edge list so that its node ids become 0, 1, 2, ...

    Nodes are numbered in the order in which the graph read from
    ``raw_edge_file`` iterates over them; the relabeled edges are written to
    ``relabeled_edge_file`` as one ``u v`` pair per line.

    Args:
        raw_edge_file: Path of the edge list with the original integer ids.
        relabeled_edge_file: Path of the edge list file to write.
    """
    original = nx.read_edgelist(raw_edge_file, nodetype=int, create_using=nx.Graph())

    # Old id -> consecutive new id, following the graph's node order.
    new_ids = dict(zip(original.nodes(), range(original.number_of_nodes())))
    compact = nx.relabel_nodes(original, new_ids)

    rows = [f"{a} {b}\n" for a, b in compact.edges()]
    with open(relabeled_edge_file, "w") as out:
        out.writelines(rows)


def generate_facebook_datasets(data_dir, datasets):
    """
    Relabel each Facebook dataset and produce its stream file.

    For a dataset named ``facebook<suffix>`` the raw edge list is read from
    ``<data_dir>/<dataset>/<suffix>.edges``. The relabeled edge list is written
    to ``edges.txt`` and the stream to ``streaming.txt`` in the same folder.

    Args:
        data_dir: Directory containing one sub-folder per dataset.
        datasets: Dataset names; each is expected to start with "facebook".
    """
    prefix_length = len("facebook")
    for name in datasets:
        # Whatever follows the "facebook" prefix is the raw file's base name.
        suffix = name[prefix_length:]
        raw_edges = "/".join([data_dir, name, suffix + ".edges"])
        relabeled_edges = os.path.join(data_dir, name, "edges.txt")
        stream_path = "/".join([data_dir, name, "streaming.txt"])

        relabel_subgraph(raw_edges, relabeled_edges)
        generate_streams(relabeled_edges, stream_path)


if __name__ == "__main__":
    # Synthetic SBM datasets: one dataset per (size, probability) combination.
    sbm_sizes = [100, 500, 1000, 1500, 2000, 2500]
    sbm_probabilities = [0.95, 0.9, 0.8, 0.7]
    generate_synthetic_datasets("../data/sbm", sbm_sizes, sbm_probabilities)

    # Facebook datasets: relabel each raw edge file, then build its stream.
    facebook_names = ["facebook0", "facebook414", "facebook3980"]
    generate_facebook_datasets("../data/facebook", facebook_names)

    # EmailCore and LastFM datasets: (raw edge file, stream file) pairs.
    raw_to_stream = (
        ("../data/emailcore/email-Eu-core.txt", "../data/emailcore/streaming.txt"),
        ("../data/lastfm/lastfm_asia_edges.csv", "../data/lastfm/streaming.txt"),
    )
    for raw_file, stream_file in raw_to_stream:
        generate_streams(raw_file, stream_file)