from preprocess_dataset import preprocess
import csv
import sys


def preprocess_makg(path_in, filename_in, path_out, filename_out, id_field):
	"""Run the default preprocessing, then add a "field" column to the vertex file.

	After ``preprocess`` has run, the vertex file
	``path_out + filename_out + "_vertices.csv"`` and the edge file
	``path_out + filename_out + "_edges.csv"`` are read back (both are
	expected to exist at that point). For every edge whose ``type`` is
	``"HASDISCIPLINE"``, the ``name`` of the edge's ``end`` vertex is stored
	in the ``field`` property of the edge's ``start`` vertex. Vertices without
	such an edge keep an empty ``field``. The vertex file is then rewritten in
	place with ``field`` as the first column, rows ordered by ascending id.

	Args:
		path_in: Passed through unchanged to ``preprocess``.
		filename_in: Passed through unchanged to ``preprocess``.
		path_out: Prefix of the output files. It is concatenated directly with
			the file names, so it must already end with a path separator.
		filename_out: Base name of the ``_vertices.csv`` / ``_edges.csv`` files.
		id_field: Passed through unchanged to ``preprocess``.

	Raises:
		KeyError: If the vertex file has no ``id`` column, the edge file lacks
			``type``/``start``/``end``, or a ``HASDISCIPLINE`` edge refers to a
			vertex id (or a ``name`` column) that is not in the vertex file.
		IndexError: If a data row has fewer cells than the header.
		ValueError: If an id cell is not an integer.
	"""
	# Run default preprocessing
	preprocess(path_in, filename_in, path_out, filename_out, id_field)

	# In addition to the default-preprocessing, we need to create "field of study" fields.
	# Plain concatenation is kept on purpose so the paths are built exactly as before.
	v_path = path_out + filename_out + "_vertices.csv"
	e_path = path_out + filename_out + "_edges.csv"

	# Read vertices
	vertices = {}
	with open(v_path, newline='', encoding='latin-1') as csvfile:
		csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
		headers = next(csvreader)
		for row in csvreader:
			vertex = {field: row[idx] for (idx, field) in enumerate(headers)}
			vertex["field"] = ""
			vertices[int(vertex["id"])] = vertex

	# Output columns: "field" first, then every header column not seen yet.
	# This depends only on the header, so it is computed once instead of per row.
	vertex_fields = ["field"]
	for field in headers:
		if field not in vertex_fields:
			vertex_fields.append(field)

	# Read edges, extract "hasdiscipline"-edges and add a property "field" to their start vertices
	with open(e_path, newline='', encoding='latin-1') as csvfile:
		csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
		headers = next(csvreader)
		for row in csvreader:
			edge = {field: row[idx] for (idx, field) in enumerate(headers)}
			if edge["type"] == "HASDISCIPLINE":
				vertices[int(edge["start"])]["field"] = vertices[int(edge["end"])]["name"]

	# Write enhanced vertex file.
	# newline='' is what the csv module requires for files it writes to, and
	# latin-1 mirrors the decoding used above so the text round-trips unchanged.
	with open(v_path, "w", newline='', encoding='latin-1') as out_file:
		writer = csv.writer(out_file)
		writer.writerow(vertex_fields)
		# Iterate over the ids that actually exist; for ids 0..n-1 this is the
		# same order as counting up, but gaps no longer raise KeyError.
		for vertex_id in sorted(vertices):
			vertex = vertices[vertex_id]
			writer.writerow([vertex.get(field, "") for field in vertex_fields])

if __name__ == '__main__':
	# Script entry point: expects the dataset name as the first command-line argument.
	if len(sys.argv) < 2:
		print("Usage: preprocess_makg_dataset.py <datasetname>")
		# Stop here; continuing would fail with an IndexError on sys.argv[1].
		sys.exit(1)

	dataset = sys.argv[1]
	id_field = "entityid"
	path_in = "raw_data/"
	# The dataset name is used both as a directory name and as the file base name.
	path_out = "GraphGymPyG/datasets/%s/raw/" % dataset
	preprocess_makg(path_in, dataset, path_out, dataset, id_field)

