import csv
import sys

def _collect_counts(rows, headers, category_field):
	"""Tally entities per category, per field and per field value.

	Args:
		rows: Iterable of CSV rows (lists of strings) without the header row.
		headers: Column names; position i names cell i of every row.
		category_field: Name of the column holding the category of a row.

	Returns:
		A tuple (entity_count, category_counts, field_counts, value_counts):
		entity_count is the number of non-blank rows,
		category_counts maps category -> number of rows,
		field_counts maps category -> field -> number of non-empty cells,
		value_counts maps category -> field -> value -> occurrences.
		All dicts keep first-seen order.

	Raises:
		ValueError: If category_field is not one of the headers.
		IndexError: If a non-blank row is shorter than a column it must supply.
	"""
	# Column positions do not change between rows, so resolve them once.
	category_idx = headers.index(category_field)
	ignored_fields = {"id", "entityid", category_field, "start", "end"}
	tracked_columns = [
		(idx, field) for (idx, field) in enumerate(headers)
		if field not in ignored_fields
	]

	entity_count = 0
	category_counts = {}
	field_counts = {}
	value_counts = {}
	for row in rows:
		if not row:
			# csv.reader yields [] for blank lines; they describe no entity.
			continue
		# Level 0: Entity
		entity_count += 1
		# Level 1: Category
		category = row[category_idx]
		category_counts[category] = category_counts.get(category, 0) + 1
		fields_of_category = field_counts.setdefault(category, {})
		values_of_category = value_counts.setdefault(category, {})
		for (idx, field) in tracked_columns:
			value = row[idx]
			if value == "":
				# Empty cell: the entity does not have this property.
				continue
			# Level 2: Field
			fields_of_category[field] = fields_of_category.get(field, 0) + 1
			# Level 3: Value
			values_of_field = values_of_category.setdefault(field, {})
			values_of_field[value] = values_of_field.get(value, 0) + 1
	return entity_count, category_counts, field_counts, value_counts


def _print_report(typ, skip_small, entity_count, category_counts, field_counts, value_counts):
	"""Print the statistics gathered by _collect_counts to standard output."""
	print("The dataset contains %d %s" % (entity_count, typ))
	skip_count = 0
	for category, category_total in category_counts.items():
		# Skip categories that make up less than 0.1% of all entities.
		if skip_small and category_total * 1000 < entity_count:
			skip_count += 1
			continue
		print("  %d out of %d %s are of type \"%s\" (%.2f%%)"
			% (category_total, entity_count, typ, category,
				category_total * 100 / entity_count))
		for field, field_total in field_counts[category].items():
			print("    %d out of %d of those %s have the property \"%s\" (%.2f%%)"
				% (field_total, category_total, typ, field,
					field_total * 100 / category_total))
			values_of_field = value_counts[category][field]
			# Break a field down by value only when it has fewer than 10
			# distinct values and is set on more than 10 entities.
			if len(values_of_field) < 10 and field_total > 10:
				for value, value_total in values_of_field.items():
					print("      %d out of %d of those property contain the value \"%s\" (%.2f%%)"
						% (value_total, field_total, value,
							value_total * 100 / field_total))
	if skip_small:
		print("%d types skipped because they contributed less than 0.1%% to the total number of %s" % (skip_count, typ))


def analyze(path, dataset, typ, category_field, skip_small = False):
	"""Print category, field and value statistics for one preprocessed CSV file.

	The file read is ``path + dataset + "_" + typ + ".csv"`` (latin-1 encoded,
	comma separated, first row is the header). Every non-blank row counts as
	one entity. Entities are grouped by the value of the category_field column;
	within each category the non-empty cells of every column except "id",
	"entityid", "start", "end" and the category column itself are counted.

	Args:
		path: Directory prefix, concatenated as-is (so it must end with a
			path separator).
		dataset: Dataset name, first part of the file name.
		typ: Entity kind, second part of the file name; also used in the
			printed text.
		category_field: Header name of the column that holds the category.
		skip_small: If true, categories with less than 0.1% of all entities
			are left out of the report and only their number is printed.

	Raises:
		ValueError: If category_field is not a column of the file.
		StopIteration: If the file has no header row at all.
	"""
	# Read preprocessed files; the file is closed as soon as counting is done.
	with open(path + dataset + "_" + typ + ".csv", newline='', encoding='latin-1') as csvfile:
		csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
		headers = next(csvreader)
		counts = _collect_counts(csvreader, headers, category_field)

	# Print results
	_print_report(typ, skip_small, *counts)
					
if __name__ == '__main__':
	# Command line: analyze_dataset_text.py <datasetname> [skip_small]
	args = sys.argv
	if len(args) < 2:
		print("Usage: analyze_dataset_text.py <datasetname> [skip_small]")
		# Stop here: without a dataset name the args[1] lookup below would
		# fail with an IndexError.
		sys.exit(1)
	dataset = args[1]
	# Small categories are skipped only when the flag is exactly "true" or "True".
	skip_small = (len(args) > 2 and args[2] in ["true","True"])
	# Both CSV files are read from the same raw directory of the dataset.
	raw_dir = "GraphGymPyG/datasets/"+dataset+"/raw/"
	analyze(raw_dir, dataset, "vertices", "labels", skip_small)
	print()
	analyze(raw_dir, dataset, "edges", "type", skip_small)
			
