import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc

from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import tensorflow as tf
from sklearn.metrics import roc_auc_score, log_loss

import argparse

# Command-line interface: the single option is the seed that is later handed
# to `pipeline`.
parser = argparse.ArgumentParser(description="avazu_training")
parser.add_argument("--pipeline-seed", type=int, default=0)
args = parser.parse_args()

# Experiment identifier; it is also used to build the results directory and
# the output file names.
training_name = "xDeepFM_no_reg_10M_Adadelta_larger"
pipeline_seed = args.pipeline_seed

print("pipeline_seed: ", pipeline_seed)
print("10000000 samples", training_name)

# Column dtypes handed to `pd.read_csv`: `id` and `click` are read as
# integers, every other column as a string.
dtype = {
    'id': np.dtype(int),
    'click': np.dtype(int),
    # `np.str` was only an alias of the builtin `str` and no longer exists in
    # NumPy >= 1.24; spell it like the other string columns instead.
    'hour': np.dtype(str),
    'C1': np.dtype(str),
    'banner_pos': np.dtype(str),
    'site_id': np.dtype(str),
    'site_domain': np.dtype(str),
    'site_category': np.dtype(str),
    'app_id': np.dtype(str),
    'app_domain': np.dtype(str),
    'app_category': np.dtype(str),
    'device_id': np.dtype(str),
    'device_ip': np.dtype(str),
    'device_model': np.dtype(str),
    'device_type': np.dtype(str),
    'device_conn_type': np.dtype(str),
    'C14': np.dtype(str),
    'C15': np.dtype(str),
    'C16': np.dtype(str),
    'C17': np.dtype(str),
    'C18': np.dtype(str),
    'C19': np.dtype(str),
    'C20': np.dtype(str),
    'C21': np.dtype(str),
}
# Larger alternative settings (presumably the full dataset - TODO confirm):
# num_records = 40428967
# sample_size = 40428967
num_records = 10000000
sample_size = 10000000
# Random row sub-sampling, currently disabled:
# skip_values = sorted(random.sample(range(1, num_records), num_records - sample_size))
skip_values = None


def parse_date(val):
    """Convert ``hour`` value(s) written as ``%y%m%d%H`` into pandas datetimes.

    Replaces the former ``pd.datetime.strptime`` lambda, which fails on
    pandas >= 2.0 because ``pd.datetime`` no longer exists there.

    Args:
        val: A single string such as ``'14102100'`` or an array-like/Series
            of such strings.

    Returns:
        A ``pd.Timestamp`` for a scalar input, or the datetime-typed
        equivalent of an array-like input. Because array input is accepted,
        the function also works as a vectorised ``date_parser`` callable for
        ``pd.read_csv``.
    """
    return pd.to_datetime(val, format='%y%m%d%H')

# Load the first `num_records` rows. `hour` is read as a plain string (its
# declared dtype) and converted afterwards in one vectorised call: the
# `date_parser` argument is deprecated since pandas 2.0, and a per-value
# Python `strptime` over millions of rows is slow. The resulting column is
# datetime-typed, exactly as before.
all_data = pd.read_csv(
    "./data/train.gz",
    dtype=dtype,
    skiprows=skip_values,
    nrows=num_records,
)
all_data['hour'] = pd.to_datetime(all_data['hour'], format='%y%m%d%H')

# Feature groups are chosen by column position: positions 2..13 plus `id`
# are treated as sparse (categorical) features, positions 15.. as dense ones.
# NOTE(review): assuming the CSV columns follow the order of the `dtype`
# mapping, position 14 (`device_type`) lands in neither list - confirm that
# this is intended.
# sparse_features = all_data.iloc[::, 3:14].columns.values.tolist()
# dense_features = all_data.iloc[::, 21:].columns.values.tolist()
sparse_features = all_data.columns[2:14].tolist() + ['id']
dense_features = all_data.columns[15:].tolist()
target = ['click']

encoder = LabelEncoder()
scaler = MinMaxScaler(feature_range=(0, 1))

# Integer-encode each sparse column in place (the encoder is refit for every
# column), then rescale all dense columns to [0, 1] in a single call.
for feat in sparse_features:
    all_data[feat] = encoder.fit_transform(all_data[feat])
all_data[dense_features] = scaler.fit_transform(all_data[dense_features])

# One 4-dimensional hashed embedding per sparse feature, sized by the number
# of distinct values, and one scalar input per dense feature.
sparse_feature_columns = [
    SparseFeat(
        feat,
        vocabulary_size=all_data[feat].nunique(),
        embedding_dim=4,
        use_hash=True,
        dtype='int32',
    )
    for feat in sparse_features
]
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

# The linear part and the DNN part of the model share the same columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print(feature_names)

# Nominal 80/20 split sizes; the rows that actually form the test set are
# given by the index file loaded below.
num_train = int(sample_size * 0.8)
num_test = sample_size - num_train

# Test-row positions come from disk; the training rows are every remaining
# position in [0, sample_size). `np.setdiff1d` returns the sorted, unique
# complement directly, without materialising multi-million-element Python
# sets and lists as the previous set arithmetic did.
test_index = np.loadtxt("./data/test_index_10M.txt", dtype='int')
train_index = np.setdiff1d(np.arange(sample_size), test_index)

# `iloc` accepts integer arrays, so no conversion to Python lists is needed.
train = all_data.iloc[train_index]
test = all_data.iloc[test_index]
test_model_input = {name: test[name] for name in feature_names}

# Loss object used by `print_result`; it expects probabilities, not logits.
bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)

def print_result(pred, y):
    """Print log loss, AUC and calibration of `pred` against the labels `y`.

    Args:
        pred: Predicted probabilities (the module-level `bce` loss is built
            with ``from_logits=False``).
        y: Ground-truth labels aligned with `pred`.
    """
    cross_entropy = bce(y, pred).numpy()
    print("Log Loss: ", cross_entropy)

    auc = roc_auc_score(y, pred)
    print("AUC: ", auc)

    # Calibration = sum of predictions divided by sum of labels.
    predicted_total = np.sum(pred)
    observed_total = np.sum(y)
    print("Calibration: ", predicted_total / observed_total)


def pipeline(seed: int) -> None:
    """Train one xDeepFM model on shuffled training rows and score the test set.

    Steps: reset the Keras session, shuffle the training rows, build and
    compile the model, fit for one epoch, predict on the fixed test set, save
    the predictions and the shuffled training labels, and print the metrics.

    Args:
        seed: Passed to the model constructor as ``seed`` and used as the
            prefix of the output file names.

    Side effects:
        Creates ``./results/{training_name}/`` if needed and writes
        ``{seed}_{training_name}.txt`` (test predictions) and
        ``{seed}_{training_name}_train_label.txt`` (training labels in the
        shuffled order) into it; prints metrics via ``print_result``.
    """
    # Create the output directory *before* training so that a missing
    # directory cannot make np.savetxt fail after the long fit has finished.
    output_dir = f"./results/{training_name}"
    os.makedirs(output_dir, exist_ok=True)

    tf.keras.backend.clear_session()

    # Permute over the real number of training rows rather than the nominal
    # `num_train`, so no row is dropped (or indexed out of range) if the test
    # index file does not contain exactly `num_test` distinct entries.
    # NOTE(review): this draws from NumPy's global RNG, which `seed` does not
    # seed, so the row order differs between runs with the same seed -
    # confirm that this is intended.
    random_shuffle = np.random.permutation(len(train))
    train_model_input = {
        name: train[name].iloc[random_shuffle] for name in feature_names
    }
    train_label = train[target].iloc[random_shuffle].values

    # DeepFM, DCN and DCNMix can be swapped in here; the earlier commented-out
    # variants used the same feature columns, task, dnn_dropout and seed.
    model = xDeepFM(
        linear_feature_columns,
        dnn_feature_columns,
        task='binary',
        dnn_dropout=0.0,
        seed=seed,
        l2_reg_linear=0,
        l2_reg_embedding=0,
        dnn_use_bn=False,
        dnn_hidden_units=(512, 256, 128),
        cin_layer_size=(256, 256),
    )
    model.compile(
        # Other optimizers previously listed here: 'adam', 'adamw', 'sgd',
        # 'adagrad'.
        optimizer='adadelta',
        loss='binary_crossentropy',
        metrics=['binary_crossentropy'],
    )
    model.fit(
        train_model_input,
        train_label,
        batch_size=1024,
        epochs=1,
        verbose=1,
        validation_split=0.05,
    )

    pred_ans = model.predict(test_model_input, batch_size=1024)
    y = test[target].values

    np.savetxt(f"{output_dir}/{seed}_{training_name}.txt", pred_ans)
    np.savetxt(f"{output_dir}/{seed}_{training_name}_train_label.txt", train_label)
    print_result(pred_ans, y)

# Run a single train/evaluate pass with the seed given on the command line.
pipeline(seed=pipeline_seed)




