"""
The iPinYou dataset can be downloaded from
https://contest.ipinyou.com/
"""


import pandas as pd
import numpy as np
import pickle



# Region code used both to filter the impression log and in the output file
# name, so the two can never drift apart.
REGION = 15

# For every day of the log, build one pickle holding a tuple (budgets, imp_list):
#   * imp_list -- one dict per distinct ad slot, mapping advertiser_id to the
#                 highest bidding_price that advertiser has for that slot;
#   * budgets  -- integer array indexed by advertiser_id, counting in how many
#                 ad slots the advertiser appears.
for day in [20130606, 20130607, 20130608, 20130609, 20130610, 20130611, 20130612]:

    # Disabled snippet (bare string literal, never executed): loading the bid
    # log instead of the impression log.
    """
    bids = pd.read_csv("~/win/Downloads/ipinyou/bid.20130606.txt",
                       sep="\t",
                       names=["bid_id", "timestamp", "log_type", "user-agent", "ip", "region", "city", "ad_exchange",
                              "domain", "url", "anonymous_url_id", "ad_slot_id", "ad_slot_width", "ad_slot_height",
                              "ad_slot_visiblity", "ad_slot_format", "ad_slot_floor_price", "creative_id", "bidding_price",
                              "advertiser_id"],
                       index_col=False,
                       nrows=1000)

    bids = bids[["bid_id", "timestamp", "ad_slot_id", "bidding_price", "advertiser_id"]]
    """

    print("reading csv...")
    # The log is tab-separated without a header row, so column names are
    # supplied explicitly.
    imps = pd.read_csv(f"~/win/Downloads/ipinyou/imp.{day}.txt",
                       sep="\t",
                       names=["bid_id", "timestamp", "log_type", "ipinyou_id", "user-agent", "ip", "region", "city",
                              "ad_exchange", "domain", "url", "anonymous_url_id", "ad_slot_id", "ad_slot_width",
                              "ad_slot_height", "ad_slot_visiblity", "ad_slot_format", "ad_slot_floor_price", "creative_id",
                              "bidding_price", "advertiser_id"],
                       index_col=False)

    # Keep a single region and only the columns used below.
    imps = imps[imps["region"] == REGION]
    imps = imps[["bid_id", "timestamp", "ad_slot_id", "bidding_price", "advertiser_id"]]

    # Disabled snippet (never executed): restrict the data to the first day
    # found in the timestamps.
    """
    imps["day"] = imps["timestamp"] // int(1e9)
    print("days:", imps["day"].unique())
    first_day = imps["day"].iloc[0]
    imps = imps[imps["day"] == first_day]
    """

    # ad_slot_id = "mm_10075660_3500949_11453278"

    # One counter per possible advertiser id (ids are used directly as indices).
    budgets = np.zeros(imps["advertiser_id"].max() + 1, dtype=int)
    imp_list = []
    print("grouping...")
    slots = imps.groupby("ad_slot_id")
    num_slots = len(slots)
    # Report progress roughly every 1%. The interval is clamped to at least 1
    # because `num_slots // 100` is 0 for fewer than 100 slots, which would
    # make the modulo below raise ZeroDivisionError.
    progress_step = max(1, num_slots // 100)
    print("translating...")
    for i, (ad_slot_id, slot_rows) in enumerate(slots):
        if i % progress_step == 0:
            print(f"{i+1}/{num_slots} {100*i//num_slots}%")
        # Highest bidding price per advertiser within this ad slot.
        weights = slot_rows.groupby("advertiser_id")["bidding_price"].max()
        # The group keys are unique, so each advertiser present in this slot
        # is incremented exactly once.
        budgets[weights.index] += 1
        # if len(weights) > 1:
        imp_list.append(weights.to_dict())

    # Disabled snippet (never executed): print statistics via the project's
    # tools2 module.
    """
    from tools2 import *
    imps = Impressions(imp_list)
    imps.stats()
    """

    with open(f"ipinyou-{day}-{REGION}.pickle", 'wb') as handle:
        pickle.dump((budgets, imp_list), handle)

