"""
Build per-day impression-type pickles (``yahoo-<day>.pickle``) from the Yahoo! A3 dataset.

The Yahoo dataset can be downloaded from
https://webscope.sandbox.yahoo.com/catalog.php?datatype=a
"A3 - Yahoo! Search Marketing Advertiser Bid-Impression-Click data on competing Keywords, version 1.0"
"""


import pickle
from collections import Counter

import dask.dataframe as dd
import numpy as np
import pandas as pd

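# Leftover variant of the read_csv call below (presumably for loading only a
# sample while experimenting): ..., blocksize = 25e6).head(n=1000000)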

print("reading csv...")

# The bid/impression/click log is tab-separated and has no header row, so the
# column names are supplied explicitly. dask reads it lazily in blocks of
# 100e6 bytes.
df = dd.read_csv(
    "~/win/Downloads/yahoo/ydata-ysm-keyphrase-bid-imp-click-v1_0",
    sep="\t",
    names=["day", "account", "rank", "keyphrase", "avg_bid", "impressions", "clicks"],
    blocksize=100e6,
)

# The keyword file is loaded eagerly with pandas; its column is named "keyword".
keywords = pd.read_csv(
    "~/win/Downloads/yahoo/ydata-ysm-keyphrase-category-v1_0.txt",
    names=["keyword"],
)
# keyword_to_id = {keyword: id for id, keyword in keywords.itertuples()}

# Number the distinct accounts 0, 1, 2, ... in the order dask yields them.
account_to_id = {
    account: position for position, account in enumerate(df.account.unique())
}
print("done")

print(f"{len(df)} rows, {len(keywords)} keywords, {len(account_to_id)} accounts")

# Number of top keywords (per day) that span the impression-type space.
n_most_common_keywords = 20

# powerset_matrices[k - 1] is a (2**k, k) integer 0/1 matrix whose row r is the
# binary representation of r, most significant bit first. Row 0 is therefore
# all zeros (the empty subset) and the last row is all ones (the full set), so
# the rows enumerate every subset of k items.
#
# Each matrix is built directly from integer bit shifts: no float temporaries
# are created and no matrix beyond k = n_most_common_keywords is computed.
powerset_matrices = []
for size in range(1, n_most_common_keywords + 1):
    shifts = np.arange(size - 1, -1, -1)
    subset_bits = np.arange(2**size)[:, None] >> shifts
    subset_bits &= 1  # in place, to avoid a second (2**size, size) temporary
    powerset_matrices.append(subset_bits)


# For every day in the log, build the list of "impression types" and pickle it
# to yahoo-<day>.pickle.
#
# An impression type is identified by a bitmask over that day's
# n_most_common_keywords most frequent keywords. Each stored entry is
# [mask, {account_id: 1, ...}, total_impressions].
days = df["day"].unique().compute()
# for day, day_df in df.groupby("day"):
for day in days:
    day_df = df.groupby("day").get_group(day).compute()

    # value_counts() orders by frequency, so index[0] is the rank value that
    # occurs most often on this day; only rows with that rank are kept.
    most_frequent_rank = day_df["rank"].value_counts().index[0]
    day_df = day_df[day_df["rank"] == most_frequent_rank]
    print(f"creating impressions for day {day} ({len(day_df)} rows)...")

    # Count keyword occurrences over all keyphrases of the day. A keyword that
    # is repeated inside one keyphrase is counted once per repetition.
    keyword_count = Counter()
    for keyphrase in day_df["keyphrase"]:
        keyword_count.update(keyphrase.split(" "))

    # Keep the top keywords as an ordered list and number them by position, so
    # the keyword -> bit assignment does not depend on set iteration order
    # (which varies between interpreter runs because of string hash
    # randomisation).
    ranked_keywords = pd.DataFrame(keyword_count.items()).sort_values(1, ascending=False)
    top_keywords = list(ranked_keywords[:n_most_common_keywords][0])
    most_common_keywords = set(top_keywords)
    keyword_ids = {keyword: bit for bit, keyword in enumerate(top_keywords)}

    # One slot per possible bitmask: [mask, interested accounts, impressions].
    imp_list = [[mask, {}, 0] for mask in range(2**n_most_common_keywords)]

    # Iterate the needed columns directly; this also avoids rebinding `day`
    # from row values inside the per-day loop.
    for account, keyphrase, impressions in zip(
        day_df["account"], day_df["keyphrase"], day_df["impressions"]
    ):
        account_id = account_to_id[account]
        matched = set(keyphrase.split(" ")).intersection(most_common_keywords)
        if not matched:
            continue

        bit_positions = np.array([keyword_ids[keyword] for keyword in matched], dtype=int)
        # Bitmask of every non-empty subset of the matched keywords. Row 0 of
        # the powerset matrix is the empty subset and is dropped by [1:].
        subset_masks = np.sum(
            powerset_matrices[len(matched) - 1] * 2**bit_positions, axis=1
        )[1:]
        for mask in subset_masks:
            imp_list[mask][1][account_id] = 1

        # The last row of the powerset matrix is all ones, so the last mask is
        # the full matched set; impressions are credited to that type only.
        # NOTE(review): this keeps the original behaviour, which added the
        # impressions once after the subset loop. Confirm that crediting only
        # the full set, rather than every subset, is intended.
        imp_list[subset_masks[-1]][2] += impressions

    # Drop types that have no interested account or no impressions.
    imp_list = [entry for entry in imp_list if len(entry[1]) > 0 and entry[2] > 0]

    with open(f"yahoo-{day}.pickle", 'wb') as handle:
        pickle.dump(imp_list, handle)
    print(f"done ({len(imp_list)} impression types)")








"""
avg_bids = {}
kwac = []
for (_, day, account, rank, keyphrase, avg_bid, impressions, clicks) in df.itertuples():
    account_id = account_to_id[account]
    for keyword in keyphrase.split(" "):
        keyword_id = keyword_to_id[keyword]
        pair = (keyword_id, account_id)
        kwac.append(pair)
        if pair not in avg_bids: avg_bids[pair] = []
        avg_bids[pair].append(avg_bid)
"""

"""
kw_bid = {}
for pair, avgs in avg_bids.items():
    kw_bid[pair] = sum(avgs) / len(avgs)
"""

"""
rel = pd.DataFrame(kwac, columns=["keyword_id", "account_id"])
reluni = rel.drop_duplicates()

all_bids = []
budgets = np.zeros(len(account_to_id))
for (_, day, account, rank, keyphrase, avg_bid, impressions, clicks) in df.itertuples():
    account_id = account_to_id[account]
    keyword_ids = [keyword_to_id[keyword] for keyword in keyphrase.split(" ")]
    pairs = reluni[reluni["keyword_id"].isin(keyword_ids)]
    x = pairs.groupby("account_id").count()["keyword_id"]
    bids = round(len(x) * avg_bid * x / x.sum(), 2)
    ix_max = bids.idxmax()
    max_bid = x[ix_max]
    bids[ix_max] = bids[account_id]
    bids[account_id] = max_bid
    all_bids.append((day, list(bids.index), list(bids)))
    budgets[account_id] += max_bid

all_bids.sort()

advs = budgets / 2
imps = np.zeros((len(all_bids), len(advs)))
for i, (_, adv_ixs, bids) in enumerate(all_bids):
    imps[i, adv_ixs] = bids
"""


