import pandas as pd
# Pipeline: load the joined arity-2 n-gram table, turn each count into a share
# of the total count, and write the (bigram, share) pairs to a new Parquet file.
# The input is read for its 'ngram' and 'count' columns.
filename_used = "ngrams_results_final/joined_final/arity=2/source=megawika_source=redpajama%2Fstackexchange_source=reddit_source=falcon-refinedweb%2Fdata_source=starcoder_source=gutenberg_source=redpajama%2Farxiv.parquet"
df_gutenberg_unigrams = pd.read_parquet(filename_used, engine="pyarrow")
print("loaded")

# Each 'ngram' cell becomes a tuple so that it is hashable and usable as a
# dictionary key in the next step.
df_gutenberg_unigrams["ngram"] = list(map(tuple, df_gutenberg_unigrams["ngram"]))
print("converted")

# ngram -> count lookup. When the same ngram appears on several rows, the
# count from the last such row is the one that is kept.
df_gutenberg_unigrams_dict = {
    ngram: count
    for ngram, count in zip(
        df_gutenberg_unigrams["ngram"], df_gutenberg_unigrams["count"]
    )
}
print("dict")

# Denominator for the normalization: total of the counts kept in the lookup.
all_unigrams_count = sum(df_gutenberg_unigrams_dict.values())
print("count", all_unigrams_count)

# Same keys as the lookup, with every count divided by the total.
df_gutenberg_unigrams_dict_normalized = {
    ngram: count / all_unigrams_count
    for ngram, count in df_gutenberg_unigrams_dict.items()
}
print("normalized")

# One row per (ngram, normalized value) pair.
df = pd.DataFrame(
    [*df_gutenberg_unigrams_dict_normalized.items()],
    columns=["bigram", "normalized_count"],
)
print("premapped")

# The tuple keys are stored as their str() text representation.
df = df.astype({"bigram": str})
print("mapped")

# Persist the result next to the input data.
df.to_parquet(
    "ngrams_results_final/joined_final/arity=2/df_gutenberg_bigrams_dict_normalized_hashed_last.parquet"
)