import pandas as pd
import numpy as np
from collections import Counter
import timeit

def generate_test_data(num_records, ngram_length, seed=1):
    """Build a reproducible DataFrame of random ngrams and counts.

    Args:
        num_records: Number of rows to generate.
        ngram_length: Number of integers in each ngram tuple.
        seed: Value used to seed NumPy's global random generator.

    Returns:
        A DataFrame with an ``ngram`` column of tuples whose items are
        integers in ``[0, 100)`` and a ``count`` column of integers in
        ``[1, 10)``.
    """
    # Seeding the global generator makes equal arguments give equal frames.
    np.random.seed(seed)

    # Every ngram is drawn before the counts; the order of the draws
    # determines which values a given seed produces.
    ngrams = []
    for _ in range(num_records):
        tokens = np.random.randint(0, 100, ngram_length)
        ngrams.append(tuple(tokens))
    counts = np.random.randint(1, 10, num_records)

    return pd.DataFrame({'ngram': ngrams, 'count': counts})

def sum_with_pandas(df1, df2):
    """Sum counts per ngram using a Pandas groupby.

    The rows of ``df1`` are combined with the rows of ``df2`` taken twice,
    so every count in ``df2`` contributes double to the totals.

    No files are written here: this function is meant to be timed, and
    disk I/O inside it would dominate the measurement.

    Args:
        df1: DataFrame with an ``ngram`` column and a ``count`` column.
        df2: DataFrame with the same columns; its rows are counted twice.

    Returns:
        A DataFrame with columns ``ngram`` and ``count`` holding one row
        per distinct ngram and the summed count for that ngram.
    """
    # df2 is listed twice on purpose; sum_with_counter also counts it twice.
    combined_df = pd.concat([df1, df2, df2])
    # Normalise every ngram to a tuple so it can serve as a group key.
    combined_df['ngram'] = combined_df['ngram'].apply(tuple)
    out = combined_df.groupby('ngram', as_index=False).agg({'count': 'sum'})
    return out

def sum_with_counter(df1, df2):
    """Sum counts per ngram using a Python Counter.

    The counts of ``df1`` are added once and the counts of ``df2`` twice,
    which matches the totals produced by sum_with_pandas.

    Args:
        df1: DataFrame with an ``ngram`` column of hashable values and a
            ``count`` column.
        df2: DataFrame with the same columns; its rows are counted twice.

    Returns:
        A DataFrame with columns ``ngram`` and ``count`` holding one row
        per distinct ngram, in order of first appearance.
    """
    totals = Counter()
    # df2 carries weight 2 because sum_with_pandas concatenates it twice.
    for frame, weight in ((df1, 1), (df2, 2)):
        # Accumulate row by row: building a dict from zip() would keep only
        # the last count of an ngram that repeats within a single frame.
        for ngram, count in zip(frame['ngram'], frame['count']):
            totals[ngram] += weight * count
    out = pd.DataFrame(list(totals.items()), columns=['ngram', 'count'])
    return out

def benchmark_method(method_name, df1, df2, repeat=3, number=10):
    """Time one summing method on the given frames using timeit.

    Args:
        method_name: Name of a module-level function that accepts
            ``(df1, df2)``, or the callable itself.
        df1: First argument handed to the method on every call.
        df2: Second argument handed to the method on every call.
        repeat: Number of timing rounds.
        number: Number of calls made in each round.

    Returns:
        The mean, over all rounds, of the time in seconds that one round
        of ``number`` calls took.

    Raises:
        KeyError: If ``method_name`` is a string that does not name an
            object defined in this module.
    """
    # Resolve the name in this module so the benchmark also works when the
    # file is imported instead of being run as __main__.
    target = method_name if callable(method_name) else globals()[method_name]
    # Time the frames supplied by the caller rather than regenerating data,
    # so the measurement reflects the sizes the caller asked for.
    times = timeit.repeat(lambda: target(df1, df2), repeat=repeat, number=number)
    return np.mean(times)

def main():
    """Generate two random frames and print the timing of each method.

    Both frames hold 10000 records of 3-item ngrams and differ only in
    their seed. They are passed to benchmark_method once per summing
    method, and each returned time is printed to four decimal places.
    """
    record_total, tuple_width = 10000, 3
    first = generate_test_data(record_total, tuple_width, seed=0)
    second = generate_test_data(record_total, tuple_width, seed=1)

    # Pandas is timed and reported first, then the Counter method.
    reports = (
        ("sum_with_pandas", "Pandas GroupBy Time"),
        ("sum_with_counter", "Python Counter Time"),
    )
    for method, label in reports:
        elapsed = benchmark_method(method, first, second)
        print(f"{label}: {elapsed:.4f} seconds")

# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
