import pandas as pd

# Parquet shard for each dataset split, as paths relative to the working directory.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
}

# Only the training split is loaded by this script.
df = pd.read_parquet(splits['train'])

# Columns whose name is "R" followed solely by digits (e.g. "R5", "R12");
# presumably per-ring-size columns -- confirm against the dataset schema.
ring_cols = [
    name
    for name in df.columns
    if name.startswith('R') and name[1:].isdigit()
]

def get_longest_ring(row, cols=None):
    """Return the largest ring size whose column is positive in *row*.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row passed by
            ``df.apply(..., axis=1)``).
        cols: Names of the columns to inspect, each of the form ``"R<digits>"``.
            Defaults to the module-level ``ring_cols`` so existing callers
            that pass only ``row`` behave exactly as before.

    Returns:
        The largest integer suffix among the inspected columns whose value in
        ``row`` is greater than zero, or ``0`` when there is none.
    """
    if cols is None:
        cols = ring_cols
    # The integer after the leading "R" is used as the ring size; a column
    # contributes only when its value in this row is strictly positive.
    return max((int(col[1:]) for col in cols if row[col] > 0), default=0)

# Largest ring size per row (0 when no ring column is positive).
longest_ring_size = df.apply(get_longest_ring, axis=1)

# Rings of size 6 or smaller carry no penalty; each unit of size beyond 6
# adds 1 to the penalty.
cycle_penalty = (longest_ring_size - 6).clip(lower=0)

# plogP = logP - SAS - cycle penalty. The intermediates stay local so that
# 'plogP' is the only column added to the frame.
df['plogP'] = df['logP'] - df['SAS'] - cycle_penalty

# Write the augmented table to CSV without the index column.
df.to_csv('zinc250k_plogP.csv', index=False)