from datasets import load_dataset
import numpy as np 

# Load the "train" split of the raw forecasting dataset.
ds = load_dataset("YuehHanChen/forecasting_raw")["train"]

# Show which columns the dataset provides.
print(ds.column_names)

# Keep only the rows whose question began after 2023-01-01 and was resolved
# before 2023-06-01. Dates are compared as plain strings, which orders them
# correctly only because they are stored in YYYY-MM-DD format (assumed to hold
# for date_begin as well as date_resolve_at -- TODO confirm).
# NOTE: an earlier filter (date_resolve_at > "2023-06-01") used to run here,
# but its result was immediately overwritten by this one, so it was removed.
useful_subset = ds.filter(
    lambda x: x["date_begin"] > "2023-01-01" and x["date_resolve_at"] < "2023-06-01"
)

print(len(useful_subset))

# Print up to 10 distinct, randomly chosen rows of useful_subset. Slicing a
# permutation samples without replacement and degrades gracefully when the
# subset has fewer than 10 rows (or none at all).
sample_size = min(10, len(useful_subset))
random_indices = np.random.permutation(len(useful_subset))[:sample_size]
for idx in random_indices:
    # Cast the NumPy integer to a plain int before indexing the dataset.
    row = useful_subset[int(idx)]
    print(row["date_resolve_at"], row["date_begin"], row["question"], row["resolution"])