"""
Explore the structure of allenai/dolma3_longmino_pool dataset.
Shows splits, subsets, organization by length and topic.
"""

from itertools import islice

from datasets import load_dataset, get_dataset_config_names

DATASET_NAME = "allenai/dolma3_longmino_pool"
SPLIT = "train"          # split that is streamed for the preview
NUM_EXAMPLES = 3         # how many examples to print
MAX_FIELD_CHARS = 100    # string fields longer than this are truncated
RULE = "=" * 80


def _print_banner(title, leading_blank=False):
    """Print *title* framed by two horizontal rules.

    When *leading_blank* is true an empty line is emitted before the first
    rule, which separates the banner from the preceding section's output.
    """
    print("\n" + RULE if leading_blank else RULE)
    print(title)
    print(RULE)


def format_field(key, value, max_chars=MAX_FIELD_CHARS) -> str:
    """Return one indented ``key: value`` line for an example field.

    String values longer than *max_chars* are cut to that length and
    suffixed with ``...`` and their full character count; every other value
    is rendered unchanged.
    """
    if isinstance(value, str) and len(value) > max_chars:
        return f"    {key}: {value[:max_chars]}... ({len(value)} chars)"
    return f"    {key}: {value}"


def preview_examples(ds, count=NUM_EXAMPLES) -> None:
    """Print the first *count* examples of the iterable *ds*.

    Each example must be a mapping (it is read through ``.items()``).
    ``islice`` is used so that exactly *count* items are pulled from *ds*;
    an ``enumerate``/``break`` loop would fetch one extra example from the
    stream before noticing it should stop.
    """
    print(f"\nFirst {count} examples:")
    for number, example in enumerate(islice(ds, count), 1):
        print(f"\n  Example {number}:")
        for key, value in example.items():
            print(format_field(key, value))


def list_configs(dataset_name):
    """Print and return the configuration names of *dataset_name*.

    Returns ``None`` (after printing the error) when the names cannot be
    fetched, so the caller can fall back to the default configuration.
    """
    print("\nFetching dataset configurations...")
    try:
        configs = get_dataset_config_names(dataset_name)
    except Exception as e:  # best-effort exploration: report and carry on
        print(f"Error getting configs: {e}")
        return None

    print(f"\nFound {len(configs)} configurations:")
    for i, config in enumerate(configs, 1):
        print(f"  {i:2d}. {config}")
    return configs


def stream_preview(dataset_name, config_name=None) -> None:
    """Stream ``SPLIT`` of one configuration and print features and examples.

    *config_name* of ``None`` selects the dataset's default configuration.
    Any exception raised by ``load_dataset`` or while iterating propagates
    to the caller.
    """
    if config_name is not None:
        print(f"\nLoading config '{config_name}' in streaming mode...")
    else:
        print("\nLoading default configuration in streaming mode...")

    # Passing name=None is the same as omitting it, so one call serves both
    # the explicit-config and the default-config case.
    ds = load_dataset(dataset_name, config_name, streaming=True, split=SPLIT)

    print("\nDataset features:")
    print(f"  {ds.features}")

    preview_examples(ds)


def print_builder_info(dataset_name, config_name=None) -> None:
    """Print the metadata exposed by the dataset builder's ``info``.

    The config name is forwarded so that the builder targets the same
    configuration the streaming attempt used instead of requiring the
    dataset to have a default one.
    """
    # Imported here so that an import failure is reported by the caller's
    # error handling like any other failure of this fallback.
    from datasets import load_dataset_builder

    builder = load_dataset_builder(dataset_name, name=config_name)
    info = builder.info
    print("\nDataset info:")
    print(f"  Description: {info.description}")
    print(f"  Features: {info.features}")
    print(f"  Splits: {info.splits}")
    print(f"  Size in bytes: {info.size_in_bytes}")


def main() -> None:
    """Run the exploration: list configs, preview one, fall back to metadata."""
    _print_banner(f"Exploring Dataset: {DATASET_NAME}")

    # Get all available configurations/subsets
    configs = list_configs(DATASET_NAME)
    # Use the first config as the example; None means "default config".
    config_name = configs[0] if configs else None

    # Load dataset info (without downloading data)
    _print_banner(
        "Loading dataset info (streaming mode to avoid download)...",
        leading_blank=True,
    )

    try:
        stream_preview(DATASET_NAME, config_name)
    except Exception as e:  # top-level boundary of an exploratory script
        print(f"\nError loading dataset: {e}")
        print("\nTrying alternative approach...")

        # Fall back to the builder metadata, which needs no example data.
        try:
            print_builder_info(DATASET_NAME, config_name)
        except Exception as e2:
            print(f"Error with builder approach: {e2}")

    _print_banner("Exploration complete", leading_blank=True)


if __name__ == "__main__":
    main()
