import pandas as pd
import numpy as np
from pathlib import Path

def combine_datasets_randomly(dataset1_path, dataset2_path, output_path, percentage=50, random_seed=42):
    """
    Combine two equally sized CSV datasets by sampling complementary row positions.

    A random `percentage` of the row positions is taken from dataset1 and every
    remaining position is taken from dataset2, so each row position appears exactly
    once in the result. Only the 'input' and 'output' columns are kept. The combined
    rows are shuffled, written to `output_path` as CSV (without the index column),
    and returned.

    Parameters:
    - dataset1_path: Path to the first CSV dataset
    - dataset2_path: Path to the second CSV dataset
    - output_path: Path where the combined dataset will be saved
    - percentage: Percentage (0-100 inclusive) of rows to select from dataset1;
      the remaining rows are selected from dataset2. The row count taken from
      dataset1 is rounded down to a whole number of rows.
    - random_seed: Seed for reproducibility (default: 42)

    Returns:
    - The shuffled combined DataFrame with columns 'input' and 'output'

    Raises:
    - ValueError: if `percentage` is outside 0-100 (or NaN), or if the two
      datasets do not have the same number of rows
    - KeyError: raised by the pandas column selection if either dataset lacks an
      'input' or 'output' column
    """
    # Fail early with a clear message; `not (0 <= p <= 100)` also rejects NaN.
    if not 0 <= percentage <= 100:
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    # Private generator instead of np.random.seed(): RandomState(seed) yields the
    # same draws as the seeded global generator, without clobbering the caller's
    # global NumPy random state.
    rng = np.random.RandomState(random_seed)

    # Load the datasets
    print("Loading datasets...")
    df1 = pd.read_csv(dataset1_path)
    df2 = pd.read_csv(dataset2_path)

    # The complementary split pairs rows by position, so the sizes must match.
    if len(df1) != len(df2):
        raise ValueError(f"Datasets must have equal number of rows. "
                        f"Dataset 1: {len(df1)}, Dataset 2: {len(df2)}")

    n_rows = len(df1)
    print(f"Each dataset has {n_rows} rows")

    # Multiply before dividing and add a tiny tolerance before truncating, so
    # float error cannot drop a row (100 * (29 / 100) == 28.999999999999996).
    n_select = min(n_rows, int(n_rows * percentage / 100 + 1e-9))
    all_indices = np.arange(n_rows)

    # Row positions drawn for dataset 1; everything left over goes to dataset 2.
    selected_indices_df1 = rng.choice(all_indices, size=n_select, replace=False)
    selected_indices_df2 = np.setdiff1d(all_indices, selected_indices_df1)

    print(f"Selected {len(selected_indices_df1)} rows from dataset 1")
    print(f"Selected {len(selected_indices_df2)} rows from dataset 2")

    # Keep only input and output columns of the selected rows
    columns = ['input', 'output']
    selected_df1 = df1[columns].iloc[selected_indices_df1]
    selected_df2 = df2[columns].iloc[selected_indices_df2]

    # Combine the selected data
    combined_df = pd.concat([selected_df1, selected_df2], ignore_index=True)

    # Shuffle so rows from the two sources are interleaved rather than grouped
    combined_df = combined_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Save the combined dataset
    combined_df.to_csv(output_path, index=False)
    print(f"Combined dataset saved to: {output_path}")
    print(f"Final dataset shape: {combined_df.shape}")

    return combined_df

def main(argv=None):
    """
    Command-line interface for the dataset combination tool.

    Parameters:
    - argv: Optional list of command-line arguments (without the program name).
      When None, argparse reads the arguments from sys.argv.

    Returns:
    - 0 on success, 1 if the percentage is invalid or combining the datasets failed.
      Error messages are written to stderr so they do not mix with normal output.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Combine two datasets with specified percentage split.')
    parser.add_argument('--dataset1', required=True, help='Path to the first CSV dataset')
    parser.add_argument('--dataset2', required=True, help='Path to the second CSV dataset')
    parser.add_argument('--output', required=True, help='Path where the combined dataset will be saved')
    parser.add_argument('--percentage', type=float, default=50.0,
                      help='Percentage of data to select from dataset1 (default: 50.0)')
    parser.add_argument('--seed', type=int, default=42,
                      help='Random seed for reproducibility (default: 42)')

    args = parser.parse_args(argv)

    # The CLI requires a strict mix: 0 and 100 (a single-source result) are rejected.
    if not 0 < args.percentage < 100:
        print("Error: Percentage must be between 0 and 100", file=sys.stderr)
        return 1

    try:
        # Combine the datasets
        combined_data = combine_datasets_randomly(
            dataset1_path=args.dataset1,
            dataset2_path=args.dataset2,
            output_path=args.output,
            percentage=args.percentage,
            random_seed=args.seed
        )
    except FileNotFoundError as e:
        print(f"Error: Could not find one of the input files. {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: report any other failure and signal it via the exit code.
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        return 1

    # Display some statistics
    print("\n--- Summary ---")
    print(f"Total rows in combined dataset: {len(combined_data)}")
    print(f"Number of columns: {len(combined_data.columns)}")
    print("Columns:", ", ".join(combined_data.columns))
    return 0

# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())