import math
import os

import pandas as pd

def _prompt_sample_ratio():
    """Ask on stdin for a sampling percentage until a usable one is entered.

    Returns:
        float: The chosen fraction, strictly between 0 and 1.
    """
    while True:
        raw_value = input("Please enter the percentage you want to sample (e.g., enter 50 for 50%): ")
        try:
            ratio = float(raw_value) / 100
        except ValueError:
            print("Error: Please enter a valid number.")
            continue
        # NaN fails both comparisons, so it is rejected here as well.
        if 0 < ratio < 1:
            return ratio
        print("Error: Please enter a number between 0 and 100.")


def _split_base_and_ratio(stem):
    """Split a file stem shaped like ``<base>_<ratio>`` or ``<base>_<ratio>_<rows>``.

    The base may itself contain underscores, so the ratio is looked up from
    the right-hand side instead of assuming it is the second token.

    Args:
        stem (str): File name without directory and without ``.parquet``.

    Returns:
        tuple: ``(base_name, ratio)``. ``ratio`` is ``None`` (and ``base_name``
        is the whole stem) when no candidate token is a number in (0, 1].
    """
    tokens = stem.split('_')
    # Try "<base>_<ratio>_<rows>" first, then "<base>_<ratio>".
    for index in (len(tokens) - 2, len(tokens) - 1):
        if index < 1:
            continue  # a ratio token must be preceded by a base name
        try:
            ratio = float(tokens[index])
        except ValueError:
            continue
        # Only a fraction of the full dataset is a plausible ratio; this also
        # filters out "nan" and "inf", which float() happily parses.
        if 0 < ratio <= 1:
            return '_'.join(tokens[:index]), ratio
    return stem, None


def _format_row_count(rows):
    """Format a row count as "X.Xm", "X.Xk" or a plain integer string."""
    # From 999_950 rows upward the "k" form would round to "1000.0k",
    # so switch to millions slightly early.
    if rows >= 999_950:
        return f"{rows / 1_000_000:.1f}m"
    if rows >= 1_000:
        return f"{rows / 1_000:.1f}k"
    return str(rows)


def _format_ratio(ratio):
    """Format a ratio for a filename, without trailing zeros.

    At least two decimals are used; for small ratios the precision is
    extended so that two significant digits survive (0.001 stays "0.001"
    instead of collapsing to "0").
    """
    if ratio <= 0:
        return "0"
    decimals = max(2, 1 - math.floor(math.log10(ratio)))
    return f"{ratio:.{decimals}f}".rstrip('0').rstrip('.')


def sample_parquet_file(original_file_path):
    """
    Randomly sample rows from a Parquet file and save them as a new Parquet file.

    The sampling percentage is read interactively from stdin. The output file
    is written to the same directory as the input and is named
    ``<base>_<total ratio>_<row count>.parquet``, where the total ratio is the
    ratio found in the input file name multiplied by the entered ratio. If the
    input name carries no ratio, 1.0 is assumed and a warning is printed.

    Args:
        original_file_path (str): The path to the original Parquet file,
            ideally named ``<base>_<ratio>_<rows>.parquet``.

    Returns:
        None. Progress is printed; errors are caught and printed instead of
        being raised.
    """
    try:
        # Read the original Parquet file
        print(f"Reading file: {original_file_path}...")
        df = pd.read_parquet(original_file_path)
        original_rows = len(df)
        print(f"File read successfully, containing {original_rows} rows.")

        sample_ratio = _prompt_sample_ratio()

        # A fixed random_state makes the sample reproducible across runs.
        sampled_df = df.sample(frac=sample_ratio, random_state=42)
        new_rows = len(sampled_df)
        print(f"Successfully sampled {new_rows} rows.")

        # --- Generate the new filename ---
        # Work on the file name only, so underscores in directory names
        # cannot be mistaken for field separators.
        directory, file_name = os.path.split(original_file_path)
        stem = file_name
        if stem.lower().endswith('.parquet'):
            stem = stem[:-len('.parquet')]

        # The ratio in the original filename is assumed to be relative to the
        # initial (unsampled) dataset.
        base_name, original_main_ratio = _split_base_and_ratio(stem)
        if original_main_ratio is None:
            print(f"Warning: Could not parse original ratio from '{stem}', using 1.0 as base.")
            original_main_ratio = 1.0

        new_total_ratio = original_main_ratio * sample_ratio
        new_total_ratio_str = _format_ratio(new_total_ratio)
        new_rows_str = _format_row_count(new_rows)

        # Combine into the final filename, next to the input file
        output_filename = os.path.join(
            directory, f"{base_name}_{new_total_ratio_str}_{new_rows_str}.parquet"
        )

        # Save as a new Parquet file
        sampled_df.to_parquet(output_filename, index=False)
        print(f"\nSampling complete! Data saved to: {output_filename}")

    except FileNotFoundError:
        print(f"Error: File not found -> {original_file_path}")
    except Exception as e:
        # Top-level boundary of an interactive tool: report instead of crashing.
        print(f"An error occurred during processing: {e}")

# --- Main program ---
if __name__ == "__main__":
    # Source Parquet file to sample; change this path to use another dataset.
    sample_parquet_file("lamini-instruction_0.1_258.6k.parquet")