
from io import StringIO

import numpy as np
import pandas as pd
import requests
from sklearn.datasets import fetch_openml
from sklearn.datasets import load_diabetes


def print_features_data(df, features):
    """Print the number of features and the number of rows of a dataset.

    Args:
        df: Object exposing a ``shape`` attribute whose first entry is the
            number of data points (the loaders in this file pass a pandas
            DataFrame).
        features: Sized collection of feature names; only its length is used.
    """
    # Both counts are computed before anything is printed.
    feature_count, row_count = len(features), df.shape[0]
    print(f'Number of features: {feature_count}')
    print(f'Number of data points: {row_count}')

def get_diabetes_data():
    """Load scikit-learn's diabetes dataset with an integer-valued target.

    The target from ``load_diabetes`` is rounded to the nearest integer and
    cast to ``int`` so that it can be treated as count data. A short summary
    (feature count, row count) is printed via ``print_features_data``.

    Returns:
        tuple: ``(X, y, name)`` where ``X`` holds the values of every column
        listed in ``feature_names``, ``y`` holds the rounded integer target,
        and ``name`` is the dataset label string.
    """
    bunch = load_diabetes()
    feature_cols = bunch.feature_names
    label_col = 'target'

    frame = pd.DataFrame(bunch.data, columns=feature_cols)
    # Round first, then cast, so the target becomes whole-number counts.
    frame[label_col] = pd.Series(bunch.target).round().astype(int)

    print_features_data(frame, feature_cols)

    # No train/test split is done here; the full arrays are returned.
    X = frame[feature_cols].values
    y = frame[label_col].values
    # NOTE(review): the label is spelled "diabetess"; kept unchanged because
    # callers may depend on the exact string -- confirm before correcting.
    return X, y, "diabetess"


def get_SEER_cancer_data():
    """Download a SEER text file and build Poisson-regression arrays from it.

    The file is fetched over HTTP, its first line is skipped as a header, and
    each remaining non-blank line is split on whitespace. Four integer fields
    are read per line and assembled into a DataFrame, to which the natural
    log of the population is added as an extra feature. A short summary is
    printed via ``print_features_data``.

    NOTE(review): the whitespace-separated layout and the column positions
    (0 = year, 2 = population, 3 = cases, 4 = age group) are assumptions
    carried over from the original code -- verify them against the actual
    file format before relying on the results.

    Returns:
        tuple: ``(X, y, name)`` where ``X`` holds the ``Year``, ``AgeGroup``
        and ``LogPopulation`` columns, ``y`` holds ``Cases``, and ``name`` is
        the string ``"cancer"``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If a parsed field is not an integer.
        IndexError: If a non-blank line has fewer than five fields.
    """
    url = "https://seer.cancer.gov/popdata/yr1969_2017.19ages.txt"
    # Without a timeout a stalled connection would block forever; without the
    # status check an HTTP error page would be parsed as if it were data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data_lines = response.text.splitlines()

    data = []
    for line in data_lines[1:]:  # Skip header
        columns = line.split()
        if not columns:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        year = int(columns[0])
        population = int(columns[2])
        cases = int(columns[3])
        age_group = int(columns[4])
        data.append([year, population, age_group, cases])

    df = pd.DataFrame(data, columns=['Year', 'Population', 'AgeGroup', 'Cases'])

    # Log-population is the extra feature used alongside year and age group.
    # A zero population yields -inf here; such rows are not filtered out.
    df['LogPopulation'] = np.log(df['Population'])

    features = ['Year', 'AgeGroup', 'LogPopulation']
    target = 'Cases'

    # No train/test split is done here; the full arrays are returned.
    X = df[features].values
    y = df[target].values

    print_features_data(df, features)
    return X, y, "cancer"


def get_bike_share_data():
    """Fetch the OpenML ``Bike_Sharing_Demand`` dataset (version 3) as arrays.

    The dataset is requested with ``as_frame=True`` and a fixed subset of its
    columns is used as features, with ``count`` as the target. A short
    summary is printed via ``print_features_data``.

    Returns:
        tuple: ``(X, y, hours, name)`` where ``X`` holds the selected feature
        columns, ``y`` holds the ``count`` column, ``hours`` holds the
        ``hour`` column on its own, and ``name`` is the string
        ``"bike_share"``.
    """
    feature_cols = ['temp', 'humidity', 'windspeed',  'feel_temp', 'month', 'hour']
    label_col = 'count'

    # Dataset page: https://www.openml.org/data/download/22044628/dataset
    bunch = fetch_openml(name='Bike_Sharing_Demand', version=3, as_frame=True)
    frame = bunch.frame

    # No train/test split is done here; the full arrays are returned.
    X = frame[feature_cols].values
    y = frame[label_col].values
    # The hour column is also returned separately, in addition to being
    # one of the feature columns.
    hours = frame['hour'].values

    print_features_data(frame, feature_cols)

    return X, y, hours, "bike_share"
