import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
import joblib
import os
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
import seaborn as sns
from scipy.optimize import minimize

def load_data(file_path):
    """
    Load the training table and keep only rows whose mixing ratios sum to ~1.

    The file is read as UTF-16, whitespace-separated text. Its first line is
    consumed as a header and the seven columns are then renamed positionally.

    :param file_path: path of the whitespace-delimited UTF-16 data file
    :return: DataFrame holding the CODE, MATH, GENERAL and PPL columns of the
             rows where CODE + MATH + GENERAL is close to 1.0
             (np.isclose with atol=0.01)
    """
    table = pd.read_csv(file_path, sep=r'\s+', engine='python', encoding='utf-16')
    table.columns = ['INDEX', 'CODE', 'MATH', 'GENERAL', '平均RL-LOSS', '最大RL-LOSS', 'PPL']

    # Validate the ratio total; the helper column is dropped by the final selection.
    ratio_columns = ['CODE', 'MATH', 'GENERAL']
    table['比例总和'] = table['CODE'] + table['MATH'] + table['GENERAL']
    keep = np.isclose(table['比例总和'], 1.0, atol=0.01)

    return table.loc[keep, ratio_columns + ['PPL']].copy()

def load_test_config(file_path):
    """
    Load the mixing ratios to predict from a configuration file.

    :param file_path: path of a UTF-16, whitespace-delimited file whose header
                      row names the CODE, MATH and GENERAL columns
    :return: list of (CODE, MATH, GENERAL) tuples, one per data row
    """
    config = pd.read_csv(file_path, sep=r'\s+', engine='python', encoding='utf-16')
    ratio_columns = [config[name] for name in ('CODE', 'MATH', 'GENERAL')]
    return list(zip(*ratio_columns))

def train_ppl_model(X, y):
    """
    Fit a gradient-boosting PPL regressor on a log1p-transformed target.

    :param X: feature matrix passed to fit
    :param y: target values; fitted as log1p(y), predictions are mapped back
              with expm1
    :return: fitted TransformedTargetRegressor wrapping a
             StandardScaler -> GradientBoostingRegressor pipeline
    """
    booster = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.05,
        random_state=42,
    )
    steps = [('scaler', StandardScaler()), ('regressor', booster)]
    model = TransformedTargetRegressor(
        regressor=Pipeline(steps),
        func=np.log1p,
        inverse_func=np.expm1,
    )
    model.fit(X, y)
    return model



def train_model_ridge(X, y):
    """
    Fit a ridge-regression PPL model on a log1p target and print its parameters.

    The printed coefficients and intercept belong to the inner Ridge estimator,
    so they are expressed on standardized features and on the log1p(y) scale.

    :param X: feature DataFrame; X.columns labels the printed coefficients
    :param y: target values
    :return: fitted TransformedTargetRegressor wrapping a
             StandardScaler -> Ridge(alpha=0.5) pipeline
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', Ridge(alpha=0.5))
    ])
    model = TransformedTargetRegressor(
        regressor=pipeline,
        func=np.log1p,
        inverse_func=np.expm1
    )
    model.fit(X, y)

    # model is always a TransformedTargetRegressor here, so the fitted pipeline
    # is read straight from regressor_ and the Ridge step from named_steps.
    ridge = model.regressor_.named_steps['regressor']

    # Print the fitted parameter values.
    print("🧮 岭回归模型参数值：")
    for feature, coef in zip(X.columns, ridge.coef_):
        print(f"  β({feature}): {coef:.6f}")

    print(f"  截距项 β₀ (intercept): {ridge.intercept_:.6f}")
    return model



def train_model_rf(X, y):
    """
    Fit a random-forest PPL model on a log1p target and print diagnostics.

    Prints the feature importances, the number of trees, the raw predictions
    of the first trees for the first samples and the model's final predictions
    for those samples.

    :param X: feature DataFrame; X.columns labels the printed importances
    :param y: target values
    :return: fitted TransformedTargetRegressor wrapping the RandomForestRegressor
    """
    forest = RandomForestRegressor(
        n_estimators=150,
        max_depth=5,
        min_samples_split=5,
        random_state=42
    )
    model = TransformedTargetRegressor(
        regressor=forest,
        func=np.log1p,
        inverse_func=np.expm1
    )
    model.fit(X, y)
    top_n = 10

    # The wrapper holds the forest directly (no Pipeline in between), so the
    # fitted forest is simply regressor_.
    rf = model.regressor_

    print("\n🌲  随机森林模型训练后参数值：")

    # Feature importances, largest first.
    importances = rf.feature_importances_
    sorted_idx = importances.argsort()[::-1]
    print("\n🔎  特征重要性 (feature_importances_):")
    for i in sorted_idx[:top_n]:
        print(f"  {X.columns[i]}: {importances[i]:.6f}")

    # Number of fitted estimators.
    print(f"\n🌳  模型共训练了 {len(rf.estimators_)} 棵树")

    head = X[:3]
    # The forest fits its sub-trees on plain arrays, so hand them an ndarray to
    # avoid scikit-learn's "X has feature names" warning on predict.
    head_values = np.asarray(head)

    # Per-tree predictions for the first samples (example output). These are
    # inner-regressor outputs, i.e. on the log1p(y) scale.
    print("\n📦  前几棵树在前几个样本上的预测值（示例）:")
    for tree_no, tree in enumerate(rf.estimators_[:3], start=1):
        preds = tree.predict(head_values)
        print(f"  第 {tree_no} 棵树预测值: {preds}")

    # Final predictions of the wrapped model (mapped back through expm1).
    print("\n🧮 模型最终预测值（前几个样本）:")
    final_preds = model.predict(head)
    print(final_preds)
    return model


def train_model_lasso(X, y):
    """
    Fit a Lasso PPL model on a log1p target and print its parameters.

    The printed coefficients and intercept belong to the inner Lasso estimator,
    so they are expressed on standardized features and on the log1p(y) scale.

    :param X: feature DataFrame; X.columns labels the printed coefficients
    :param y: target values
    :return: fitted TransformedTargetRegressor wrapping a
             StandardScaler -> Lasso(alpha=0.01) pipeline
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', Lasso(alpha=0.01))
    ])
    model = TransformedTargetRegressor(
        regressor=pipeline,
        func=np.log1p,
        inverse_func=np.expm1
    )
    model.fit(X, y)

    # model is always a TransformedTargetRegressor around a Pipeline here, so
    # the fitted Lasso step can be read directly.
    lasso = model.regressor_.named_steps['regressor']

    print("\n🖊️  Lasso 回归训练后回归系数 (coef_):")
    for col, coef in zip(X.columns, lasso.coef_):
        print(f"  {col}: {coef:.6f}")

    print(f"\n截距 intercept_: {lasso.intercept_:.6f}")
    return model


def save_model(model, directory="saved_models"):
    """
    Save the model into the given directory under a timestamped file name.

    :param model: object to serialize with joblib.dump
    :param directory: target directory; created if it does not exist
    :return: path of the written file (ppl_model_<YYYYmmdd_HHMMSS>.joblib)
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    # Build a timestamped file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"ppl_model_{timestamp}.joblib"
    filepath = os.path.join(directory, filename)

    # Persist the model.
    joblib.dump(model, filepath)
    print(f"💾 PPL预测模型已保存到: {filepath}")
    return filepath

def predict_ppl(model, data):
    """
    Predict the PPL value for a single mixing ratio.

    :param model: fitted model exposing predict()
    :param data: list or tuple holding the CODE, MATH and GENERAL ratios
    :return: first (only) element of the model's prediction
    """
    feature_names = ['CODE', 'MATH', 'GENERAL']
    # One-row frame so the model receives named feature columns.
    single_row = pd.DataFrame([data], columns=feature_names)
    return model.predict(single_row)[0]

def batch_predict_ppl(model, data_list):
    """
    Predict PPL for several mixing ratios at once.

    Ratios whose CODE + MATH + GENERAL is not close to 1.0 (np.isclose with
    atol=0.01) are reported on stdout and skipped. All remaining ratios are
    predicted with a single model.predict call.

    :param model: fitted model exposing predict()
    :param data_list: iterable of (CODE, MATH, GENERAL) tuples
    :return: DataFrame with the columns CODE, MATH, GENERAL and 预测PPL; the
             columns are present even when no ratio passed validation
    """
    feature_cols = ['CODE', 'MATH', 'GENERAL']
    valid_rows = []
    for position, data in enumerate(data_list, start=1):
        code, math_ratio, general = data
        total = code + math_ratio + general

        if not np.isclose(total, 1.0, atol=0.01):
            print(f"⚠️ 第{position}组比例总和应为1.0，当前为{total:.4f} - 跳过此数据")
            continue

        valid_rows.append((code, math_ratio, general))

    results = pd.DataFrame(valid_rows, columns=feature_cols)
    # Skip predict when nothing passed validation; the empty frame still
    # carries the full column set so downstream export keeps its header.
    results['预测PPL'] = model.predict(results) if valid_rows else []
    return results

def main():
    """
    Train the PPL model, save it, then predict and export the PPL of the
    mixing ratios listed in the configuration file.
    """
    train_file = '1M_data_config.txt'
    # Ratios to predict; '1M_data_config.txt' can be used here instead.
    ratio_config_file = 'config_1b.txt'
    predictions_file = "ppl_predictions_60m.csv"
    divider = "\n" + "=" * 50

    # Load the training data.
    print("📂 加载训练数据...")
    training = load_data(train_file)
    print(f"✅ 加载完成，共 {len(training)} 条数据")

    # Split into features and target.
    features = training[['CODE', 'MATH', 'GENERAL']]
    target = training['PPL']

    # Train the PPL model. Alternative trainers with the same signature:
    # train_model_ridge, train_model_rf, train_model_lasso.
    print(divider)
    print("🔧 训练PPL预测模型...")
    model = train_ppl_model(features, target)
    save_model(model)

    # Load the ratios to predict from the configuration file.
    print(divider)
    print(f"📂 加载预测配比文件: {ratio_config_file}")
    ratios = load_test_config(ratio_config_file)

    # Batch prediction.
    print(divider)
    print("🚀 开始预测")
    predictions = batch_predict_ppl(model, ratios)

    # Show the predictions.
    print("\n📊 预测结果:")
    print(predictions.to_string(index=False, float_format="%.4f"))

    # Export the predictions as CSV.
    predictions.to_csv(predictions_file, index=False)
    print(f"\n💾 预测结果已保存到: {predictions_file}")

# Run the training/prediction workflow only when executed as a script.
if __name__ == "__main__":
    main()
