{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Read the CSV file\n",
    "df = pd.read_csv('dataNVID1.csv')\n",
    "\n",
    "# List of features to be used for PCA\n",
    "#features = ['NDVI-mean',\t'Temperture(C)',\t'Rainfall(m)',\t'Wind speed (m/s)',\t'Humidity',\t'Sunshine(h)','Acenaphthene-g', 'Acenaphthylene=g', 'Anthracene-g', 'Arsenic-g', 'Benzo(a)anthracene-g', 'Benzo(a)pyrene-g', 'Benzo(b)fluoranthene-g', 'Benzo(g,h,i)perylene-g', 'Benzo(ghi)perylene-g', 'Benzo(k)fluoranthene-g', 'Cadmium-g', 'Carbon dioxide-g', 'Carbon monoxide-g', 'Chromium-g', 'Chrysene-g', 'Dibenz(a,h)anthracene-g', 'Fluoranthene-g', 'Fluorene-g', 'Indeno(1,2,3-cd)pyrene-g', 'Lead-g', 'Mercury-g', 'Methane-g', 'Naphthalene-g', 'Nickel-g', 'Nitrogen oxides-g', 'NMVOC, non-methane volatile organic compounds, unspecified origin-g', 'Particulates-g', 'Particulates, < 10 um-g', 'Phenanthrene-g', 'Pyrene-g', 'Sulfur dioxide-g', 'Vanadium-g', 'Zinc-g']\n",
    "features = ['Acenaphthene-g', 'Acenaphthylene=g', 'Anthracene-g', 'Arsenic-g', 'Benzo(a)anthracene-g', 'Benzo(a)pyrene-g', 'Benzo(b)fluoranthene-g', 'Benzo(g,h,i)perylene-g', 'Benzo(ghi)perylene-g', 'Benzo(k)fluoranthene-g', 'Cadmium-g', 'Carbon dioxide-g', 'Carbon monoxide-g', 'Chromium-g', 'Chrysene-g', 'Dibenz(a,h)anthracene-g', 'Fluoranthene-g', 'Fluorene-g', 'Indeno(1,2,3-cd)pyrene-g', 'Lead-g', 'Mercury-g', 'Methane-g', 'Naphthalene-g', 'Nickel-g', 'Nitrogen oxides-g', 'NMVOC, non-methane volatile organic compounds, unspecified origin-g', 'Particulates-g', 'Particulates, < 10 um-g', 'Phenanthrene-g', 'Pyrene-g', 'Sulfur dioxide-g', 'Vanadium-g', 'Zinc-g']\n",
    "#features=['NDVI-mean',\t'Direct LCHHA (Daly)',\t'Temperture(C)',\t'Rainfall(m)',\t'Wind speed (m/s)',\t'Humidity',\t'Sunshine(h)']\n",
    "# Extract the features from the dataframe\n",
    "X = df[features]\n",
    "# 定义新的X轴标签\n",
    "X_features = ['Acenaphthene', 'Acenaphthylene', 'Anthracene', 'Arsenic', 'Benzo(a)anthracene', 'Benzo(a)pyrene', 'Benzo(b)fluoranthene', 'Benzo(g,h,i)perylene', 'Benzo(ghi)perylene', 'Benzo(k)fluoranthene', 'Cadmium', 'Carbon dioxide', 'Carbon monoxide', 'Chromium', 'Chrysene', 'Dibenz(a,h)anthracene', 'Fluoranthene', 'Fluorene', 'Indeno(1,2,3-cd)pyrene', 'Lead', 'Mercury', 'Methane', 'Naphthalene', 'Nickel', 'Nitrogen oxides', 'NMVOC', 'Particulates', 'Particulates, < 10 um', 'Phenanthrene', 'Pyrene', 'Sulfur dioxide', 'Vanadium', 'Zinc']\n",
    "# Standardize the features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Apply PCA\n",
    "pca = PCA(n_components=1)\n",
    "X_pca = pca.fit_transform(X_scaled)\n",
    "\n",
    "# Add the PCA result as a new column to the original dataframe\n",
    "df['PCA_Component'] = X_pca\n",
    "\n",
    "# Save the result to a new CSV file\n",
    "df.to_csv('datafinal_with_pca1.csv', index=False)\n",
    "\n",
    "# Print the explained variance ratio\n",
    "print(f\"Explained variance ratio: {pca.explained_variance_ratio_[0]:.4f}\")\n",
    "\n",
    "# Print the first few rows of the result\n",
    "print(df[['PCA_Component'] + features].head())\n",
    "\n",
    "# Calculate feature contributions\n",
    "feature_contributions = pd.DataFrame({\n",
    "    'Feature': features,\n",
    "    'Contribution': np.abs(pca.components_[0]),\n",
    "    'Contribution_Percentage': np.abs(pca.components_[0]) / np.sum(np.abs(pca.components_[0])) * 100\n",
    "})\n",
    "\n",
    "# Sort features by contribution\n",
    "feature_contributions = feature_contributions.sort_values('Contribution', ascending=False)\n",
    "\n",
    "# Print feature contributions\n",
    "print(\"\\nFeature Contributions:\")\n",
    "print(feature_contributions)\n",
    "\n",
    "# Plot feature contributions\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.bar(feature_contributions['Feature'], feature_contributions['Contribution_Percentage'])\n",
    "plt.xticks(ticks=range(len(X_features)), labels=X_features, rotation=90)\n",
    "plt.xlabel('Features')\n",
    "plt.ylabel('Contribution Percentage')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Plot cumulative feature contributions\n",
    "cumulative_contributions = np.cumsum(feature_contributions['Contribution_Percentage'])\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.plot(range(1, len(features) + 1), cumulative_contributions)\n",
    "plt.xlabel('Number of Features')\n",
    "plt.ylabel('Cumulative Contribution Percentage')\n",
    "plt.title('Cumulative Feature Contributions to PCA Component')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Plot the PCA result\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.scatter(df['PCA_Component'], np.zeros(len(df)), c=df['PCA_Component'], cmap='plasma')\n",
    "plt.xlabel('PCA Component')\n",
    "plt.ylabel('Dummy Variable')\n",
    "plt.title('Data Points Projected onto PCA Component')\n",
    "plt.colorbar()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.manifold import LocallyLinearEmbedding\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "\n",
    "# Extract the features from the dataframe\n",
    "X = df[features]\n",
    "\n",
    "# Standardize the features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Apply LLE\n",
    "n_neighbors = min(len(X) - 1, 5)  # Ensure n_neighbors is not larger than n_samples - 1\n",
    "lle = LocallyLinearEmbedding(n_components=1, n_neighbors=n_neighbors, random_state=42)\n",
    "X_lle = lle.fit_transform(X_scaled)\n",
    "\n",
    "# Add the LLE result as a new column to the original dataframe\n",
    "df['LLE_Component'] = X_lle\n",
    "\n",
    "# Save the result to a new CSV file\n",
    "df.to_csv('datafinal_with_lle.csv', index=False)\n",
    "\n",
    "# Print the first few rows of the result\n",
    "print(df[['LLE_Component'] + features].head())\n",
    "\n",
    "# Calculate and print the reconstruction error\n",
    "reconstruction_error = lle.reconstruction_error_\n",
    "print(f\"Reconstruction error: {reconstruction_error:.4f}\")\n",
    "\n",
    "# Estimate feature contributions using correlation\n",
    "feature_contributions = np.abs(np.corrcoef(X_scaled.T, X_lle.T)[:-1, -1])\n",
    "contribution_percentages = feature_contributions / np.sum(feature_contributions) * 100\n",
    "\n",
    "# Create a dataframe of feature contributions\n",
    "feature_importance = pd.DataFrame({\n",
    "    'Feature': features,\n",
    "    'Contribution': feature_contributions,\n",
    "    'Contribution_Percentage': contribution_percentages\n",
    "})\n",
    "\n",
    "# Sort features by contribution\n",
    "feature_importance = feature_importance.sort_values('Contribution', ascending=False)\n",
    "\n",
    "# Print feature contributions\n",
    "print(\"\\nEstimated Feature Contributions:\")\n",
    "print(feature_importance)\n",
    "\n",
    "# Plot feature contributions\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.bar(feature_importance['Feature'], feature_importance['Contribution_Percentage'])\n",
    "plt.xticks(rotation=90)\n",
    "plt.xlabel('Features')\n",
    "plt.ylabel('Estimated Contribution Percentage')\n",
    "plt.title('Estimated Feature Contributions to LLE Component')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Plot cumulative feature contributions\n",
    "cumulative_contributions = np.cumsum(feature_importance['Contribution_Percentage'])\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.plot(range(1, len(features) + 1), cumulative_contributions)\n",
    "plt.xlabel('Number of Features')\n",
    "plt.ylabel('Cumulative Contribution Percentage')\n",
    "plt.title('Cumulative Estimated Feature Contributions to LLE Component')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Plot the LLE result\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.scatter(df['LLE_Component'], np.zeros(len(df)), c=df['LLE_Component'], cmap='plasma')\n",
    "plt.xlabel('LLE Component')\n",
    "plt.ylabel('Dummy Variable')\n",
    "plt.title('Data Points Projected onto LLE Component')\n",
    "plt.colorbar()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Load the data\n",
    "df = pd.read_csv('dataNVID1.csv')\n",
    "\n",
    "# List of features to be used for PCA\n",
    "features = ['Acenaphthene-g', 'Acenaphthylene=g', 'Anthracene-g', 'Arsenic-g', 'Benzo(a)anthracene-g', 'Benzo(a)pyrene-g', 'Benzo(b)fluoranthene-g', 'Benzo(g,h,i)perylene-g', 'Benzo(ghi)perylene-g', 'Benzo(k)fluoranthene-g', 'Cadmium-g', 'Carbon dioxide-g', 'Carbon monoxide-g', 'Chromium-g', 'Chrysene-g', 'Dibenz(a,h)anthracene-g', 'Fluoranthene-g', 'Fluorene-g', 'Indeno(1,2,3-cd)pyrene-g', 'Lead-g', 'Mercury-g', 'Methane-g', 'Naphthalene-g', 'Nickel-g', 'Nitrogen oxides-g', 'NMVOC, non-methane volatile organic compounds, unspecified origin-g', 'Particulates-g', 'Particulates, < 10 um-g', 'Phenanthrene-g', 'Pyrene-g', 'Sulfur dioxide-g', 'Vanadium-g', 'Zinc-g']\n",
    "\n",
    "# Extract the features from the dataframe\n",
    "X = df[features]\n",
    "\n",
    "# Define new X-axis labels\n",
    "X_features = ['Acenaphthene', 'Acenaphthylene', 'Anthracene', 'Arsenic', 'Benzo(a)anthracene', 'Benzo(a)pyrene', 'Benzo(b)fluoranthene', 'Benzo(g,h,i)perylene', 'Benzo(ghi)perylene', 'Benzo(k)fluoranthene', 'Cadmium', 'Carbon dioxide', 'Carbon monoxide', 'Chromium', 'Chrysene', 'Dibenz(a,h)anthracene', 'Fluoranthene', 'Fluorene', 'Indeno(1,2,3-cd)pyrene', 'Lead', 'Mercury', 'Methane', 'Naphthalene', 'Nickel', 'Nitrogen oxides', 'NMVOC', 'Particulates', 'Particulates, < 10 um', 'Phenanthrene', 'Pyrene', 'Sulfur dioxide', 'Vanadium', 'Zinc']\n",
    "\n",
    "# Standardize the features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Apply PCA\n",
    "pca = PCA(n_components=1)\n",
    "X_pca = pca.fit_transform(X_scaled)\n",
    "\n",
    "# Add the PCA result as a new column to the original dataframe\n",
    "df['PCA_Component'] = X_pca\n",
    "\n",
    "# Save the result to a new CSV file\n",
    "df.to_csv('datafinal_with_pca1.csv', index=False)\n",
    "\n",
    "# Print the explained variance ratio\n",
    "print(f\"Explained variance ratio: {pca.explained_variance_ratio_[0]:.4f}\")\n",
    "\n",
    "# Print the first few rows of the result\n",
    "print(df[['PCA_Component'] + features].head())\n",
    "\n",
    "# Calculate feature contributions\n",
    "feature_contributions = pd.DataFrame({\n",
    "    'Feature': features,\n",
    "    'Contribution': np.abs(pca.components_[0]),\n",
    "    'Contribution_Percentage': np.abs(pca.components_[0]) / np.sum(np.abs(pca.components_[0])) * 100\n",
    "})\n",
    "\n",
    "# Sort features by contribution\n",
    "feature_contributions = feature_contributions.sort_values('Contribution', ascending=False)\n",
    "\n",
    "# Print feature contributions\n",
    "print(\"\\nFeature Contributions:\")\n",
    "print(feature_contributions)\n",
    "\n",
    "# Plot feature contributions\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.bar(feature_contributions['Feature'], feature_contributions['Contribution_Percentage'])\n",
    "plt.xticks(ticks=range(len(X_features)), labels=X_features, rotation=90)\n",
    "plt.xlabel('Features')\n",
    "plt.ylabel('Contribution Percentage')\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "from sklearn.kernel_approximation import RBFSampler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import silhouette_score\n",
    "\n",
    "\n",
    "\n",
    "# Extract the features from the dataframe\n",
    "X = df[features]\n",
    "\n",
    "# Standardize the features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Apply RBF kernel approximation\n",
    "rbf_feature = RBFSampler(n_components=100, random_state=42)\n",
    "X_rbf = rbf_feature.fit_transform(X_scaled)\n",
    "\n",
    "# Create a dummy target variable with two classes\n",
    "y = np.random.randint(0, 2, size=len(X))\n",
    "\n",
    "# Split the data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_rbf, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Apply LDA\n",
    "lda = LinearDiscriminantAnalysis(n_components=1)\n",
    "X_kfda_train = lda.fit_transform(X_train, y_train)\n",
    "X_kfda_test = lda.transform(X_test)\n",
    "\n",
    "# Transform the entire dataset\n",
    "X_kfda = lda.transform(X_rbf)\n",
    "\n",
    "# Add the KFDA result as a new column to the original dataframe\n",
    "df['KFDA_Component'] = X_kfda\n",
    "\n",
    "# Save the result to a new CSV file\n",
    "df.to_csv('datafinal_with_kfda.csv', index=False)\n",
    "\n",
    "# Print the first few rows of the result\n",
    "print(df[['KFDA_Component'] + features].head())\n",
    "\n",
    "# Evaluate the KFDA results\n",
    "\n",
    "# 1. Explained variance ratio\n",
    "explained_variance_ratio = lda.explained_variance_ratio_[0]\n",
    "print(f\"\\nExplained variance ratio: {explained_variance_ratio:.4f}\")\n",
    "\n",
    "# 2. Silhouette score\n",
    "silhouette_avg = silhouette_score(X_kfda.reshape(-1, 1), y)\n",
    "print(f\"Silhouette score: {silhouette_avg:.4f}\")\n",
    "\n",
    "# 3. Classification accuracy\n",
    "from sklearn.metrics import accuracy_score\n",
    "y_pred = lda.predict(X_test)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Classification accuracy: {accuracy:.4f}\")\n",
    "\n",
    "# 4. Correlation with original features\n",
    "correlations = df[features].corrwith(df['KFDA_Component'])\n",
    "print(\"\\nCorrelations with original features:\")\n",
    "print(correlations.sort_values(ascending=False).head())\n",
    "\n",
    "# 5. Visualization\n",
    "import matplotlib.pyplot as plt\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.scatter(X_kfda, np.zeros_like(X_kfda), c=y, cmap='viridis')\n",
    "plt.colorbar(label='Class')\n",
    "plt.title('KFDA Component Visualization')\n",
    "plt.xlabel('KFDA Component')\n",
    "plt.yticks([])\n",
    "plt.show()\\\n",
    "# Calculate approximate feature contributions\n",
    "X_inverse = kpca.inverse_transform(X_kfda)\n",
    "feature_contributions = np.abs(X_inverse).mean(axis=0)\n",
    "total_contribution = feature_contributions.sum()\n",
    "feature_proportions = feature_contributions / total_contribution\n",
    "\n",
    "# Create a dataframe with feature contributions and proportions\n",
    "contribution_df = pd.DataFrame({\n",
    "    'Feature': features,\n",
    "    'Contribution': feature_contributions,\n",
    "    'Proportion': feature_proportions\n",
    "})\n",
    "contribution_df = contribution_df.sort_values('Proportion', ascending=False)\n",
    "\n",
    "# Print the contributions and proportions\n",
    "print(\"\\nFeature Contributions and Proportions:\")\n",
    "print(contribution_df)\n",
    "\n",
    "# Visualize the feature proportions\n",
    "plt.figure(figsize=(12, 8))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.bar(contribution_df['Feature'], contribution_df['Proportion'])\n",
    "plt.xticks(rotation=90)\n",
    "plt.xlabel('Features')\n",
    "plt.ylabel('Proportion')\n",
    "plt.title('Feature Proportions in KFDA Component')\n",
    "plt.tight_layout()\n",
    "plt.savefig('feature_proportions.png')\n",
    "plt.show()\n",
    "\n",
    "# Calculate and print explained variance\n",
    "explained_variance = 1 - ((X_scaled - X_inverse) ** 2).sum() / (X_scaled ** 2).sum()\n",
    "print(f\"\\nApproximated explained variance: {explained_variance:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from sklearn.manifold import Isomap\n",
    "from sklearn.impute import SimpleImputer\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "# Read the CSV file\n",
    "df = pd.read_csv('dataNVID1.csv')\n",
    "\n",
    "# List of features to be used for PCA\n",
    "#features = ['NDVI-mean',\t'Temperture(C)',\t'Rainfall(m)',\t'Wind speed (m/s)',\t'Humidity',\t'Sunshine(h)','Acenaphthene-g', 'Acenaphthylene=g', 'Anthracene-g', 'Arsenic-g', 'Benzo(a)anthracene-g', 'Benzo(a)pyrene-g', 'Benzo(b)fluoranthene-g', 'Benzo(g,h,i)perylene-g', 'Benzo(ghi)perylene-g', 'Benzo(k)fluoranthene-g', 'Cadmium-g', 'Carbon dioxide-g', 'Carbon monoxide-g', 'Chromium-g', 'Chrysene-g', 'Dibenz(a,h)anthracene-g', 'Fluoranthene-g', 'Fluorene-g', 'Indeno(1,2,3-cd)pyrene-g', 'Lead-g', 'Mercury-g', 'Methane-g', 'Naphthalene-g', 'Nickel-g', 'Nitrogen oxides-g', 'NMVOC, non-methane volatile organic compounds, unspecified origin-g', 'Particulates-g', 'Particulates, < 10 um-g', 'Phenanthrene-g', 'Pyrene-g', 'Sulfur dioxide-g', 'Vanadium-g', 'Zinc-g']\n",
    "features = ['Acenaphthene-g', 'Acenaphthylene=g', 'Anthracene-g', 'Arsenic-g', 'Benzo(a)anthracene-g', 'Benzo(a)pyrene-g', 'Benzo(b)fluoranthene-g', 'Benzo(g,h,i)perylene-g', 'Benzo(ghi)perylene-g', 'Benzo(k)fluoranthene-g', 'Cadmium-g', 'Carbon dioxide-g', 'Carbon monoxide-g', 'Chromium-g', 'Chrysene-g', 'Dibenz(a,h)anthracene-g', 'Fluoranthene-g', 'Fluorene-g', 'Indeno(1,2,3-cd)pyrene-g', 'Lead-g', 'Mercury-g', 'Methane-g', 'Naphthalene-g', 'Nickel-g', 'Nitrogen oxides-g', 'NMVOC, non-methane volatile organic compounds, unspecified origin-g', 'Particulates-g', 'Particulates, < 10 um-g', 'Phenanthrene-g', 'Pyrene-g', 'Sulfur dioxide-g', 'Vanadium-g', 'Zinc-g']\n",
    "#features=['NDVI-mean',\t'Direct LCHHA (Daly)',\t'Temperture(C)',\t'Rainfall(m)',\t'Wind speed (m/s)',\t'Humidity',\t'Sunshine(h)']\n",
    "# Extract the features from the dataframe\n",
    "# Extract the features from the dataframe\n",
    "X = df[features]\n",
    "\n",
    "# Replace infinite values with NaN\n",
    "X = X.replace([np.inf, -np.inf], np.nan)\n",
    "\n",
    "# Impute NaN values with median\n",
    "#imputer = SimpleImputer(strategy='median')\n",
    "#X_imputed = imputer.fit_transform(X)\n",
    "\n",
    "# Use RobustScaler instead of StandardScaler\n",
    "scaler = RobustScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Apply Isomap\n",
    "isomap = Isomap(n_components=1, n_neighbors=5)\n",
    "X_isomap = isomap.fit_transform(X_scaled)\n",
    "\n",
    "# Add the Isomap result as a new column to the original dataframe\n",
    "df['Isomap_Component'] = X_isomap\n",
    "\n",
    "# Save the result to a new CSV file\n",
    "df.to_csv('datafinal_with_isomap.csv', index=False)\n",
    "\n",
    "# Print the first few rows of the result\n",
    "print(df[['Isomap_Component'] + features].head())\n",
    "\n",
    "# Evaluate the Isomap results\n",
    "\n",
    "# 1. Reconstruction error\n",
    "reconstruction_error = isomap.reconstruction_error()\n",
    "print(f\"\\nReconstruction error: {reconstruction_error:.4f}\")\n",
    "\n",
    "# 2. Correlation with original features\n",
    "correlations = df[features].corrwith(df['Isomap_Component'].squeeze())\n",
    "print(\"\\nTop 5 correlations with original features:\")\n",
    "print(correlations.sort_values(ascending=False).head())\n",
    "\n",
    "# 3. Visualization\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.scatter(X_isomap, np.zeros_like(X_isomap), c=X_isomap, cmap='viridis')\n",
    "plt.colorbar(label='Isomap Component')\n",
    "plt.title('Isomap Component Visualization')\n",
    "plt.xlabel('Isomap Component')\n",
    "plt.yticks([])\n",
    "plt.show()\n",
    "\n",
    "# 4. Variance explained (approximation)\n",
    "total_variance = np.var(X_scaled, axis=0).sum()\n",
    "projected_variance = np.var(X_isomap)\n",
    "variance_explained = projected_variance / total_variance\n",
    "print(f\"\\nApproximate variance explained: {variance_explained:.4f}\")\n",
    "\n",
    "# 5. Feature importance (based on correlation)\n",
    "feature_importance = abs(correlations).sort_values(ascending=False)\n",
    "print(\"\\nTop 5 important features:\")\n",
    "print(feature_importance.head())\n",
    "\n",
    "# Plot feature importance\n",
    "plt.figure(figsize=(12, 6))\n",
    "\n",
    "feature_importance.plot(kind='bar')\n",
    "plt.rcParams['font.family'] = 'Times New Roman'\n",
    "plt.title('Feature Importance based on Correlation with Isomap Component')\n",
    "plt.xlabel('Features')\n",
    "plt.ylabel('Absolute Correlation')\n",
    "plt.xticks(rotation=90)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(int(3.3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA, KernelPCA\n",
    "from sklearn.manifold import LocallyLinearEmbedding, Isomap\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "from sklearn.metrics.pairwise import rbf_kernel\n",
    "\n",
    "# 读取数据\n",
    "df = pd.read_csv('databig2.csv')\n",
    "\n",
    "features = [\n",
    "    'Acenaphthene', 'Acenaphthylene', 'Anthracene', 'Arsenic', 'Benzo(a)anthracene',\n",
    "    'Benzo(a)pyrene', 'Benzo(b)fluoranthene', 'Benzo(g,h,i)perylene', 'Benzo(ghi)perylene',\n",
    "    'Benzo(k)fluoranthene', 'Cadmium', 'Carbon dioxide', 'Carbon monoxide', 'Chromium',\n",
    "    'Chrysene', 'Dibenz(a,h)anthracene', 'Fluoranthene', 'Fluorene', 'Indeno(1,2,3-cd)pyrene',\n",
    "    'Lead', 'Mercury', 'Methane', 'Naphthalene', 'Nickel', 'Nitrogen oxides',\n",
    "    'NMVOC, non-methane volatile organic compounds, unspecified origin', 'Particulates',\n",
    "    'Particulates, < 10 um', 'Phenanthrene', 'Pyrene', 'Sulfur dioxide', 'Vanadium', 'Zinc'\n",
    "]\n",
    "\n",
    "X = df[features]\n",
    "\n",
    "# 数据预处理\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# 创建一个假的目标变量用于LDA（因为LDA需要标签）\n",
    "y = np.random.randint(0, 2, size=X.shape[0])\n",
    "\n",
    "# 定义降维方法\n",
    "methods = {\n",
    "    'PCA': PCA(n_components=1),\n",
    "    'KPCA': KernelPCA(n_components=1, kernel='rbf'),\n",
    "    'LLE': LocallyLinearEmbedding(n_components=1, n_neighbors=5),\n",
    "    'KFDA': LinearDiscriminantAnalysis(n_components=1),\n",
    "    'ISOMAP': Isomap(n_components=1, n_neighbors=5)\n",
    "}\n",
    "\n",
    "# 函数用于计算特征重要性\n",
    "def get_feature_importance(method, X, transformed_X):\n",
    "    if hasattr(method, 'components_'):\n",
    "        return np.abs(method.components_[0])\n",
    "    elif hasattr(method, 'feature_importances_'):\n",
    "        return method.feature_importances_\n",
    "    else:\n",
    "        return np.abs(np.corrcoef(X.T, transformed_X.T)[:-1, -1])\n",
    "\n",
    "# 应用降维方法并计算特征重要性\n",
    "results = {}\n",
    "for name, method in methods.items():\n",
    "    if name == 'KFDA':\n",
    "        transformed_X = method.fit_transform(X_scaled, y)\n",
    "    else:\n",
    "        transformed_X = method.fit_transform(X_scaled)\n",
    "    \n",
    "    importance = get_feature_importance(method, X_scaled, transformed_X)\n",
    "    results[name] = importance / importance.sum()\n",
    "\n",
    "# 可视化结果\n",
    "fig, axes = plt.subplots(len(methods), 1, figsize=(12, 6*len(methods)))\n",
    "for i, (name, importance) in enumerate(results.items()):\n",
    "    ax = axes[i]\n",
    "    ax.bar(features, importance)\n",
    "    ax.set_title(f'Feature Importance - {name}')\n",
    "    ax.set_xticklabels(features, rotation=90)\n",
    "    ax.set_ylabel('Relative Importance')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# 打印每种方法的前5个最重要特征\n",
    "for name, importance in results.items():\n",
    "    print(f\"\\nTop 5 important features for {name}:\")\n",
    "    top_features = pd.Series(importance, index=features).nlargest(5)\n",
    "    print(top_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.impute import KNNImputer\n",
    "from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "\n",
    "df = pd.read_csv('dataNAVID.csv')\n",
    "# Impute missing values using KNN\n",
    "imputer = KNNImputer(n_neighbors=10, weights='uniform', missing_values=np.nan)\n",
    "X = df.drop(columns='TB incidence (cases)')\n",
    "X = imputer.fit_transform(X)\n",
    "X = pd.DataFrame(X, columns=['Year', 'Month', 'NDVI-mean', 'Daly from GDB2019',  'Acenaphthene-g', 'Acenaphthylene=g', 'Anthracene-g', 'Arsenic-g', 'Benzo(a)anthracene-g', 'Benzo(a)pyrene-g', 'Benzo(b)fluoranthene-g', 'Benzo(g,h,i)perylene-g', 'Benzo(ghi)perylene-g', 'Benzo(k)fluoranthene-g', 'Cadmium-g', 'Carbon dioxide-g', 'Carbon monoxide-g', 'Chromium-g', 'Chrysene-g', 'Dibenz(a,h)anthracene-g', 'Fluoranthene-g', 'Fluorene-g', 'Indeno(1,2,3-cd)pyrene-g', 'Lead-g', 'Mercury-g', 'Methane-g', 'Naphthalene-g', 'Nickel-g', 'Nitrogen oxides-g', 'NMVOC, non-methane volatile organic compounds, unspecified origin-g', 'Particulates-g', 'Particulates, < 10 um-g', 'Phenanthrene-g', 'Pyrene-g', 'Sulfur dioxide-g', 'Vanadium-g', 'Zinc-g', 'Direct LCHHA (Daly)', 'Temperture(C)',\t'Rainfall(m)',\t'Wind speed (m/s)',\t'Humidity',\t'Sunshine(h)'\n",
    "])\n",
    "X.to_csv(\"input_withknn.csv\", index=False)\n",
    "# Split data into input features and target variable\n",
    "#X = df.drop(columns='mouse_oral_LD50_(log(mol/kg))')\n",
    "Y = df['TB incidence (cases)']\n",
    "result_input = pd.concat([X, pd.Series(Y, name=\"TB incidence (cases)\")], axis=1)\n",
    "result_input.to_csv(\"input_withknn-a.csv\", index=False)\n",
    "# Split the data into training and testing sets"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
